diff options
| author | Paul Buetow <paul@buetow.org> | 2025-02-10 22:35:49 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-02-10 22:35:49 +0200 |
| commit | f41e8589393d9c4efa53490a63d96377e91ab0d8 (patch) | |
| tree | 52cd56213944608920df1146b115e56f9240dd80 /frontends | |
| parent | bef65c4f5fd9c41444ae637e939c4e2d038cb531 (diff) | |
add foostats odds
Diffstat (limited to 'frontends')
| -rw-r--r-- | frontends/Rexfile | 8 | ||||
| -rw-r--r-- | frontends/scripts/foostats.pl | 1015 | ||||
| -rw-r--r-- | frontends/scripts/odds.txt | 16 |
3 files changed, 670 insertions, 369 deletions
diff --git a/frontends/Rexfile b/frontends/Rexfile index c1ea2f8..eff2776 100644 --- a/frontends/Rexfile +++ b/frontends/Rexfile @@ -540,6 +540,8 @@ task 'foostats', use File::Copy; my $git_script_path = $ENV{HOME} . '/git/foostats/foostats.pl'; copy( $git_script_path, './scripts/foostats.pl' ) if -f $git_script_path; + my $git_odds_path = $ENV{HOME} . '/git/foostats/odds.txt'; + copy( $git_odds_path, './scripts/odds.txt' ) if -f $git_odds_path; file '/usr/local/bin/foostats.pl', source => './scripts/foostats.pl', @@ -547,6 +549,12 @@ task 'foostats', group => 'wheel', mode => '500'; + file '/var/www/htdocs/buetow.org/self/foostats/odds.txt', + source => './scripts/odds.txt', + owner => 'root', + group => 'wheel', + mode => '440'; + append_if_no_such_line '/etc/daily.local', 'perl /usr/local/bin/foostats.pl --parse-logs --replicate'; my @deps = qw(p5-Digest-SHA3 p5-PerlIO-gzip p5-JSON p5-String-Util p5-LWP-Protocol-https); diff --git a/frontends/scripts/foostats.pl b/frontends/scripts/foostats.pl index 56ad152..ea4d7a5 100644 --- a/frontends/scripts/foostats.pl +++ b/frontends/scripts/foostats.pl @@ -6,451 +6,728 @@ use v5.38; # use strict; # use warnings; -use builtin qw(true false); +use builtin qw(true false); use experimental qw(builtin); use feature qw(refaliasing); no warnings qw(experimental::refaliasing); -# use diagnostics; -# use Data::Dumper; +# TODO: UNDO +use diagnostics; # TODO: Blog post about this script and the new Perl features used. # TODO NEXT: -# 1) Implement replicator -# 2) Write out a nice output from each merged file, also merge if multiple hosts results +# * Write out a nice output from each merged file, also merge if multiple hosts results +# * Fix bug with .gmi.*.gmi in the log parser +# * Nicely formatted .txt output by stats by count by date +# * Print out all UAs, to add new excludes/blocked IPs package FileHelper { - use JSON; - - sub write ($path, $content) { - open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; - print $fh $content; - close $fh; - - rename "$path.tmp", $path; - } - - sub write_json_gz ($path, $data) { - my $json = encode_json $data; - - say "Writing $path"; - open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; - print $fd $json; - close $fd; - - rename "$path.tmp", $path or die "$path.tmp: $!"; - } - - sub read_json_gz ($path) { - say "Reading $path"; - open my $fd, '<:gzip', $path or die "$path: $!"; - my $json = decode_json <$fd>; - close $fd; - return $json; - } -} + use JSON; -package Foostats::Logreader { - use Digest::SHA3 'sha3_512_base64'; - use File::stat; - use PerlIO::gzip; - use Time::Piece; - use String::Util qw(contains startswith endswith); + sub write ( $path, $content ) { + open my $fh, '>', "$path.tmp" + or die "\nCannot open file: $!"; + print $fh $content; + close $fh; - use constant { - GEMINI_LOGS_GLOB => '/var/log/daemon*', - WEB_LOGS_GLOB => '/var/www/logs/access.log*', - }; + rename + "$path.tmp", + $path; + } - sub anonymize_ip ($ip) { - my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4'; - my $ip_hash = sha3_512_base64 $ip; - return ($ip_hash, $ip_proto); - } + sub write_json_gz ( $path, $data ) { + my $json = encode_json $data; - sub read_lines ($glob, $cb) { - my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') } - - my sub open_file ($path) { - my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; - open my $fd, $flag, $path or die "$path: $!"; - return $fd; - } - - my $last = false; - - say 'File path glob matches: ' . join(' ', glob $glob); - - LAST: - for my $path ( sort { -M $a <=> -M $b } glob $glob) { - say "Processing $path"; - - my $file = open_file $path; - my $year = year $file; - - while (<$file>) { - next if contains($_, 'logfile turned over'); - - # last == true means: After this file, don't process more - $last = true unless defined $cb->($year, split / +/); - } - - say "Closing $path (last:$last)"; - close $file; - last LAST if $last; - } - } - - sub parse_web_logs ($last_processed_date, $cb) { - my sub parse_date ($date) { - my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S'); - return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S')); - } - - my sub parse_web_line (@line) { - my ($date, $time) = parse_date $line[4]; - return undef if $date < $last_processed_date; - - # X-Forwarded-For? - my $ip = $line[-2] eq '-' ? $line[1] : $line[-2]; - my ($ip_hash, $ip_proto) = anonymize_ip $ip; - - return { - proto => 'web', - host => $line[0], - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $time, - uri_path => $line[7], - status => $line[9], - }; - } - - read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) }; - } - - sub parse_gemini_logs ($last_processed_date, $cb) { - my sub parse_date ($year, @line) { - my $timestr = "$line[0] $line[1]"; - return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d"); - } - - my sub parse_vger_line ($year, @line) { - my $full_path = $line[5]; - $full_path =~ s/"//g; - my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4; - $uri_path = '' unless defined $uri_path; - - return { - proto => 'gemini', - host => $host, - uri_path => "/$uri_path", - status => $line[6], - date => int(parse_date($year, @line)), - time => $line[2], - }; - } - - my sub parse_relayd_line ($year, @line) { - my $date = int(parse_date($year, @line)); + say "Writing $path"; + open my $fd, '>:gzip', "$path.tmp" + or die "$path.tmp: $!"; + print $fd $json; + close $fd; - my ($ip_hash, $ip_proto) = anonymize_ip $line[12]; - return { - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $line[2], - }; + rename "$path.tmp", $path + or die "$path.tmp: $!"; } - # Expect one vger and one relayd log line per event! So collect - # both events (one from one log line each) and then merge the result hash! - my ($vger, $relayd); - read_lines GEMINI_LOGS_GLOB, sub ($year, @line) { - if ($line[4] eq 'vger:') { - $vger = parse_vger_line $year, @line; - } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) { - $relayd = parse_relayd_line $year, @line; - return undef if $relayd->{date} < $last_processed_date; - } + sub read_json_gz ($path) { + say "Reading $path"; + open my $fd, '<:gzip', $path + or die "$path: $!"; + my $json = decode_json <$fd>; + close $fd; + return $json; + } - if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) { - $cb->({ %$vger, %$relayd }); - $vger = $relayd = undef; - } - - true; - }; - } + sub read_lines ($path) { + my @lines; + open( my $fh, '<', $path ) + or die "$path: $!"; + chomp( @lines = <$fh> ); + close($fh); + return @lines; + } +} - sub parse_logs ($last_web_date, $last_gemini_date) { - my $agg = Foostats::Aggregator->new; +package DateHelper { + use Time::Piece; - say "Last web date: $last_web_date"; - say "Last gemini date: $last_gemini_date"; + sub last_month_dates () { + my $today = localtime; + my @dates; - parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; - parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; + for my $days_ago ( 0 .. 30 ) { + my $date = $today - ( $days_ago * 24 * 60 * 60 ); + push + @dates, + $date->strftime('%Y%m%d'); + } - return $agg->{stats}; - } + return @dates; + } } -package Foostats::Filter { - use String::Util qw(contains startswith endswith); - use constant WARN_ODD => false; +package Foostats::Logreader { + use Digest::SHA3 'sha3_512_base64'; + use File::stat; + use PerlIO::gzip; + use Time::Piece; + use String::Util qw(contains startswith endswith); + + use constant { + GEMINI_LOGS_GLOB => '/var/log/daemon*', + WEB_LOGS_GLOB => '/var/www/logs/access.log*', + }; - sub new ($class) { - bless { - odds => [qw( - .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin - .git microsoft.exchange .lua /owa/ - )] - }, $class; - } + sub anonymize_ip ($ip) { + my $ip_proto = + contains( $ip, ':' ) + ? 'IPv6' + : 'IPv4'; + my $ip_hash = sha3_512_base64 $ip; + return ( $ip_hash, $ip_proto ); + } - sub ok ($self, $event) { - state %blocked = (); - return false if exists $blocked{$event->{ip_hash}}; + sub read_lines ( $glob, $cb ) { + my sub year ($path) { + localtime( ( stat $path )->mtime )->strftime('%Y'); + } + + my sub open_file ($path) { + my $flag = + $path =~ /\.gz$/ + ? '<:gzip' + : '<'; + open my $fd, $flag, $path + or die "$path: $!"; + return $fd; + } + + my $last = false; + + say 'File path glob matches: ' . join( ' ', glob $glob ); + + LAST: + for my $path ( sort { -M $a <=> -M $b } glob $glob ) { + say "Processing $path"; + + my $file = open_file $path; + my $year = year $file; + + while (<$file>) { + next + if contains( $_, 'logfile turned over' ); + + # last == true means: After this file, don't process more + $last = true + unless defined $cb->( $year, split / +/ ); + } + + say "Closing $path (last:$last)"; + close $file; + last LAST + if $last; + } + } - if ($self->odd($event) or $self->excessive($event)) { - ($blocked{$event->{ip_hash}} //= 0)++; - return false; - } else { - return true; + sub parse_web_logs ( $last_processed_date, $cb ) { + my sub parse_date ($date) { + my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' ); + return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') ); + } + + my sub parse_web_line (@line) { + my ( $date, $time ) = parse_date $line [4]; + return undef + if $date < $last_processed_date; + + # X-Forwarded-For? + my $ip = + $line[-2] eq '-' + ? $line[1] + : $line[-2]; + my ( $ip_hash, $ip_proto ) = anonymize_ip $ip; + + return { + proto => 'web', + host => $line[0], + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $time, + uri_path => $line[7], + status => $line[9], + }; + } + + read_lines WEB_LOGS_GLOB, sub ( $year, @line ) { + $cb->( parse_web_line @line ); + }; } - } - sub odd ($self, $event) { - \my $uri_path = \$event->{uri_path}; + sub parse_gemini_logs ( $last_processed_date, $cb ) { + my sub parse_date ( $year, @line ) { + my $timestr = "$line[0] $line[1]"; + return Time::Piece->strptime( $timestr, '%b %d' )->strftime("$year%m%d"); + } + + my sub parse_vger_line ( $year, @line ) { + my $full_path = $line[5]; + $full_path =~ s/"//g; + my ( $proto, undef, $host, $uri_path ) = + split '/', + $full_path, + 4; + $uri_path = '' + unless defined $uri_path; + + return { + proto => 'gemini', + host => $host, + uri_path => "/$uri_path", + status => $line[6], + date => int( parse_date( $year, @line ) ), + time => $line[2], + }; + } + + my sub parse_relayd_line ( $year, @line ) { + my $date = int( parse_date( $year, @line ) ); + + my ( $ip_hash, $ip_proto ) = anonymize_ip $line [12]; + return { + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $line[2], + }; + } + + # Expect one vger and one relayd log line per event! So collect + # both events (one from one log line each) and then merge the result hash! + my ( $vger, $relayd ); + read_lines GEMINI_LOGS_GLOB, sub ( $year, @line ) { + if ( $line[4] eq 'vger:' ) { + $vger = parse_vger_line $year, @line; + } + elsif ( $line[5] eq 'relay' + and startswith( $line[6], 'gemini' ) ) + { + $relayd = parse_relayd_line $year, @line; + return undef + if $relayd->{date} < $last_processed_date; + } + + if ( defined $vger + and defined $relayd + and $vger->{time} eq $relayd->{time} ) + { + $cb->( { %$vger, %$relayd } ); + $vger = $relayd = undef; + } + + true; + }; + } - for ($self->{odds}->@*) { - if (contains($uri_path, $_)) { - say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD; - return true; - } + sub parse_logs ( $last_web_date, $last_gemini_date, $odds_file ) { + my $agg = Foostats::Aggregator->new($odds_file); + + say "Last web date: $last_web_date"; + say "Last gemini date: $last_gemini_date"; + + parse_web_logs $last_web_date, sub ($event) { + $agg->add($event); + }; + parse_gemini_logs $last_gemini_date, sub ($event) { + $agg->add($event); + }; + + return $agg->{stats}; } +} - return false; - } +# TODO: Write filter summary at the end of the filter log. +package Foostats::Filter { + use String::Util qw(contains startswith endswith); + + sub new ( $class, $odds_file, $log_path = '/var/log/foostats-filter.log' ) { + say "Logging filter to $log_path"; + my @odds = FileHelper::read_lines($odds_file); + unlink $log_path + if -f $log_path; + bless { + odds => \@odds, + log_path => $log_path + }, + $class; + } + + sub ok ( $self, $event ) { + state %blocked = (); + return false + if exists $blocked{ $event->{ip_hash} }; + + if ( $self->odd($event) + or $self->excessive($event) ) + { + ( $blocked{ $event->{ip_hash} } //= 0 )++; + return false; + } + else { + return true; + } + } + + sub odd ( $self, $event ) { + \my $uri_path = \$event->{uri_path}; - sub excessive ($self, $event) { - \my $time = \$event->{time}; - \my $ip_hash = \$event->{ip_hash}; + for ( $self->{odds}->@* ) { + next + unless contains( $uri_path, $_ ); - state $last_time = $time; # Time with second: 'HH:MM:SS' - state %count = (); # IPs accessing within the same second! + $self->log( 'WARN', $uri_path, "contains $_ and is odd and will therefore be blocked!" ); + return true; + } - if ($last_time ne $time) { - $last_time = $time; - %count = (); - return false; + $self->log( 'OK', $uri_path, "appears fine..." ); + return false; } - # IP requested site more than once within the same second!? - if (1 < ++($count{$ip_hash} //= 0)) { - say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD; - return true; + sub log ( $self, $severity, $subject, $message ) { + state %dedup; + + # Don't log if path was already logged + return + if exists $dedup{$subject}; + $dedup{$subject} = 1; + + open( my $fh, '>>', $self->{log_path} ) + or die $self->{log_path} . ": $!"; + print $fh "$severity: $subject $message\n"; + close($fh); } - return false; - } + sub excessive ( $self, $event ) { + \my $time = \$event->{time}; + \my $ip_hash = \$event->{ip_hash}; + + state $last_time = $time; # Time with second: 'HH:MM:SS' + state %count = (); # IPs accessing within the same second! + + if ( $last_time ne $time ) { + $last_time = $time; + %count = (); + return false; + } + + # IP requested site more than once within the same second!? + if ( 1 < ++( $count{$ip_hash} //= 0 ) ) { + $self->log( 'WARN', $ip_hash, "blocked due to excessive requesting..." ); + return true; + } + + return false; + } } package Foostats::Aggregator { - use String::Util qw(contains startswith endswith); - - use constant { - ATOM_FEED_URI => '/gemfeed/atom.xml', - GEMFEED_URI => '/gemfeed/index.gmi', - GEMFEED_URI_2 => '/gemfeed/', - }; - - sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class } - - sub add ($self, $event) { - return undef unless defined $event; - - my $date = $event->{date}; - my $date_key = $event->{proto} . "_$date"; - - $self->{stats}{$date_key} //= { - count => { filtered => 0 }, - feed_ips => { atom_feed => {}, gemfeed => {} }, - page_ips => { hosts => {}, urls => {} }, + use String::Util qw(contains startswith endswith); + + use constant { + ATOM_FEED_URI => '/gemfeed/atom.xml', + GEMFEED_URI => '/gemfeed/index.gmi', + GEMFEED_URI_2 => '/gemfeed/', }; - \my $s = \$self->{stats}{$date_key}; - unless ($self->{filter}->ok($event)) { - $s->{count}{filtered}++; - return $event; + sub new ( $class, $odds_file ) { + bless { + filter => Foostats::Filter->new($odds_file), + stats => {} + }, + $class; } - $self->add_count($s, $event); - $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event); + sub add ( $self, $event ) { + return undef + unless defined $event; + + my $date = $event->{date}; + my $date_key = $event->{proto} . "_$date"; + + $self->{stats}{$date_key} //= { + count => { + filtered => 0 + }, + feed_ips => { + atom_feed => {}, + gemfeed => {} + }, + page_ips => { + hosts => {}, + urls => {} + }, + }; + + \my $s = \$self->{stats}{$date_key}; + unless ( $self->{filter}->ok($event) ) { + $s->{count}{filtered}++; + return $event; + } + + $self->add_count( $s, $event ); + $self->add_page_ips( $s, $event ) + unless $self->add_feed_ips( $s, $event ); + + return $event; + } + + sub add_count ( $self, $stats, $event ) { + \my $c = \$stats->{count}; + \my $e = \$event; - return $event; - } + ( $c->{ $e->{proto} } //= 0 )++; + ( $c->{ $e->{ip_proto} } //= 0 )++; + } - sub add_count ($self, $stats, $event) { - \my $c = \$stats->{count}; - \my $e = \$event; + sub add_feed_ips ( $self, $stats, $event ) { + \my $f = \$stats->{feed_ips}; + \my $e = \$event; + + if ( endswith( $e->{uri_path}, ATOM_FEED_URI ) ) { + ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( contains( $e->{uri_path}, GEMFEED_URI ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( endswith( $e->{uri_path}, GEMFEED_URI_2 ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + else { + 0; + } + } - ($c->{$e->{proto}} //= 0)++; - ($c->{$e->{ip_proto}} //= 0)++; - } + sub add_page_ips ( $self, $stats, $event ) { + \my $e = \$event; + \my $p = \$stats->{page_ips}; - sub add_feed_ips ($self, $stats, $event) { - \my $f = \$stats->{feed_ips}; - \my $e = \$event; + return + if !endswith( $e->{uri_path}, '.html' ) + && !endswith( $e->{uri_path}, '.gmi' ); - if (endswith($e->{uri_path}, ATOM_FEED_URI)) { - ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++; - } elsif (contains($e->{uri_path}, GEMFEED_URI)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } else { - 0 + ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++; + ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //= 0 )++; } - } +} - sub add_page_ips ($self, $stats, $event) { - \my $e = \$event; - \my $p = \$stats->{page_ips}; +package Foostats::FileOutputter { + use JSON; + use Sys::Hostname; + use PerlIO::gzip; - return if !endswith($e->{uri_path}, '.html') - && !endswith($e->{uri_path}, '.gmi'); + sub new ( $class, %args ) { + my $self = bless \%args, $class; + mkdir $self->{stats_dir} + or die $self->{stats_dir} . ": $!" + unless -d $self->{stats_dir}; - ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++; - ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++; - } -} + return $self; + } + + sub last_processed_date ( $self, $proto ) { + my $hostname = hostname(); + my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; + my ($date) = + @processed + ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ ) + : 0; + + return int($date); + } + + sub write ($self) { + $self->for_dates( + sub ( $self, $date_key, $stats ) { + my $hostname = hostname(); + my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; + FileHelper::write_json_gz + $path, + $stats; + } + ); + } -package Foostats::Outputter { - use JSON; - use Sys::Hostname; - use PerlIO::gzip; - - sub new ($class, %args) { - my $self = bless \%args, $class; - mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir}; - - return $self; - } - - sub last_processed_date ($self, $proto) { - my $hostname = hostname(); - my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; - my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0; - - return int($date); - } - - sub write ($self) { - $self->for_dates(sub ($self, $date_key, $stats) { - my $hostname = hostname(); - my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; - FileHelper::write_json_gz $path, $stats; - }); - } - - sub for_dates ($self, $cb) { - $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*; - } + sub for_dates ( $self, $cb ) { + $cb->( $self, $_, $self->{stats}{$_} ) for sort + keys $self->{stats}->%*; + } } package Foostats::Replicator { - use JSON; - use File::Basename; - use Time::Piece; - use LWP::UserAgent; - use String::Util qw(endswith); - - sub new ($class, %args) { bless \%args, $class } - - sub replicate ($self, $partner_node) { - say "Replicating from $partner_node"; - - for my $proto (qw(gemini web)) { - my $count = 0; - - for my $date (_last_month_dates()) { - my $file_base = "${proto}_${date}"; - my $dest_path = "${file_base}.$partner_node.json.gz"; - - $self->replicate_file( - "https://$partner_node/foostats/$dest_path", - $self->{stats_dir} . '/' . $dest_path, - $count++ < 3, # Always replicate the newest 3 files. - ); - } + use JSON; + use File::Basename; + use LWP::UserAgent; + use String::Util qw(endswith); + + sub replicate ( $stats_dir, $partner_node ) { + say "Replicating from $partner_node"; + + for my $proto (qw(gemini web)) { + my $count = 0; + + for my $date (DateHelper::last_month_dates) { + my $file_base = "${proto}_${date}"; + my $dest_path = "${file_base}.$partner_node.json.gz"; + + replicate_file( + "https://$partner_node/foostats/$dest_path", + "$stats_dir/$dest_path", + $count++ + < + 3 + , # Always replicate the newest 3 files. + ); + } + } } - } - sub replicate_file ($self, $remote_url, $dest_path, $force) { - # $dest_path already exists, not replicating it - return if !$force && -f $dest_path; + sub replicate_file ( $remote_url, $dest_path, $force ) { + + # $dest_path already exists, not replicating it + return + if !$force + && -f $dest_path; + + say "Replicating $remote_url to $dest_path (force:$force)... "; + my $response = LWP::UserAgent->new->get($remote_url); + unless ( $response->is_success ) { + say "\nFailed to fetch the file: " . $response->status_line; + return; + } - say "Replicating $remote_url to $dest_path (force:$force)... "; - my $response = LWP::UserAgent->new->get($remote_url); - unless ($response->is_success) { - say "\nFailed to fetch the file: " . $response->status_line; - return; + FileHelper::write + $dest_path, + $response->decoded_content; + say 'done'; } +} - FileHelper::write $dest_path, $response->decoded_content; - say 'done'; - } +package Foostats::Merger { + use Data::Dumper; # TODO: UNDO - sub _last_month_dates () { - my $today = localtime; - my @last_week; + sub merge ($stats_dir) { + my %merge; + $merge{$_} = merge_for_date( $stats_dir, $_ ) for DateHelper::last_month_dates; + return %merge; + } - for my $days_ago (0..30) { - my $date = $today - ($days_ago * 24 * 60 * 60); - push @last_week, $date->strftime('%Y%m%d'); + sub merge_for_date ( $stats_dir, $date ) { + printf + "Merging for date %s\n", + $date; + + my @stats = stats_for_date( $stats_dir, $date ); + return { + feed_ips => feed_ips(@stats), + count => count(@stats), + page_ips => page_ips(@stats), + }; } - return @last_week; - } -} + sub merge_ips ( $a, $b, $key_transform = undef ) { + my sub merge ( $a, $b ) { + while ( my ( $key, $val ) = each %$b ) { + $a->{$key} //= 0; + $a->{$key} += $val; + } + } + + my $is_num = qr/^\d+(\.\d+)?$/; + + while ( my ( $key, $val ) = each %$b ) { + $key = $key_transform->($key) + if defined $key_transform; + + if ( not exists $a->{$key} ) { + $a->{$key} = $val; + } + elsif (ref( $a->{$key} ) eq 'HASH' + && ref($val) eq 'HASH' ) + { + merge( $a->{$key}, $val ); + } + elsif ($a->{$key} =~ $is_num + && $val =~ $is_num ) + { + $a->{$key} += $val; + } + else { + die + "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", + $key, + ref($key), $a->{$key}, + ref( $a->{$key} ), + $val, + ref($val); + } + } + } -package main { - use Getopt::Long; - use Sys::Hostname; + sub feed_ips (@stats) { + my ( %gemini, %web ); + + for my $stats (@stats) { + my $merge = + $stats->{proto} eq 'web' + ? \%web + : \%gemini; + printf + "Merging proto %s feed IPs\n", + $stats->{proto}; + merge_ips( $merge, $stats->{feed_ips} ); + } + + my %total; + merge_ips( \%total, $web{$_} ) for keys %web; + merge_ips( \%total, $gemini{$_} ) for keys %gemini; + + my %merge = ( + 'Total' => scalar keys %total, + 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, + 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, + 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, + 'Web Atom' => scalar keys $web{atom_feed}->%*, + ); - sub parse_logs ($stats_dir) { - my $out = Foostats::Outputter->new(stats_dir => $stats_dir); + return \%merge; + } - $out->{stats} = Foostats::Logreader::parse_logs( - $out->last_processed_date('web'), - $out->last_processed_date('gemini'), - ); + sub count (@stats) { + my %merge; - $out->write; - } + for my $stats (@stats) { + while ( my ( $key, $val ) = each $stats->{count}->%* ) { + $merge{$key} //= 0; + $merge{$key} += $val; + } + } - my ($parse_logs, $replicate, $report, $all); + return \%merge; + } - # With default values - my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; - my $partner_node = hostname eq 'fishfinger.buetow.org' - ? 'blowfish.buetow.org' : 'fishfinger.buetow.org'; + sub page_ips (@stats) { + my %merge = ( + urls => {}, + hosts => {} + ); - GetOptions 'parse-logs' => \$parse_logs, - 'replicate' => \$replicate, - 'pretty-print' => \$report, - 'all' => \$all, - 'stats-dir' => \$stats_dir, - 'partner-node' => \$partner_node; + for my $key ( keys %merge ) { + merge_ips( + $merge{$key}, + $_->{page_ips}->{$key}, + sub ($key) { + $key =~ s/\.html$/.../; + $key =~ s/\.gmi$/.../; + $key; + } + ) for @stats; + + # Keep only uniq IP count + $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*; + } + + return \%merge; + } + + sub stats_for_date ( $stats_dir, $date ) { + my @stats; + + for my $proto (qw(gemini web)) { + for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { + printf + "Reading %s\n", + $path; + push + @stats, + FileHelper::read_json_gz($path); + @{ $stats[-1] }{qw(proto path)} = ( $proto, $path ); + } + } + + return @stats; + } +} - parse_logs $stats_dir if $parse_logs or $all; +package Foostats::Reporter { + use Data::Dumper; - Foostats::Replicator->new(stats_dir => $stats_dir)->replicate($partner_node) - if $replicate or $all; + sub report (%merged) { + print Dumper %merged; + } +} + +package main { + use Getopt::Long; + use Sys::Hostname; + + sub parse_logs ( $stats_dir, $odds_file ) { + my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir ); + + $out->{stats} = Foostats::Logreader::parse_logs( $out->last_processed_date('web'), + $out->last_processed_date('gemini'), $odds_file, ); + + $out->write; + } - die 'report not yet implemented' if $report or $all; + my ( $parse_logs, $replicate, $report, $all ); + + # With default values + my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; + my $odds_file = $stats_dir . '/odds.txt'; + my $partner_node = + hostname eq 'fishfinger.buetow.org' + ? 'blowfish.buetow.org' + : 'fishfinger.buetow.org'; + + # TODO: Add help output + GetOptions + 'parse-logs!' => \$parse_logs, + 'odds-file=s' => \$odds_file, + 'replicate!' => \$replicate, + 'report!' => \$report, + 'all!' => \$all, + 'stats-dir=s' => \$stats_dir, + 'partner-node=s' => \$partner_node; + + parse_logs( $stats_dir, $odds_file ) + if $parse_logs + or $all; + + Foostats::Replicator::replicate( $stats_dir, $partner_node ) + if $replicate + or $all; + + Foostats::Reporter::report( Foostats::Merger::merge($stats_dir) ) + if $report + or $all; } diff --git a/frontends/scripts/odds.txt b/frontends/scripts/odds.txt new file mode 100644 index 0000000..489b2e0 --- /dev/null +++ b/frontends/scripts/odds.txt @@ -0,0 +1,16 @@ +% ++ +.. +/admin +.asp +.env +.git +HNAP1 +.lua +microsoft.exchange +/owa/ +.php +robots.txt +wordpress +/wp +.rar |
