summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2025-02-10 22:35:49 +0200
committerPaul Buetow <paul@buetow.org>2025-02-10 22:35:49 +0200
commitf41e8589393d9c4efa53490a63d96377e91ab0d8 (patch)
tree52cd56213944608920df1146b115e56f9240dd80
parentbef65c4f5fd9c41444ae637e939c4e2d038cb531 (diff)
add foostats odds
-rw-r--r--frontends/Rexfile8
-rw-r--r--frontends/scripts/foostats.pl1015
-rw-r--r--frontends/scripts/odds.txt16
3 files changed, 670 insertions, 369 deletions
diff --git a/frontends/Rexfile b/frontends/Rexfile
index c1ea2f8..eff2776 100644
--- a/frontends/Rexfile
+++ b/frontends/Rexfile
@@ -540,6 +540,8 @@ task 'foostats',
use File::Copy;
my $git_script_path = $ENV{HOME} . '/git/foostats/foostats.pl';
copy( $git_script_path, './scripts/foostats.pl' ) if -f $git_script_path;
+ my $git_odds_path = $ENV{HOME} . '/git/foostats/odds.txt';
+ copy( $git_odds_path, './scripts/odds.txt' ) if -f $git_odds_path;
file '/usr/local/bin/foostats.pl',
source => './scripts/foostats.pl',
@@ -547,6 +549,12 @@ task 'foostats',
group => 'wheel',
mode => '500';
+ file '/var/www/htdocs/buetow.org/self/foostats/odds.txt',
+ source => './scripts/odds.txt',
+ owner => 'root',
+ group => 'wheel',
+ mode => '440';
+
append_if_no_such_line '/etc/daily.local', 'perl /usr/local/bin/foostats.pl --parse-logs --replicate';
my @deps = qw(p5-Digest-SHA3 p5-PerlIO-gzip p5-JSON p5-String-Util p5-LWP-Protocol-https);
diff --git a/frontends/scripts/foostats.pl b/frontends/scripts/foostats.pl
index 56ad152..ea4d7a5 100644
--- a/frontends/scripts/foostats.pl
+++ b/frontends/scripts/foostats.pl
@@ -6,451 +6,728 @@ use v5.38;
# use strict;
# use warnings;
-use builtin qw(true false);
+use builtin qw(true false);
use experimental qw(builtin);
use feature qw(refaliasing);
no warnings qw(experimental::refaliasing);
-# use diagnostics;
-# use Data::Dumper;
+# TODO: UNDO
+use diagnostics;
# TODO: Blog post about this script and the new Perl features used.
# TODO NEXT:
-# 1) Implement replicator
-# 2) Write out a nice output from each merged file, also merge if multiple hosts results
+# * Write out a nice output from each merged file, also merge if multiple hosts results
+# * Fix bug with .gmi.*.gmi in the log parser
+# * Nicely formatted .txt output by stats by count by date
+# * Print out all UAs, to add new excludes/blocked IPs
package FileHelper {
- use JSON;
-
- sub write ($path, $content) {
- open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!";
- print $fh $content;
- close $fh;
-
- rename "$path.tmp", $path;
- }
-
- sub write_json_gz ($path, $data) {
- my $json = encode_json $data;
-
- say "Writing $path";
- open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!";
- print $fd $json;
- close $fd;
-
- rename "$path.tmp", $path or die "$path.tmp: $!";
- }
-
- sub read_json_gz ($path) {
- say "Reading $path";
- open my $fd, '<:gzip', $path or die "$path: $!";
- my $json = decode_json <$fd>;
- close $fd;
- return $json;
- }
-}
+ use JSON;
-package Foostats::Logreader {
- use Digest::SHA3 'sha3_512_base64';
- use File::stat;
- use PerlIO::gzip;
- use Time::Piece;
- use String::Util qw(contains startswith endswith);
+ sub write ( $path, $content ) {
+ open my $fh, '>', "$path.tmp"
+ or die "\nCannot open file: $!";
+ print $fh $content;
+ close $fh;
- use constant {
- GEMINI_LOGS_GLOB => '/var/log/daemon*',
- WEB_LOGS_GLOB => '/var/www/logs/access.log*',
- };
+ rename
+ "$path.tmp",
+ $path;
+ }
- sub anonymize_ip ($ip) {
- my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4';
- my $ip_hash = sha3_512_base64 $ip;
- return ($ip_hash, $ip_proto);
- }
+ sub write_json_gz ( $path, $data ) {
+ my $json = encode_json $data;
- sub read_lines ($glob, $cb) {
- my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') }
-
- my sub open_file ($path) {
- my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<';
- open my $fd, $flag, $path or die "$path: $!";
- return $fd;
- }
-
- my $last = false;
-
- say 'File path glob matches: ' . join(' ', glob $glob);
-
- LAST:
- for my $path ( sort { -M $a <=> -M $b } glob $glob) {
- say "Processing $path";
-
- my $file = open_file $path;
- my $year = year $file;
-
- while (<$file>) {
- next if contains($_, 'logfile turned over');
-
- # last == true means: After this file, don't process more
- $last = true unless defined $cb->($year, split / +/);
- }
-
- say "Closing $path (last:$last)";
- close $file;
- last LAST if $last;
- }
- }
-
- sub parse_web_logs ($last_processed_date, $cb) {
- my sub parse_date ($date) {
- my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S');
- return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S'));
- }
-
- my sub parse_web_line (@line) {
- my ($date, $time) = parse_date $line[4];
- return undef if $date < $last_processed_date;
-
- # X-Forwarded-For?
- my $ip = $line[-2] eq '-' ? $line[1] : $line[-2];
- my ($ip_hash, $ip_proto) = anonymize_ip $ip;
-
- return {
- proto => 'web',
- host => $line[0],
- ip_hash => $ip_hash,
- ip_proto => $ip_proto,
- date => $date,
- time => $time,
- uri_path => $line[7],
- status => $line[9],
- };
- }
-
- read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) };
- }
-
- sub parse_gemini_logs ($last_processed_date, $cb) {
- my sub parse_date ($year, @line) {
- my $timestr = "$line[0] $line[1]";
- return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d");
- }
-
- my sub parse_vger_line ($year, @line) {
- my $full_path = $line[5];
- $full_path =~ s/"//g;
- my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4;
- $uri_path = '' unless defined $uri_path;
-
- return {
- proto => 'gemini',
- host => $host,
- uri_path => "/$uri_path",
- status => $line[6],
- date => int(parse_date($year, @line)),
- time => $line[2],
- };
- }
-
- my sub parse_relayd_line ($year, @line) {
- my $date = int(parse_date($year, @line));
+ say "Writing $path";
+ open my $fd, '>:gzip', "$path.tmp"
+ or die "$path.tmp: $!";
+ print $fd $json;
+ close $fd;
- my ($ip_hash, $ip_proto) = anonymize_ip $line[12];
- return {
- ip_hash => $ip_hash,
- ip_proto => $ip_proto,
- date => $date,
- time => $line[2],
- };
+ rename "$path.tmp", $path
+ or die "$path.tmp: $!";
}
- # Expect one vger and one relayd log line per event! So collect
- # both events (one from one log line each) and then merge the result hash!
- my ($vger, $relayd);
- read_lines GEMINI_LOGS_GLOB, sub ($year, @line) {
- if ($line[4] eq 'vger:') {
- $vger = parse_vger_line $year, @line;
- } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) {
- $relayd = parse_relayd_line $year, @line;
- return undef if $relayd->{date} < $last_processed_date;
- }
+ sub read_json_gz ($path) {
+ say "Reading $path";
+ open my $fd, '<:gzip', $path
+ or die "$path: $!";
+ my $json = decode_json <$fd>;
+ close $fd;
+ return $json;
+ }
- if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) {
- $cb->({ %$vger, %$relayd });
- $vger = $relayd = undef;
- }
-
- true;
- };
- }
+ sub read_lines ($path) {
+ my @lines;
+ open( my $fh, '<', $path )
+ or die "$path: $!";
+ chomp( @lines = <$fh> );
+ close($fh);
+ return @lines;
+ }
+}
- sub parse_logs ($last_web_date, $last_gemini_date) {
- my $agg = Foostats::Aggregator->new;
+package DateHelper {
+ use Time::Piece;
- say "Last web date: $last_web_date";
- say "Last gemini date: $last_gemini_date";
+ sub last_month_dates () {
+ my $today = localtime;
+ my @dates;
- parse_web_logs $last_web_date, sub ($event) { $agg->add($event) };
- parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) };
+ for my $days_ago ( 0 .. 30 ) {
+ my $date = $today - ( $days_ago * 24 * 60 * 60 );
+ push
+ @dates,
+ $date->strftime('%Y%m%d');
+ }
- return $agg->{stats};
- }
+ return @dates;
+ }
}
-package Foostats::Filter {
- use String::Util qw(contains startswith endswith);
- use constant WARN_ODD => false;
+package Foostats::Logreader {
+ use Digest::SHA3 'sha3_512_base64';
+ use File::stat;
+ use PerlIO::gzip;
+ use Time::Piece;
+ use String::Util qw(contains startswith endswith);
+
+ use constant {
+ GEMINI_LOGS_GLOB => '/var/log/daemon*',
+ WEB_LOGS_GLOB => '/var/www/logs/access.log*',
+ };
- sub new ($class) {
- bless {
- odds => [qw(
- .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin
- .git microsoft.exchange .lua /owa/
- )]
- }, $class;
- }
+ sub anonymize_ip ($ip) {
+ my $ip_proto =
+ contains( $ip, ':' )
+ ? 'IPv6'
+ : 'IPv4';
+ my $ip_hash = sha3_512_base64 $ip;
+ return ( $ip_hash, $ip_proto );
+ }
- sub ok ($self, $event) {
- state %blocked = ();
- return false if exists $blocked{$event->{ip_hash}};
+ sub read_lines ( $glob, $cb ) {
+ my sub year ($path) {
+ localtime( ( stat $path )->mtime )->strftime('%Y');
+ }
+
+ my sub open_file ($path) {
+ my $flag =
+ $path =~ /\.gz$/
+ ? '<:gzip'
+ : '<';
+ open my $fd, $flag, $path
+ or die "$path: $!";
+ return $fd;
+ }
+
+ my $last = false;
+
+ say 'File path glob matches: ' . join( ' ', glob $glob );
+
+ LAST:
+ for my $path ( sort { -M $a <=> -M $b } glob $glob ) {
+ say "Processing $path";
+
+ my $file = open_file $path;
+ my $year = year $file;
+
+ while (<$file>) {
+ next
+ if contains( $_, 'logfile turned over' );
+
+ # last == true means: After this file, don't process more
+ $last = true
+ unless defined $cb->( $year, split / +/ );
+ }
+
+ say "Closing $path (last:$last)";
+ close $file;
+ last LAST
+ if $last;
+ }
+ }
- if ($self->odd($event) or $self->excessive($event)) {
- ($blocked{$event->{ip_hash}} //= 0)++;
- return false;
- } else {
- return true;
+ sub parse_web_logs ( $last_processed_date, $cb ) {
+ my sub parse_date ($date) {
+ my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' );
+ return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') );
+ }
+
+ my sub parse_web_line (@line) {
+ my ( $date, $time ) = parse_date $line [4];
+ return undef
+ if $date < $last_processed_date;
+
+ # X-Forwarded-For?
+ my $ip =
+ $line[-2] eq '-'
+ ? $line[1]
+ : $line[-2];
+ my ( $ip_hash, $ip_proto ) = anonymize_ip $ip;
+
+ return {
+ proto => 'web',
+ host => $line[0],
+ ip_hash => $ip_hash,
+ ip_proto => $ip_proto,
+ date => $date,
+ time => $time,
+ uri_path => $line[7],
+ status => $line[9],
+ };
+ }
+
+ read_lines WEB_LOGS_GLOB, sub ( $year, @line ) {
+ $cb->( parse_web_line @line );
+ };
}
- }
- sub odd ($self, $event) {
- \my $uri_path = \$event->{uri_path};
+ sub parse_gemini_logs ( $last_processed_date, $cb ) {
+ my sub parse_date ( $year, @line ) {
+ my $timestr = "$line[0] $line[1]";
+ return Time::Piece->strptime( $timestr, '%b %d' )->strftime("$year%m%d");
+ }
+
+ my sub parse_vger_line ( $year, @line ) {
+ my $full_path = $line[5];
+ $full_path =~ s/"//g;
+ my ( $proto, undef, $host, $uri_path ) =
+ split '/',
+ $full_path,
+ 4;
+ $uri_path = ''
+ unless defined $uri_path;
+
+ return {
+ proto => 'gemini',
+ host => $host,
+ uri_path => "/$uri_path",
+ status => $line[6],
+ date => int( parse_date( $year, @line ) ),
+ time => $line[2],
+ };
+ }
+
+ my sub parse_relayd_line ( $year, @line ) {
+ my $date = int( parse_date( $year, @line ) );
+
+ my ( $ip_hash, $ip_proto ) = anonymize_ip $line [12];
+ return {
+ ip_hash => $ip_hash,
+ ip_proto => $ip_proto,
+ date => $date,
+ time => $line[2],
+ };
+ }
+
+ # Expect one vger and one relayd log line per event! So collect
+ # both events (one from one log line each) and then merge the result hash!
+ my ( $vger, $relayd );
+ read_lines GEMINI_LOGS_GLOB, sub ( $year, @line ) {
+ if ( $line[4] eq 'vger:' ) {
+ $vger = parse_vger_line $year, @line;
+ }
+ elsif ( $line[5] eq 'relay'
+ and startswith( $line[6], 'gemini' ) )
+ {
+ $relayd = parse_relayd_line $year, @line;
+ return undef
+ if $relayd->{date} < $last_processed_date;
+ }
+
+ if ( defined $vger
+ and defined $relayd
+ and $vger->{time} eq $relayd->{time} )
+ {
+ $cb->( { %$vger, %$relayd } );
+ $vger = $relayd = undef;
+ }
+
+ true;
+ };
+ }
- for ($self->{odds}->@*) {
- if (contains($uri_path, $_)) {
- say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD;
- return true;
- }
+ sub parse_logs ( $last_web_date, $last_gemini_date, $odds_file ) {
+ my $agg = Foostats::Aggregator->new($odds_file);
+
+ say "Last web date: $last_web_date";
+ say "Last gemini date: $last_gemini_date";
+
+ parse_web_logs $last_web_date, sub ($event) {
+ $agg->add($event);
+ };
+ parse_gemini_logs $last_gemini_date, sub ($event) {
+ $agg->add($event);
+ };
+
+ return $agg->{stats};
}
+}
- return false;
- }
+# TODO: Write filter summary at the end of the filter log.
+package Foostats::Filter {
+ use String::Util qw(contains startswith endswith);
+
+ sub new ( $class, $odds_file, $log_path = '/var/log/foostats-filter.log' ) {
+ say "Logging filter to $log_path";
+ my @odds = FileHelper::read_lines($odds_file);
+ unlink $log_path
+ if -f $log_path;
+ bless {
+ odds => \@odds,
+ log_path => $log_path
+ },
+ $class;
+ }
+
+ sub ok ( $self, $event ) {
+ state %blocked = ();
+ return false
+ if exists $blocked{ $event->{ip_hash} };
+
+ if ( $self->odd($event)
+ or $self->excessive($event) )
+ {
+ ( $blocked{ $event->{ip_hash} } //= 0 )++;
+ return false;
+ }
+ else {
+ return true;
+ }
+ }
+
+ sub odd ( $self, $event ) {
+ \my $uri_path = \$event->{uri_path};
- sub excessive ($self, $event) {
- \my $time = \$event->{time};
- \my $ip_hash = \$event->{ip_hash};
+ for ( $self->{odds}->@* ) {
+ next
+ unless contains( $uri_path, $_ );
- state $last_time = $time; # Time with second: 'HH:MM:SS'
- state %count = (); # IPs accessing within the same second!
+ $self->log( 'WARN', $uri_path, "contains $_ and is odd and will therefore be blocked!" );
+ return true;
+ }
- if ($last_time ne $time) {
- $last_time = $time;
- %count = ();
- return false;
+ $self->log( 'OK', $uri_path, "appears fine..." );
+ return false;
}
- # IP requested site more than once within the same second!?
- if (1 < ++($count{$ip_hash} //= 0)) {
- say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD;
- return true;
+ sub log ( $self, $severity, $subject, $message ) {
+ state %dedup;
+
+ # Don't log if path was already logged
+ return
+ if exists $dedup{$subject};
+ $dedup{$subject} = 1;
+
+ open( my $fh, '>>', $self->{log_path} )
+ or die $self->{log_path} . ": $!";
+ print $fh "$severity: $subject $message\n";
+ close($fh);
}
- return false;
- }
+ sub excessive ( $self, $event ) {
+ \my $time = \$event->{time};
+ \my $ip_hash = \$event->{ip_hash};
+
+ state $last_time = $time; # Time with second: 'HH:MM:SS'
+ state %count = (); # IPs accessing within the same second!
+
+ if ( $last_time ne $time ) {
+ $last_time = $time;
+ %count = ();
+ return false;
+ }
+
+ # IP requested site more than once within the same second!?
+ if ( 1 < ++( $count{$ip_hash} //= 0 ) ) {
+ $self->log( 'WARN', $ip_hash, "blocked due to excessive requesting..." );
+ return true;
+ }
+
+ return false;
+ }
}
package Foostats::Aggregator {
- use String::Util qw(contains startswith endswith);
-
- use constant {
- ATOM_FEED_URI => '/gemfeed/atom.xml',
- GEMFEED_URI => '/gemfeed/index.gmi',
- GEMFEED_URI_2 => '/gemfeed/',
- };
-
- sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class }
-
- sub add ($self, $event) {
- return undef unless defined $event;
-
- my $date = $event->{date};
- my $date_key = $event->{proto} . "_$date";
-
- $self->{stats}{$date_key} //= {
- count => { filtered => 0 },
- feed_ips => { atom_feed => {}, gemfeed => {} },
- page_ips => { hosts => {}, urls => {} },
+ use String::Util qw(contains startswith endswith);
+
+ use constant {
+ ATOM_FEED_URI => '/gemfeed/atom.xml',
+ GEMFEED_URI => '/gemfeed/index.gmi',
+ GEMFEED_URI_2 => '/gemfeed/',
};
- \my $s = \$self->{stats}{$date_key};
- unless ($self->{filter}->ok($event)) {
- $s->{count}{filtered}++;
- return $event;
+ sub new ( $class, $odds_file ) {
+ bless {
+ filter => Foostats::Filter->new($odds_file),
+ stats => {}
+ },
+ $class;
}
- $self->add_count($s, $event);
- $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event);
+ sub add ( $self, $event ) {
+ return undef
+ unless defined $event;
+
+ my $date = $event->{date};
+ my $date_key = $event->{proto} . "_$date";
+
+ $self->{stats}{$date_key} //= {
+ count => {
+ filtered => 0
+ },
+ feed_ips => {
+ atom_feed => {},
+ gemfeed => {}
+ },
+ page_ips => {
+ hosts => {},
+ urls => {}
+ },
+ };
+
+ \my $s = \$self->{stats}{$date_key};
+ unless ( $self->{filter}->ok($event) ) {
+ $s->{count}{filtered}++;
+ return $event;
+ }
+
+ $self->add_count( $s, $event );
+ $self->add_page_ips( $s, $event )
+ unless $self->add_feed_ips( $s, $event );
+
+ return $event;
+ }
+
+ sub add_count ( $self, $stats, $event ) {
+ \my $c = \$stats->{count};
+ \my $e = \$event;
- return $event;
- }
+ ( $c->{ $e->{proto} } //= 0 )++;
+ ( $c->{ $e->{ip_proto} } //= 0 )++;
+ }
- sub add_count ($self, $stats, $event) {
- \my $c = \$stats->{count};
- \my $e = \$event;
+ sub add_feed_ips ( $self, $stats, $event ) {
+ \my $f = \$stats->{feed_ips};
+ \my $e = \$event;
+
+ if ( endswith( $e->{uri_path}, ATOM_FEED_URI ) ) {
+ ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ elsif ( contains( $e->{uri_path}, GEMFEED_URI ) ) {
+ ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ elsif ( endswith( $e->{uri_path}, GEMFEED_URI_2 ) ) {
+ ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ else {
+ 0;
+ }
+ }
- ($c->{$e->{proto}} //= 0)++;
- ($c->{$e->{ip_proto}} //= 0)++;
- }
+ sub add_page_ips ( $self, $stats, $event ) {
+ \my $e = \$event;
+ \my $p = \$stats->{page_ips};
- sub add_feed_ips ($self, $stats, $event) {
- \my $f = \$stats->{feed_ips};
- \my $e = \$event;
+ return
+ if !endswith( $e->{uri_path}, '.html' )
+ && !endswith( $e->{uri_path}, '.gmi' );
- if (endswith($e->{uri_path}, ATOM_FEED_URI)) {
- ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++;
- } elsif (contains($e->{uri_path}, GEMFEED_URI)) {
- ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++;
- } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) {
- ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++;
- } else {
- 0
+ ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++;
+ ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //= 0 )++;
}
- }
+}
- sub add_page_ips ($self, $stats, $event) {
- \my $e = \$event;
- \my $p = \$stats->{page_ips};
+package Foostats::FileOutputter {
+ use JSON;
+ use Sys::Hostname;
+ use PerlIO::gzip;
- return if !endswith($e->{uri_path}, '.html')
- && !endswith($e->{uri_path}, '.gmi');
+ sub new ( $class, %args ) {
+ my $self = bless \%args, $class;
+ mkdir $self->{stats_dir}
+ or die $self->{stats_dir} . ": $!"
+ unless -d $self->{stats_dir};
- ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++;
- ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++;
- }
-}
+ return $self;
+ }
+
+ sub last_processed_date ( $self, $proto ) {
+ my $hostname = hostname();
+ my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
+ my ($date) =
+ @processed
+ ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ )
+ : 0;
+
+ return int($date);
+ }
+
+ sub write ($self) {
+ $self->for_dates(
+ sub ( $self, $date_key, $stats ) {
+ my $hostname = hostname();
+ my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
+ FileHelper::write_json_gz
+ $path,
+ $stats;
+ }
+ );
+ }
-package Foostats::Outputter {
- use JSON;
- use Sys::Hostname;
- use PerlIO::gzip;
-
- sub new ($class, %args) {
- my $self = bless \%args, $class;
- mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir};
-
- return $self;
- }
-
- sub last_processed_date ($self, $proto) {
- my $hostname = hostname();
- my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
- my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0;
-
- return int($date);
- }
-
- sub write ($self) {
- $self->for_dates(sub ($self, $date_key, $stats) {
- my $hostname = hostname();
- my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
- FileHelper::write_json_gz $path, $stats;
- });
- }
-
- sub for_dates ($self, $cb) {
- $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*;
- }
+ sub for_dates ( $self, $cb ) {
+ $cb->( $self, $_, $self->{stats}{$_} ) for sort
+ keys $self->{stats}->%*;
+ }
}
package Foostats::Replicator {
- use JSON;
- use File::Basename;
- use Time::Piece;
- use LWP::UserAgent;
- use String::Util qw(endswith);
-
- sub new ($class, %args) { bless \%args, $class }
-
- sub replicate ($self, $partner_node) {
- say "Replicating from $partner_node";
-
- for my $proto (qw(gemini web)) {
- my $count = 0;
-
- for my $date (_last_month_dates()) {
- my $file_base = "${proto}_${date}";
- my $dest_path = "${file_base}.$partner_node.json.gz";
-
- $self->replicate_file(
- "https://$partner_node/foostats/$dest_path",
- $self->{stats_dir} . '/' . $dest_path,
- $count++ < 3, # Always replicate the newest 3 files.
- );
- }
+ use JSON;
+ use File::Basename;
+ use LWP::UserAgent;
+ use String::Util qw(endswith);
+
+ sub replicate ( $stats_dir, $partner_node ) {
+ say "Replicating from $partner_node";
+
+ for my $proto (qw(gemini web)) {
+ my $count = 0;
+
+ for my $date (DateHelper::last_month_dates) {
+ my $file_base = "${proto}_${date}";
+ my $dest_path = "${file_base}.$partner_node.json.gz";
+
+ replicate_file(
+ "https://$partner_node/foostats/$dest_path",
+ "$stats_dir/$dest_path",
+ $count++
+ <
+ 3
+ , # Always replicate the newest 3 files.
+ );
+ }
+ }
}
- }
- sub replicate_file ($self, $remote_url, $dest_path, $force) {
- # $dest_path already exists, not replicating it
- return if !$force && -f $dest_path;
+ sub replicate_file ( $remote_url, $dest_path, $force ) {
+
+ # $dest_path already exists, not replicating it
+ return
+ if !$force
+ && -f $dest_path;
+
+ say "Replicating $remote_url to $dest_path (force:$force)... ";
+ my $response = LWP::UserAgent->new->get($remote_url);
+ unless ( $response->is_success ) {
+ say "\nFailed to fetch the file: " . $response->status_line;
+ return;
+ }
- say "Replicating $remote_url to $dest_path (force:$force)... ";
- my $response = LWP::UserAgent->new->get($remote_url);
- unless ($response->is_success) {
- say "\nFailed to fetch the file: " . $response->status_line;
- return;
+ FileHelper::write
+ $dest_path,
+ $response->decoded_content;
+ say 'done';
}
+}
- FileHelper::write $dest_path, $response->decoded_content;
- say 'done';
- }
+package Foostats::Merger {
+ use Data::Dumper; # TODO: UNDO
- sub _last_month_dates () {
- my $today = localtime;
- my @last_week;
+ sub merge ($stats_dir) {
+ my %merge;
+ $merge{$_} = merge_for_date( $stats_dir, $_ ) for DateHelper::last_month_dates;
+ return %merge;
+ }
- for my $days_ago (0..30) {
- my $date = $today - ($days_ago * 24 * 60 * 60);
- push @last_week, $date->strftime('%Y%m%d');
+ sub merge_for_date ( $stats_dir, $date ) {
+ printf
+ "Merging for date %s\n",
+ $date;
+
+ my @stats = stats_for_date( $stats_dir, $date );
+ return {
+ feed_ips => feed_ips(@stats),
+ count => count(@stats),
+ page_ips => page_ips(@stats),
+ };
}
- return @last_week;
- }
-}
+ sub merge_ips ( $a, $b, $key_transform = undef ) {
+ my sub merge ( $a, $b ) {
+ while ( my ( $key, $val ) = each %$b ) {
+ $a->{$key} //= 0;
+ $a->{$key} += $val;
+ }
+ }
+
+ my $is_num = qr/^\d+(\.\d+)?$/;
+
+ while ( my ( $key, $val ) = each %$b ) {
+ $key = $key_transform->($key)
+ if defined $key_transform;
+
+ if ( not exists $a->{$key} ) {
+ $a->{$key} = $val;
+ }
+ elsif (ref( $a->{$key} ) eq 'HASH'
+ && ref($val) eq 'HASH' )
+ {
+ merge( $a->{$key}, $val );
+ }
+ elsif ($a->{$key} =~ $is_num
+ && $val =~ $is_num )
+ {
+ $a->{$key} += $val;
+ }
+ else {
+ die
+ "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
+ $key,
+ ref($key), $a->{$key},
+ ref( $a->{$key} ),
+ $val,
+ ref($val);
+ }
+ }
+ }
-package main {
- use Getopt::Long;
- use Sys::Hostname;
+ sub feed_ips (@stats) {
+ my ( %gemini, %web );
+
+ for my $stats (@stats) {
+ my $merge =
+ $stats->{proto} eq 'web'
+ ? \%web
+ : \%gemini;
+ printf
+ "Merging proto %s feed IPs\n",
+ $stats->{proto};
+ merge_ips( $merge, $stats->{feed_ips} );
+ }
+
+ my %total;
+ merge_ips( \%total, $web{$_} ) for keys %web;
+ merge_ips( \%total, $gemini{$_} ) for keys %gemini;
+
+ my %merge = (
+ 'Total' => scalar keys %total,
+ 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*,
+ 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*,
+ 'Web Gemfeed' => scalar keys $web{gemfeed}->%*,
+ 'Web Atom' => scalar keys $web{atom_feed}->%*,
+ );
- sub parse_logs ($stats_dir) {
- my $out = Foostats::Outputter->new(stats_dir => $stats_dir);
+ return \%merge;
+ }
- $out->{stats} = Foostats::Logreader::parse_logs(
- $out->last_processed_date('web'),
- $out->last_processed_date('gemini'),
- );
+ sub count (@stats) {
+ my %merge;
- $out->write;
- }
+ for my $stats (@stats) {
+ while ( my ( $key, $val ) = each $stats->{count}->%* ) {
+ $merge{$key} //= 0;
+ $merge{$key} += $val;
+ }
+ }
- my ($parse_logs, $replicate, $report, $all);
+ return \%merge;
+ }
- # With default values
- my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
- my $partner_node = hostname eq 'fishfinger.buetow.org'
- ? 'blowfish.buetow.org' : 'fishfinger.buetow.org';
+ sub page_ips (@stats) {
+ my %merge = (
+ urls => {},
+ hosts => {}
+ );
- GetOptions 'parse-logs' => \$parse_logs,
- 'replicate' => \$replicate,
- 'pretty-print' => \$report,
- 'all' => \$all,
- 'stats-dir' => \$stats_dir,
- 'partner-node' => \$partner_node;
+ for my $key ( keys %merge ) {
+ merge_ips(
+ $merge{$key},
+ $_->{page_ips}->{$key},
+ sub ($key) {
+ $key =~ s/\.html$/.../;
+ $key =~ s/\.gmi$/.../;
+ $key;
+ }
+ ) for @stats;
+
+ # Keep only uniq IP count
+ $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*;
+ }
+
+ return \%merge;
+ }
+
+ sub stats_for_date ( $stats_dir, $date ) {
+ my @stats;
+
+ for my $proto (qw(gemini web)) {
+ for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) {
+ printf
+ "Reading %s\n",
+ $path;
+ push
+ @stats,
+ FileHelper::read_json_gz($path);
+ @{ $stats[-1] }{qw(proto path)} = ( $proto, $path );
+ }
+ }
+
+ return @stats;
+ }
+}
- parse_logs $stats_dir if $parse_logs or $all;
+package Foostats::Reporter {
+ use Data::Dumper;
- Foostats::Replicator->new(stats_dir => $stats_dir)->replicate($partner_node)
- if $replicate or $all;
+ sub report (%merged) {
+ print Dumper %merged;
+ }
+}
+
+package main {
+ use Getopt::Long;
+ use Sys::Hostname;
+
+ sub parse_logs ( $stats_dir, $odds_file ) {
+ my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir );
+
+ $out->{stats} = Foostats::Logreader::parse_logs( $out->last_processed_date('web'),
+ $out->last_processed_date('gemini'), $odds_file, );
+
+ $out->write;
+ }
- die 'report not yet implemented' if $report or $all;
+ my ( $parse_logs, $replicate, $report, $all );
+
+ # With default values
+ my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
+ my $odds_file = $stats_dir . '/odds.txt';
+ my $partner_node =
+ hostname eq 'fishfinger.buetow.org'
+ ? 'blowfish.buetow.org'
+ : 'fishfinger.buetow.org';
+
+ # TODO: Add help output
+ GetOptions
+ 'parse-logs!' => \$parse_logs,
+ 'odds-file=s' => \$odds_file,
+ 'replicate!' => \$replicate,
+ 'report!' => \$report,
+ 'all!' => \$all,
+ 'stats-dir=s' => \$stats_dir,
+ 'partner-node=s' => \$partner_node;
+
+ parse_logs( $stats_dir, $odds_file )
+ if $parse_logs
+ or $all;
+
+ Foostats::Replicator::replicate( $stats_dir, $partner_node )
+ if $replicate
+ or $all;
+
+ Foostats::Reporter::report( Foostats::Merger::merge($stats_dir) )
+ if $report
+ or $all;
}
diff --git a/frontends/scripts/odds.txt b/frontends/scripts/odds.txt
new file mode 100644
index 0000000..489b2e0
--- /dev/null
+++ b/frontends/scripts/odds.txt
@@ -0,0 +1,16 @@
+%
++
+..
+/admin
+.asp
+.env
+.git
+HNAP1
+.lua
+microsoft.exchange
+/owa/
+.php
+robots.txt
+wordpress
+/wp
+.rar