#!/usr/bin/perl use v5.38; # Those are enabled automatically now w/ this version of Perl # use strict; # use warnings; use builtin qw(true false); use experimental qw(builtin); use feature qw(refaliasing); no warnings qw(experimental::refaliasing); # TODO: UNDO use diagnostics; # TODO: Blog post about this script and the new Perl features used. # TODO NEXT: # * Write out a nice output from each merged file, also merge if multiple hosts results # * Fix bug with .gmi.*.gmi in the log parser # * Nicely formatted .txt output by stats by count by date package FileHelper { use JSON; sub write ($path, $content) { open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; print $fh $content; close $fh; rename "$path.tmp", $path; } sub write_json_gz ($path, $data) { my $json = encode_json $data; say "Writing $path"; open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; print $fd $json; close $fd; rename "$path.tmp", $path or die "$path.tmp: $!"; } sub read_json_gz ($path) { say "Reading $path"; open my $fd, '<:gzip', $path or die "$path: $!"; my $json = decode_json <$fd>; close $fd; return $json; } } package DateHelper { use Time::Piece; sub last_month_dates () { my $today = localtime; my @dates; for my $days_ago (0..30) { my $date = $today - ($days_ago * 24 * 60 * 60); push @dates, $date->strftime('%Y%m%d'); } return @dates; } } package Foostats::Logreader { use Digest::SHA3 'sha3_512_base64'; use File::stat; use PerlIO::gzip; use Time::Piece; use String::Util qw(contains startswith endswith); use constant { GEMINI_LOGS_GLOB => '/var/log/daemon*', WEB_LOGS_GLOB => '/var/www/logs/access.log*', }; sub anonymize_ip ($ip) { my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4'; my $ip_hash = sha3_512_base64 $ip; return ($ip_hash, $ip_proto); } sub read_lines ($glob, $cb) { my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') } my sub open_file ($path) { my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; open my $fd, $flag, $path or die "$path: $!"; return $fd; } my $last = false; say 'File path glob matches: ' . join(' ', glob $glob); LAST: for my $path ( sort { -M $a <=> -M $b } glob $glob) { say "Processing $path"; my $file = open_file $path; my $year = year $file; while (<$file>) { next if contains($_, 'logfile turned over'); # last == true means: After this file, don't process more $last = true unless defined $cb->($year, split / +/); } say "Closing $path (last:$last)"; close $file; last LAST if $last; } } sub parse_web_logs ($last_processed_date, $cb) { my sub parse_date ($date) { my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S'); return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S')); } my sub parse_web_line (@line) { my ($date, $time) = parse_date $line[4]; return undef if $date < $last_processed_date; # X-Forwarded-For? my $ip = $line[-2] eq '-' ? $line[1] : $line[-2]; my ($ip_hash, $ip_proto) = anonymize_ip $ip; return { proto => 'web', host => $line[0], ip_hash => $ip_hash, ip_proto => $ip_proto, date => $date, time => $time, uri_path => $line[7], status => $line[9], }; } read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) }; } sub parse_gemini_logs ($last_processed_date, $cb) { my sub parse_date ($year, @line) { my $timestr = "$line[0] $line[1]"; return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d"); } my sub parse_vger_line ($year, @line) { my $full_path = $line[5]; $full_path =~ s/"//g; my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4; $uri_path = '' unless defined $uri_path; return { proto => 'gemini', host => $host, uri_path => "/$uri_path", status => $line[6], date => int(parse_date($year, @line)), time => $line[2], }; } my sub parse_relayd_line ($year, @line) { my $date = int(parse_date($year, @line)); my ($ip_hash, $ip_proto) = anonymize_ip $line[12]; return { ip_hash => $ip_hash, ip_proto => $ip_proto, date => $date, time => $line[2], }; } # Expect one vger and one relayd log line per event! So collect # both events (one from one log line each) and then merge the result hash! my ($vger, $relayd); read_lines GEMINI_LOGS_GLOB, sub ($year, @line) { if ($line[4] eq 'vger:') { $vger = parse_vger_line $year, @line; } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) { $relayd = parse_relayd_line $year, @line; return undef if $relayd->{date} < $last_processed_date; } if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) { $cb->({ %$vger, %$relayd }); $vger = $relayd = undef; } true; }; } sub parse_logs ($last_web_date, $last_gemini_date) { my $agg = Foostats::Aggregator->new; say "Last web date: $last_web_date"; say "Last gemini date: $last_gemini_date"; parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; return $agg->{stats}; } } package Foostats::Filter { use String::Util qw(contains startswith endswith); use constant WARN_ODD => false; sub new ($class) { bless { odds => [qw( .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin .git microsoft.exchange .lua /owa/ )] }, $class; } sub ok ($self, $event) { state %blocked = (); return false if exists $blocked{$event->{ip_hash}}; if ($self->odd($event) or $self->excessive($event)) { ($blocked{$event->{ip_hash}} //= 0)++; return false; } else { return true; } } sub odd ($self, $event) { \my $uri_path = \$event->{uri_path}; for ($self->{odds}->@*) { if (contains($uri_path, $_)) { say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD; return true; } } return false; } sub excessive ($self, $event) { \my $time = \$event->{time}; \my $ip_hash = \$event->{ip_hash}; state $last_time = $time; # Time with second: 'HH:MM:SS' state %count = (); # IPs accessing within the same second! if ($last_time ne $time) { $last_time = $time; %count = (); return false; } # IP requested site more than once within the same second!? if (1 < ++($count{$ip_hash} //= 0)) { say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD; return true; } return false; } } package Foostats::Aggregator { use String::Util qw(contains startswith endswith); use constant { ATOM_FEED_URI => '/gemfeed/atom.xml', GEMFEED_URI => '/gemfeed/index.gmi', GEMFEED_URI_2 => '/gemfeed/', }; sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class } sub add ($self, $event) { return undef unless defined $event; my $date = $event->{date}; my $date_key = $event->{proto} . "_$date"; $self->{stats}{$date_key} //= { count => { filtered => 0 }, feed_ips => { atom_feed => {}, gemfeed => {} }, page_ips => { hosts => {}, urls => {} }, }; \my $s = \$self->{stats}{$date_key}; unless ($self->{filter}->ok($event)) { $s->{count}{filtered}++; return $event; } $self->add_count($s, $event); $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event); return $event; } sub add_count ($self, $stats, $event) { \my $c = \$stats->{count}; \my $e = \$event; ($c->{$e->{proto}} //= 0)++; ($c->{$e->{ip_proto}} //= 0)++; } sub add_feed_ips ($self, $stats, $event) { \my $f = \$stats->{feed_ips}; \my $e = \$event; if (endswith($e->{uri_path}, ATOM_FEED_URI)) { ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++; } elsif (contains($e->{uri_path}, GEMFEED_URI)) { ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) { ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; } else { 0 } } sub add_page_ips ($self, $stats, $event) { \my $e = \$event; \my $p = \$stats->{page_ips}; return if !endswith($e->{uri_path}, '.html') && !endswith($e->{uri_path}, '.gmi'); ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++; ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++; } } package Foostats::FileOutputter { use JSON; use Sys::Hostname; use PerlIO::gzip; sub new ($class, %args) { my $self = bless \%args, $class; mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir}; return $self; } sub last_processed_date ($self, $proto) { my $hostname = hostname(); my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0; return int($date); } sub write ($self) { $self->for_dates(sub ($self, $date_key, $stats) { my $hostname = hostname(); my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; FileHelper::write_json_gz $path, $stats; }); } sub for_dates ($self, $cb) { $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*; } } package Foostats::Replicator { use JSON; use File::Basename; use LWP::UserAgent; use String::Util qw(endswith); sub replicate ($stats_dir, $partner_node) { say "Replicating from $partner_node"; for my $proto (qw(gemini web)) { my $count = 0; for my $date (DateHelper::last_month_dates) { my $file_base = "${proto}_${date}"; my $dest_path = "${file_base}.$partner_node.json.gz"; replicate_file( "https://$partner_node/foostats/$dest_path", "$stats_dir/$dest_path", $count++ < 3, # Always replicate the newest 3 files. ); } } } sub replicate_file ($remote_url, $dest_path, $force) { # $dest_path already exists, not replicating it return if !$force && -f $dest_path; say "Replicating $remote_url to $dest_path (force:$force)... "; my $response = LWP::UserAgent->new->get($remote_url); unless ($response->is_success) { say "\nFailed to fetch the file: " . $response->status_line; return; } FileHelper::write $dest_path, $response->decoded_content; say 'done'; } } package Foostats::Merger { use Data::Dumper; # TODO: UNDO sub merge ($stats_dir) { my %merge; $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates; print Dumper %merge; } sub merge_for_date ($stats_dir, $date) { printf "Merging for date %s\n", $date; my @stats = stats_for_date($stats_dir, $date); return { feed_ips => feed_ips(@stats), count => count(@stats), page_ips => page_ips(@stats), }; } sub merge_ips ($a, $b, $key_transform = undef) { my sub merge ($a, $b) { while (my ($key, $val) = each %$b) { $a->{$key} //= 0; $a->{$key} += $val; } } my $is_num = qr/^\d+(\.\d+)?$/; while (my ($key, $val) = each %$b) { $key = $key_transform->($key) if defined $key_transform; if (not exists $a->{$key}) { $a->{$key} = $val; } elsif (ref($a->{$key}) eq 'HASH' && ref($val) eq 'HASH') { merge($a->{$key}, $val); } elsif ($a->{$key} =~ $is_num && $val =~ $is_num) { $a->{$key} += $val; } else { die "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", $key, ref($key), $a->{$key}, ref($a->{$key}), $val, ref($val); } } } sub feed_ips (@stats) { my (%gemini, %web); for my $stats (@stats) { my $merge = $stats->{proto} eq 'web' ? \%web : \%gemini; printf "Merging proto %s feed IPs\n", $stats->{proto}; merge_ips($merge, $stats->{feed_ips}); } my %total; merge_ips(\%total, $web{$_}) for keys %web; merge_ips(\%total, $gemini{$_}) for keys %gemini; my %merge = ( 'Total' => scalar keys %total, 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, 'Web Atom' => scalar keys $web{atom_feed}->%*, ); return \%merge; } sub count (@stats) { my %merge; for my $stats (@stats) { while (my ($key, $val) = each $stats->{count}->%*) { $merge{$key} //= 0; $merge{$key} += $val; } } return \%merge; } sub page_ips (@stats) { my %merge = (urls => {}, hosts => {}); for my $key (keys %merge) { merge_ips($merge{$key}, $_->{page_ips}->{$key}, sub ($key) { $key =~ s/\.html$/.../; $key =~ s/\.gmi$/.../; $key; }) for @stats; # Keep only uniq IP count $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*; } return \%merge; } sub stats_for_date ($stats_dir, $date) { my @stats; for my $proto (qw(gemini web)) { for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { printf "Reading %s\n", $path; push @stats, FileHelper::read_json_gz($path); @{$stats[-1]}{qw(proto path)} = ($proto, $path); } } return @stats; } } package main { use Getopt::Long; use Sys::Hostname; sub parse_logs ($stats_dir) { my $out = Foostats::FileOutputter->new(stats_dir => $stats_dir); $out->{stats} = Foostats::Logreader::parse_logs( $out->last_processed_date('web'), $out->last_processed_date('gemini'), ); $out->write; } my ($parse_logs, $replicate, $merge, $all); # With default values my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; my $partner_node = hostname eq 'fishfinger.buetow.org' ? 'blowfish.buetow.org' : 'fishfinger.buetow.org'; # TODO: Add help output GetOptions 'parse-logs!' => \$parse_logs, 'replicate!' => \$replicate, 'merge!' => \$merge, 'all!' => \$all, 'stats-dir=s' => \$stats_dir, 'partner-node=s' => \$partner_node; parse_logs $stats_dir if $parse_logs or $all; Foostats::Replicator::replicate($stats_dir, $partner_node) if $replicate or $all; Foostats::Merger::merge($stats_dir) if $merge or $all; }