#!/usr/bin/perl -Tw
use strict;

#Copyright 2005, FutureQuest, Inc.
#
#TT:Sun 10 Jul 2005 08:03:39 PM EDT
# Scan web logs, given IP list, and pull out a list of unique referers
#   http://www.aota.net/forums/showthread.php?t=1977
#
# This is quick-n-dirty rapid prototype algorithm script
#
#read log entries from STDIN (or from any files named after the IP list)
#retrieve IP list from $ARGV[0]
#
# Output (STDOUT): one "count : referrer-host" line per referrer, sorted by
# ascending count, followed by totals. Diagnostics go to STDERR.

my $ip_file;            #file that contains our IP list
my $ip_regex;           #cannot use a simple lookup table since the ip_file could contain partial addresses :(
my $line_count = 0;     #global line count for summation
my @ips = ();

#%referrers = (
#   $referrer_shortN => {
#         COUNT => { $count }
#       , RURIP => {referrer_uri_path => count, ...}
#       , GIP   => {ip1 => count, ...}   #Global IP per referrer_short
#   }
#   , ...
#)
my %referrers = ();     #HoH

# Print the command-line synopsis and abort.
# Uses $0 (the script name); $ARGV[0] is undefined exactly when usage is shown.
sub usage {
    die("Usage: $0 ip_list\n");
}

# Write one tagged diagnostic line to STDERR. Never exits.
sub _report {
    my ($type, $msg) = @_;
    printf(STDERR "[%s] %s\n", $type, $msg);
    return;
}

# Report a message tagged with $type on STDERR and terminate with status 1.
sub ERROR {
    my ($type, $msg) = @_;
    _report($type, $msg);
    exit 1;
}

# Report a non-fatal problem on STDERR and return to the caller.
# (Must not go through ERROR(): that would exit and make every warning fatal.)
sub WARN {
    my $msg = shift;
    _report('WARN', $msg);
    return;
}

# Report an unrecoverable problem on STDERR and terminate with status 1.
sub FATAL {
    my $msg = shift;
    ERROR('FATAL', $msg);
}

####################
# Load the IP list
####################
$ip_file = shift || usage();

if (! -f $ip_file) {
    FATAL("FILE: ip_list not found");
}

# Three-argument open with a lexical handle: the file name is user supplied,
# so it must never be interpreted as an open mode or a pipe.
open(my $ip_fh, '<', $ip_file)
    or FATAL("Could not open $ip_file: [$!]");
@ips = <$ip_fh>;
close($ip_fh)
    or FATAL("Could not close $ip_file: [$!]");

chomp(@ips);

# Strip surrounding whitespace (including a stray CR) and drop blank lines.
# A blank entry would become an empty alternation branch, and an empty branch
# matches every log line.
s/^\s+|\s+$//g for @ips;
@ips = grep { length($_) } @ips;

#TT: Sanity
if (! @ips) {
    FATAL("There are no IPs to work with");
}

#TT: make sure that the '.' in IP addresses are not interpreted as a regex atom
#TT: Build a regex string of all IP addresses...
$ip_regex = join('|', map { quotemeta($_) } @ips);
if (! length($ip_regex)) {
    FATAL("The regex engine will have nothing to match against");
}

# Compile once. Entries are matched as prefixes of the log line, so partial
# addresses from the IP file keep working.
my $ip_match = qr/^($ip_regex)/;

####################
# Scan the log
####################
while (defined(my $line = <>)) {
    $line_count++;

    if ($line !~ $ip_match) {
        next;
    }

    #We have a hit, stash away the IP address
    my $ip = $1;

    chomp($line);

    #TT: rip out the log item referrer using a simple and fast scan
    # The match result must be checked: after a failed match $1 still holds
    # the IP captured above and would be counted as a referrer.
    # The size field may be '-' when no bytes were sent.
    if ($line !~ m@^.+?\sHTTP/1\.[01]"\s\d{3}\s(?:\d+|-)\s"([^"]+)"\s"@) {
        WARN("could not extract referrer: [$line]");
        next;
    }
    my $referrer = $1;    #do they even spell check the RFCs???

    #TT: split into domain.tld & URL path
    #TT: Sanity - $referrer_short should not be empty
    # Accept http and https so the host (not "https:") becomes the key.
    if ($referrer !~ m@^(?:https?://)?([^/]+)(/.*)?@i) {
        WARN("referrer_short was empty: [$line]");
        next;
    }
    my $referrer_short    = $1;   #referrer domain.tld only
    my $referrer_uri_path = $2;   #referrer pathing

    #TT: use magic key Z-EMPTY-Z to keep count on URIs without pathing
    $referrer_uri_path ||= 'Z-EMPTY-Z';

    #TT: leverage Perl auto-vivification to increment the hit counters
    $referrers{$referrer_short}->{'COUNT'}++;
    $referrers{$referrer_short}->{'RURIP'}->{$referrer_uri_path}++;
    $referrers{$referrer_short}->{'GIP'}->{$ip}++;
}

####################
# Report
####################

# Sort comparator: ascending hit count, ties broken by referrer name so the
# output order is reproducible between runs.
sub by_count {
    $referrers{$a}->{'COUNT'} <=> $referrers{$b}->{'COUNT'}
        || $a cmp $b;
}

#TT: start by keeping it simple, even though the hash contains info that can be drilled into
{
    my $total = 0;

    foreach my $key (sort by_count keys %referrers) {
        $total += $referrers{$key}->{'COUNT'};
        printf("%5d : %s\n", $referrers{$key}->{'COUNT'}, $key);
    }

    # Guard the ratio: no input lines means nothing to divide by.
    my $ratio = $line_count ? $total / $line_count * 100 : 0;

    printf("\n%8d : Total Log Entries Processed\n", $line_count);
    printf("%8d : Total Referrer Spams\n", $total);
    printf("%7.2f%% : Ratio of referrer spams to log lines processed\n", $ratio);
}
####################

# Sample log entries follow. The script never reads the DATA handle; they are
# kept here for reference/testing only.
__DATA__
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /favicon.ico HTTP/1.0" 200 894 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/stp.gif HTTP/1.0" 200 567 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/sth.gif HTTP/1.0" 200 394 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/ba.gif HTTP/1.0" 200 2316 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/stn.gif HTTP/1.0" 200 647 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/stb.gif HTTP/1.0" 200 460 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/sp.gif HTTP/1.0" 200 822 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/sta.gif HTTP/1.0" 200 582 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/bkim/urdu.jpg HTTP/1.0" 200 4360 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /mukti.mission.htm HTTP/1.0" 200 38207 "http://see-her-squirt-mpegs.blogspot.com/" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/image/mukti.mission.staff.jpg HTTP/1.0" 200 5925 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/tractpics/trait.gif HTTP/1.0" 200 4463 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/tractpics/rc.gif HTTP/1.0" 200 3637 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /i01m/tractpics/ttwyl.gif HTTP/1.0" 200 4474 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/leftco.gif HTTP/1.0" 200 98 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/cont.gif HTTP/1.0" 200 190 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/bar.gif HTTP/1.0" 200 1018 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/rul.gif HTTP/1.0" 200 67 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/legal.gif HTTP/1.0" 200 150 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [29/Jun/2005:08:56:07 -0400] "GET /t/rightco.gif HTTP/1.0" 200 99 "http://www.born-again-christian.info/mukti.mission.htm" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [30/Jun/2005:09:09:18 -0400] "GET /buy.christian.homeschool.homeschooling.books.supplies.htm HTTP/1.0" 403 341 "http://see-her-squirt-mpegs.blogspot.com/" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [09/Jul/2005:12:20:48 -0400] "GET /meaning/christian.terms.definitions.a.htm HTTP/1.0" 403 325 "http://www.mp3search.cn/" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [09/Jul/2005:21:30:12 -0400] "GET /meaning/christian.terms.definitions.a.htm HTTP/1.0" 403 325 "http://www.mp3search.cn/" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"
148.223.216.169 - - [09/Jul/2005:23:53:29 -0400] "GET /meaning/christian.terms.definitions.a.htm HTTP/1.0" 403 325 "http://www.mp3search.cn/" "Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)"