#!/usr/bin/perl # Extract relay IP, URI hosts from mail in folder # Hack-n-patch from: # Heavily reworked from original: # dmz@dmzs.com - March 19, 2004 # http://www.dmzs.com/tools/files/spam.phtml # LGPL # by: # Kris Deugau 2009/01/21 # Kris Deugau 2009/05/19 ##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa ##wrapver 0.1 ##wrapsum Extract relay IP and URI hosts from mail in IMAP folder # $Id: extract-data 2 2009-09-01 21:27:00Z kdeugau $ # $URL$ use strict; use warnings; use Time::Local; use POSIX qw(strftime); use IO::Socket::SSL; use Mail::IMAPClient; use Getopt::Std; use NetAddr::IP; # hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/ # but we did find enough to extract the URIs... use lib '/opt/spamassassin/share/perl/5.8.8'; use Mail::SpamAssassin; use Mail::SpamAssassin::PerMsgStatus; my %opts; getopts("druv", \%opts); my $debug = ($opts{d} ? 1 : 0); my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning # - note this makes it rather harder to deep-scan the messages to create local rules my $verbose = ($opts{v} ? 1 : 0); my $tmpdir = '/var/tmp'; my $salearn = '/usr/local/bin/sa-learn'; my $learnargs = ($sadebug ? ' -D' : '').' --showdots '; die "eeep! $salearn doesn't exist!\n" if ! -e $salearn; my $folder = 'reported.needsextraction'; #my $folder = 'confirmed'; # non-SSL IMAP settings: #my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143', # User => 'imapspamuser', # Password => 'imapspamuserpassword', # Debug => $imapdebug); my $imap = Mail::IMAPClient->new( User => 'junkmail', Password => 'k3c86z2', Socket => IO::Socket::SSL->new( Proto => 'tcp', PeerAddr => 'mail.company.com', PeerPort => 993, # IMAP over SSL standard port ), Debug => $imapdebug, ); if (!defined($imap)) { die "IMAP Login Failed"; } my $msgcount = $imap->message_count($folder); # If debugging, print out the total counts for each mailbox print $msgcount, " message(s) to process\n" if $debug; ## Process the spam mailbox $imap->select($folder); my @msgs = $imap->search("ALL"); # Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it? #my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory #mkdir $spamtmp or die "couldn't create temporary pen for spam: $!"; my $spamtest = Mail::SpamAssassin->new(); # don't keep dereferencing this my %iplist; my %urilist; for (my $i=0; $i<$msgcount; $i++) { my $msg = $imap->message_string($msgs[$i]); my $mail = $spamtest->parse($msg); my $status = $spamtest->check($mail); # stolen from SA plugin bits #sub parsed_metadata { # my ($self, $opts) = @_; # my $scanner = $status->{permsgstatus}; my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains}; # Generate the full list of html-parsed domains. my $uris = $status->get_uri_detail_list(); # print "$msgs[$i]\n"; my %msguris; while (my($uri, $info) = each %{$uris}) { # we want to skip mailto: uris next if ($uri =~ /^mailto:/); # no domains were found via this uri, so skip next unless ($info->{domains}); next if $info->{types}->{img}; foreach ( keys %{ $info->{domains} } ) { if (exists $skip_domains->{$_}) { next; } $msguris{$_}++; #print " ".$_."\n"; } # print "$uri ".$info->{anchor_text}."\n"; } foreach (keys %msguris) { # print " $_ $msguris{$_}\n"; $urilist{$_}++; } # now, get the relay IP my %headerlist = %{$imap->parse_headers($msgs[$i], "Received")}; my $recvnum = 0; my $recv = $headerlist{'Received'}[$recvnum]; next if !$recv; my $relayip; #Received: from mail.company.com [ip.add.re.ss] # by localhost with POP3 (fetchmail-6.2.5) # for kdeugau@localhost (single-drop); Fri, 15 May 2009 11:45:10 -0400 (EDT) if ($recv =~ /from mail\.company\.com \[ip\.add\.re\.ss\]\s*by localhost with POP3 \(fetchmail/) { $recvnum += 1; $recv = $headerlist{'Received'}[$recvnum]; } if ($recv =~ /^by mx\d\.company\.com \(Postfix, from userid \d+\)/) { $recvnum++; $recv = $headerlist{'Received'}[$recvnum]; } # le sigh. gotta bypass a message if we can't parse the headers. Outlook # does an admirable job of mangling things for us. >:( if ($recv !~ /by mx\d\.company\.com \(Postfix\)/) { print "phtui: $recv\n"; next; } ##fixme # le sigh. skip IP extraction on tagged spam reported as nonspam, since the real spam is a layer deeper. next if $recv =~ /from localhost by mfs\d with SpamAssassin/; # Postini puts the "real" received: header one layer further out - SA is configured to compensate for this so we do too #IP-Network 64.18.0.0/20 #IP-Network-Block 064.018.000.000 - 064.018.015.255 #Org-Name Postini, Inc. if ($recv =~ /\[64\.18\.(?:[0-9]|1[0-5])\.\d+]\) by mx\d\.company\.com/) { $recv = $recv = $headerlist{'Received'}[++$recvnum]; #Received: from source ([208.95.48.65]) (using TLSv1) by # exprod5mx230.postini.com ([64.18.4.10]) with SMTP; Fri, 10 Jul my ($tmprelayip) = ($recv =~ /from source \(\[([\d.]+)\]\) (?:\(using TLSv1\) )?by exprod\dm[xo]b?\d+\.postini\.com/); $relayip = new NetAddr::IP $tmprelayip; } else { my ($tmprelayip) = ($recv =~ /\[([\d+.]+)\]\) by mx\d\.company\.com/); $relayip = new NetAddr::IP $tmprelayip; } # Hotmail/Windows Live Mail may originate or relay spam, but we can't blacklist them #Received: from blu0-omc4-s23.blu0.hotmail.com (blu0-omc4-s23.blu0.hotmail.com # [65.55.111.162]) by mx2.company.com (Postfix) # next if $recv =~ /from (?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \((?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \[65.5[2345].\d+\.\d+\]\) by mx\d\.company\.com/; #IP-Network 65.52.0.0/14 #IP-Network-Block 065.052.000.000 - 065.055.255.255 #Org-Name Microsoft Corp my $hotmail1 = new NetAddr::IP "65.52.0.0/14"; print "$.: $recv\n" if !defined ($relayip); next if $relayip->within($hotmail1); # AOL may originate or relay spam, but we can't blacklist them #Received: from omr-m33.mx.aol.com (omr-m33.mx.aol.com [64.12.143.145]) by # mx1.company.com (Postfix) with ESMTP id 7B9431C3255 for ; next if $recv =~ /from (?:omr|imo)-[dm]\d+\.mx\.aol\.com \((?:omr|imo)-[dm]\d+\.mx\.aol\.com \[[\d.]+\]\) by mx\d\.company\.com/; # Google may relay spam, GMail may originate it, but we can't blacklist them. #IP-Network 209.85.128.0/17 #IP-Network-Block 209.085.128.000 - 209.085.255.255 #Org-Name Google Inc. next if $recv =~ /\[209\.85\.(?:1(?:2[89]|[3-9]\d)|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/; #OrgName: Google Inc. #NetRange: 72.14.192.0 - 72.14.255.255 #CIDR: 72.14.192.0/18 next if $recv =~ /\[72\.14\.(?:19[2-9]|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/; # Yahoo! may ... yadda yadda yadda (geeze they've got a whack of netblocks for mail...) #IP-Network 98.136.0.0/14 #IP-Network-Block 098.136.000.000 - 098.139.255.255 #Org-Name Yahoo! Inc. my $yahoo1 = new NetAddr::IP "98.136.0.0/14"; next if $relayip->within($yahoo1); #IP-Network 66.196.64.0/18 #IP-Network-Block 066.196.064.000 - 066.196.127.255 #Org-Name Inktomi Corporation # Inktomi ~~ Yahoo! my $yahoo2 = new NetAddr::IP "66.196.64.0/18"; next if $relayip->within($yahoo2); #IP-Network 67.195.0.0/16 #IP-Network-Block 067.195.000.000 - 067.195.255.255 #Org-Name Yahoo! Inc. my $yahoo3 = new NetAddr::IP "67.195.0.0/16"; next if $relayip->within($yahoo3); #IP-Network 69.147.64.0/18 #IP-Network-Block 069.147.064.000 - 069.147.127.255 #Org-Name Yahoo! Inc. my $yahoo4 = new NetAddr::IP "69.147.64.0/18"; next if $relayip->within($yahoo4); #IP-Network 206.190.32.0/19 #IP-Network-Block 206.190.032.000 - 206.190.063.255 #Org-Name Yahoo! Broadcast Services, Inc. my $yahoo5 = new NetAddr::IP "206.190.32.0/18"; next if $relayip->within($yahoo5); #IP-Network 68.142.192.0/18 #IP-Network-Block 068.142.192.000 - 068.142.255.255 #Org-Name Inktomi Corporation my $yahoo6 = new NetAddr::IP "68.142.192.0/18"; next if $relayip->within($yahoo6); #IP-Network 216.252.96.0/19 #IP-Network-Block 216.252.096.000 - 216.252.127.255 #Org-Name Yahoo! Inc. my $yahoo7 = new NetAddr::IP "216.252.96.0/19"; next if $relayip->within($yahoo7); #inetnum: 124.83.128.0 - 124.83.255.255 #descr: Yahoo! Japan my $yahoo8 = new NetAddr::IP "124.83.128.0/17"; next if $relayip->within($yahoo8); # and the same goes for Bell Canada. *le sigh* #IP-Network 209.226.0.0/16 #IP-Network-Block 209.226.000.000 - 209.226.255.255 #Org-Name Bell Canada #Received: from tomts35-srv.bellnexxia.net (tomts35.bellnexxia.net # [209.226.175.109]) by mx2.company.com (Postfix) with ESMTP id B415C16752D for # ; Sat, 4 Jul 2009 10:48:24 -0400 (EDT) # hmm. tomts\d(-srv)?.bellnexxia.net only seem to be in .175/24. we'll just drop those ones for now... # especially since there appear to be hosted customers etc in the same ARIN allocation above. my $bell1 = new NetAddr::IP "209.226.175.0/24"; next if $relayip->within($bell1); # next if $recv =~ /\[209\.226\.175\.\d+\] by mx\d\.company\.com/; # ... and your little dog too! #IP-Network 208.82.236.0/22 #IP-Network-Block 208.082.236.000 - 208.082.239.255 #Org-Name Craigslist, Inc. my $craigslist1 = new NetAddr::IP "208.82.236.0/22"; next if $relayip->within($craigslist1); $iplist{$relayip->addr}++ if $relayip; # print "$recv\n"; # print "$relayip\n\n"; # print $imap->get_header($msgs[$i], "From"); print "\n"; # last if $i > 15; sleep 1; } # IMAP message iteration # mm. don't really need times on the IP lists #if ($opts{r}) { # print strftime("# %Y/%m/%d %H:%M", localtime())."\n"; #} foreach my $ip (sort keys %iplist) { $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/; if ($opts{r}) { print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n"; } else { print "$ip\t $iplist{$ip}\n"; } } if ($opts{u}) { print strftime("# %Y/%m/%d %H:%M", localtime())."\n"; } foreach my $uri (sort keys %urilist) { if ($opts{u}) { my @hout = qx { host $uri.multi.uribl.com; host $uri.uribl.company.com }; if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) { print "+$uri.uribl.company.com:127.0.0.2:900:::\n"; } } else { # URI plus count print "$uri: $urilist{$uri}\n"; } } $imap->close(); # Close IMAP connection cleanly. $imap->logout(); # integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away # irrelevant for SQL Bayes #my $sarebuild = `/usr/bin/sa-learn --rebuild`; #print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;