#!/usr/bin/perl
# Extract relay IP, URI hosts from mail in folder
# Hack-n-patch from:
# Heavily reworked from original:
# dmz@dmzs.com - March 19, 2004
# http://www.dmzs.com/tools/files/spam.phtml
# LGPL
# by:
# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
##wrapver 0.1
##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
# $Id: extract-data 2 2009-09-01 21:27:00Z kdeugau $
# $URL$

use strict;
use warnings;
use Time::Local;
use POSIX qw(strftime);
use IO::Socket::SSL;
use Mail::IMAPClient;
use Getopt::Std;
use NetAddr::IP;

# hmm. can't make (enough) sense of the docs.  gonna have to parse headers for IPs myself...  :/
# but we did find enough to extract the URIs...
use lib '/opt/spamassassin/share/perl/5.8.8';
use Mail::SpamAssassin;
use Mail::SpamAssassin::PerMsgStatus;

my %opts;
getopts("druv", \%opts);

my $debug = ($opts{d} ? 1 : 0);
my $sadebug = 0;	# sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
my $imapdebug = 0;	# so does Mail::IMAPClient...  as in, the whole content of all the mail you look at.  O_o
my $delete_after_learning = 0;	# set to 1 if you want to delete mail right away after learning
				# - note this makes it rather harder to deep-scan the messages to create local rules
my $verbose = ($opts{v} ? 1 : 0);

my $tmpdir = '/var/tmp';
my $salearn = '/usr/local/bin/sa-learn';
my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';

die "eeep!  $salearn doesn't exist!\n" if ! -e $salearn;

my $folder = 'reported.needsextraction';
#my $folder = 'confirmed';

# non-SSL IMAP settings:
#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
#                                  User => 'imapspamuser',
#                                  Password => 'imapspamuserpassword',
#                                  Debug => $imapdebug);

my $imap = Mail::IMAPClient->new(
	User => 'junkmail', 
	Password => 'k3c86z2',
	Socket   => IO::Socket::SSL->new(
		Proto    => 'tcp',
		PeerAddr => 'mail.company.com',
		PeerPort => 993, # IMAP over SSL standard port
		),
	Debug => $imapdebug,
	);

if (!defined($imap)) { die "IMAP Login Failed"; }

my $msgcount = $imap->message_count($folder);

# If debugging, print out the total counts for each mailbox
print $msgcount, " message(s) to process\n" if $debug;

## Process the spam mailbox
$imap->select($folder);
my @msgs = $imap->search("ALL");

# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
#my $spamtmp = "$tmpdir/spam.".time.".$$";  # this should give us a suitably pseudorandom directory
#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";

my $spamtest = Mail::SpamAssassin->new();
  # don't keep dereferencing this

my %iplist;
my %urilist;

for (my $i=0; $i<$msgcount; $i++) {
  my $msg = $imap->message_string($msgs[$i]);

  my $mail = $spamtest->parse($msg);
  my $status = $spamtest->check($mail);

# stolen from SA plugin bits
#sub parsed_metadata {
#  my ($self, $opts) = @_;
#  my $scanner = $status->{permsgstatus};

  my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};

  # Generate the full list of html-parsed domains.
  my $uris = $status->get_uri_detail_list();

#  print "$msgs[$i]\n";
  my %msguris;

  while (my($uri, $info) = each %{$uris}) {
    # we want to skip mailto: uris
    next if ($uri =~ /^mailto:/);

    # no domains were found via this uri, so skip
    next unless ($info->{domains});

    next if $info->{types}->{img};
    foreach ( keys %{ $info->{domains} } ) {
      if (exists $skip_domains->{$_}) {
        next;
      }
      $msguris{$_}++;
      #print "  ".$_."\n";
    }

  #  print "$uri ".$info->{anchor_text}."\n";
  }

  foreach (keys %msguris) {
#    print "  $_ $msguris{$_}\n";
    $urilist{$_}++;
  }

  # now, get the relay IP

  my %headerlist = %{$imap->parse_headers($msgs[$i], "Received")};
  my $recvnum = 0;
  my $recv = $headerlist{'Received'}[$recvnum];
  next if !$recv;

  my $relayip;

  #Received: from mail.company.com [ip.add.re.ss]
  #        by localhost with POP3 (fetchmail-6.2.5)
  #        for kdeugau@localhost (single-drop); Fri, 15 May 2009 11:45:10 -0400 (EDT)
  if ($recv =~ /from mail\.company\.com \[ip\.add\.re\.ss\]\s*by localhost with POP3 \(fetchmail/) {
    $recvnum += 1;
    $recv = $headerlist{'Received'}[$recvnum];
  }

  if ($recv =~ /^by mx\d\.company\.com \(Postfix, from userid \d+\)/) {
    $recvnum++;
    $recv = $headerlist{'Received'}[$recvnum];
  }

# le sigh.  gotta bypass a message if we can't parse the headers.  Outlook
# does an admirable job of mangling things for us.  >:(
  if ($recv !~ /by mx\d\.company\.com \(Postfix\)/) {
    print "phtui:  $recv\n";
    next;
  }

##fixme
# le sigh.  skip IP extraction on tagged spam reported as nonspam, since the real spam is a layer deeper.
next if $recv =~ /from localhost by mfs\d with SpamAssassin/;

# Postini puts the "real" received: header one layer further out - SA is configured to compensate for this so we do too
  #IP-Network                    64.18.0.0/20
  #IP-Network-Block              064.018.000.000 - 064.018.015.255
  #Org-Name                      Postini, Inc.
  if ($recv =~ /\[64\.18\.(?:[0-9]|1[0-5])\.\d+]\) by mx\d\.company\.com/) {
    $recv = $recv = $headerlist{'Received'}[++$recvnum];
#Received: from source ([208.95.48.65]) (using TLSv1) by
# exprod5mx230.postini.com ([64.18.4.10]) with SMTP; Fri, 10 Jul 
    my ($tmprelayip) = ($recv =~ /from source \(\[([\d.]+)\]\) (?:\(using TLSv1\) )?by exprod\dm[xo]b?\d+\.postini\.com/);
    $relayip = new NetAddr::IP $tmprelayip;
  } else {
    my ($tmprelayip) = ($recv =~ /\[([\d+.]+)\]\) by mx\d\.company\.com/);
    $relayip = new NetAddr::IP $tmprelayip;
  }

# Hotmail/Windows Live Mail may originate or relay spam, but we can't blacklist them
  #Received: from blu0-omc4-s23.blu0.hotmail.com (blu0-omc4-s23.blu0.hotmail.com
  # [65.55.111.162]) by mx2.company.com (Postfix)
#  next if $recv =~ /from (?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com 
\((?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \[65.5[2345].\d+\.\d+\]\) by mx\d\.company\.com/;
  #IP-Network                    65.52.0.0/14
  #IP-Network-Block              065.052.000.000 - 065.055.255.255
  #Org-Name                      Microsoft Corp
  my $hotmail1 = new NetAddr::IP "65.52.0.0/14";
print "$.: $recv\n" if !defined ($relayip);
  next if $relayip->within($hotmail1);

# AOL may originate or relay spam, but we can't blacklist them
  #Received: from omr-m33.mx.aol.com (omr-m33.mx.aol.com [64.12.143.145]) by
  # mx1.company.com (Postfix) with ESMTP id 7B9431C3255 for <webmaster@tyenet.com>;
  next if $recv =~ /from (?:omr|imo)-[dm]\d+\.mx\.aol\.com \((?:omr|imo)-[dm]\d+\.mx\.aol\.com \[[\d.]+\]\) by mx\d\.company\.com/;

# Google may relay spam, GMail may originate it, but we can't blacklist them.
  #IP-Network                    209.85.128.0/17
  #IP-Network-Block              209.085.128.000 - 209.085.255.255
  #Org-Name                      Google Inc.
  next if $recv =~ /\[209\.85\.(?:1(?:2[89]|[3-9]\d)|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
  #OrgName:    Google Inc.
  #NetRange:   72.14.192.0 - 72.14.255.255
  #CIDR:       72.14.192.0/18
  next if $recv =~ /\[72\.14\.(?:19[2-9]|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;

# Yahoo! may ... yadda yadda yadda  (geeze they've got a whack of netblocks for mail...)
  #IP-Network                    98.136.0.0/14
  #IP-Network-Block              098.136.000.000 - 098.139.255.255
  #Org-Name                      Yahoo! Inc.
  my $yahoo1 = new NetAddr::IP "98.136.0.0/14";
  next if $relayip->within($yahoo1);
  #IP-Network                    66.196.64.0/18
  #IP-Network-Block              066.196.064.000 - 066.196.127.255
  #Org-Name                      Inktomi Corporation
  # Inktomi ~~ Yahoo!
  my $yahoo2 = new NetAddr::IP "66.196.64.0/18";
  next if $relayip->within($yahoo2);
  #IP-Network                    67.195.0.0/16
  #IP-Network-Block              067.195.000.000 - 067.195.255.255
  #Org-Name                      Yahoo! Inc.
  my $yahoo3 = new NetAddr::IP "67.195.0.0/16";
  next if $relayip->within($yahoo3);
  #IP-Network                    69.147.64.0/18
  #IP-Network-Block              069.147.064.000 - 069.147.127.255
  #Org-Name                      Yahoo! Inc.
  my $yahoo4 = new NetAddr::IP "69.147.64.0/18";
  next if $relayip->within($yahoo4);
  #IP-Network                    206.190.32.0/19
  #IP-Network-Block              206.190.032.000 - 206.190.063.255
  #Org-Name                      Yahoo! Broadcast Services, Inc.
  my $yahoo5 = new NetAddr::IP "206.190.32.0/18";
  next if $relayip->within($yahoo5);
  #IP-Network                    68.142.192.0/18
  #IP-Network-Block              068.142.192.000 - 068.142.255.255
  #Org-Name                      Inktomi Corporation
  my $yahoo6 = new NetAddr::IP "68.142.192.0/18";
  next if $relayip->within($yahoo6);
  #IP-Network                    216.252.96.0/19
  #IP-Network-Block              216.252.096.000 - 216.252.127.255
  #Org-Name                      Yahoo! Inc.
  my $yahoo7 = new NetAddr::IP "216.252.96.0/19";
  next if $relayip->within($yahoo7);
  #inetnum:      124.83.128.0 - 124.83.255.255
  #descr:        Yahoo! Japan
  my $yahoo8 = new NetAddr::IP "124.83.128.0/17";
  next if $relayip->within($yahoo8);

# and the same goes for Bell Canada.  *le sigh*
#IP-Network                    209.226.0.0/16
#IP-Network-Block              209.226.000.000 - 209.226.255.255
#Org-Name                      Bell Canada
#Received: from tomts35-srv.bellnexxia.net (tomts35.bellnexxia.net
# [209.226.175.109]) by mx2.company.com (Postfix) with ESMTP id B415C16752D for
# <user@compnay.com>; Sat,  4 Jul 2009 10:48:24 -0400 (EDT)
# hmm.  tomts\d(-srv)?.bellnexxia.net only seem to be in .175/24.  we'll just drop those ones for now...
# especially since there appear to be hosted customers etc in the same ARIN allocation above.
  my $bell1 = new NetAddr::IP "209.226.175.0/24";
  next if $relayip->within($bell1);
#  next if $recv =~ /\[209\.226\.175\.\d+\] by mx\d\.company\.com/;

# ... and your little dog too!
#IP-Network                    208.82.236.0/22
#IP-Network-Block              208.082.236.000 - 208.082.239.255
#Org-Name                      Craigslist, Inc.
  my $craigslist1 = new NetAddr::IP "208.82.236.0/22";
  next if $relayip->within($craigslist1);

  $iplist{$relayip->addr}++ if $relayip;
#  print "$recv\n";
#  print "$relayip\n\n";
#  print $imap->get_header($msgs[$i], "From"); print "\n";


#  last if $i > 15;
  sleep 1;
} # IMAP message iteration

# mm.  don't really need times on the IP lists
#if ($opts{r}) {
#  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
#}
foreach my $ip (sort keys %iplist) {
  $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
  if ($opts{r}) {
    print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
  } else {
    print "$ip\t $iplist{$ip}\n";
  }
}

if ($opts{u}) {
  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
}
foreach my $uri (sort keys %urilist) {
  if ($opts{u}) {
    my @hout = qx { host $uri.multi.uribl.com; host $uri.uribl.company.com };
    if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
      print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
    }
  } else {
    # URI plus count
    print "$uri: $urilist{$uri}\n";
  }
}

$imap->close();

# Close IMAP connection cleanly.
$imap->logout();

# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
# irrelevant for SQL Bayes
#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
