#!/usr/bin/perl
# Extract relay IP, URI hosts from mail in folder
# Hack-n-patch from:
# Heavily reworked from original:
# dmz@dmzs.com - March 19, 2004
# http://www.dmzs.com/tools/files/spam.phtml
# LGPL
# by:
# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
##wrapver 0.1
##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
# $Id: extract-data 16 2010-01-20 22:35:34Z kdeugau $
# $URL$

use strict;
use warnings;
use Time::Local;
use POSIX qw(strftime);
use IO::Socket::SSL;
use Mail::IMAPClient;
use Getopt::Std;
use NetAddr::IP;

# hmm. can't make (enough) sense of the docs.  gonna have to parse headers for IPs myself...  :/
# but we did find enough to extract the URIs...
use lib '/opt/spamassassin/share/perl/5.8.8';
use Mail::SpamAssassin;
use Mail::SpamAssassin::PerMsgStatus;

my %opts;
getopts("druv", \%opts);

my $debug = ($opts{d} ? 1 : 0);
my $sadebug = 0;	# sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
my $imapdebug = 0;	# so does Mail::IMAPClient...  as in, the whole content of all the mail you look at.  O_o
my $delete_after_learning = 0;	# set to 1 if you want to delete mail right away after learning
				# - note this makes it rather harder to deep-scan the messages to create local rules
my $verbose = ($opts{v} ? 1 : 0);

my $tmpdir = '/var/tmp';
my $salearn = '/usr/local/bin/sa-learn';
my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';

die "eeep!  $salearn doesn't exist!\n" if ! -e $salearn;

my $folder = 'reported.needsextraction';
#my $folder = 'confirmed';

# non-SSL IMAP settings:
#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
#                                  User => 'imapspamuser',
#                                  Password => 'imapspamuserpassword',
#                                  Debug => $imapdebug);

my $imap = Mail::IMAPClient->new(
	User => 'junkmail', 
	Password => 'k3c86z2',
	Socket   => IO::Socket::SSL->new(
		Proto    => 'tcp',
		PeerAddr => 'mail.company.com',
		PeerPort => 993, # IMAP over SSL standard port
		),
	Debug => $imapdebug,
	);

if (!defined($imap)) { die "IMAP Login Failed"; }

my $msgcount = $imap->message_count($folder);

# If debugging, print out the total counts for each mailbox
print $msgcount, " message(s) to process\n" if $debug;

## Process the spam mailbox
$imap->select($folder);
my @msgs = $imap->search("ALL");

# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
#my $spamtmp = "$tmpdir/spam.".time.".$$";  # this should give us a suitably pseudorandom directory
#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";

my $spamtest = Mail::SpamAssassin->new();
  # don't keep dereferencing this

my %iplist;
my %urilist;

# put together an array of netblocks we won't/can't list for various reasons
my @dontlistme = (
    # Hotmail/Windows Live Mail
	NetAddr::IP->new("65.52.0.0/14"),

    # AOL - note only some IPs show mail-ish rDNS
    #IP-Network                    205.188.0.0/16
    #IP-Network                    64.12.0.0/16	
	NetAddr::IP->new("205.188.105.140/29"),
	NetAddr::IP->new("205.188.169.196/29"),
	NetAddr::IP->new("205.188.249.128/29"),
	NetAddr::IP->new("205.188.249.64/29"),
	NetAddr::IP->new("64.12.143.144/30"),

    # Google/GMail
	NetAddr::IP->new("209.85.128.0/17"),
	NetAddr::IP->new("72.14.192.0/18"),

    # Yahoo!/Inktomi
	NetAddr::IP->new("98.136.0.0/14"),
	NetAddr::IP->new("66.196.64.0/18"),
	NetAddr::IP->new("67.195.0.0/16"),
	NetAddr::IP->new("69.147.64.0/18"),
	NetAddr::IP->new("206.190.32.0/18"),
	NetAddr::IP->new("68.142.192.0/18"),
	NetAddr::IP->new("216.252.96.0/19"),
	NetAddr::IP->new("124.83.128.0/17"),
	NetAddr::IP->new("217.146.184.0/21"),
	NetAddr::IP->new("124.108.96.0/20"),
	NetAddr::IP->new("76.13.0.0/16"),
	NetAddr::IP->new("68.180.128.0/17"),
	NetAddr::IP->new("209.191.64.0/18"),
	NetAddr::IP->new("212.82.104.0/21"),
	NetAddr::IP->new("66.163.160.0/19"),

    # Bell Canada - note only some IPs show mail-ish rDNS
    #IP-Network                    209.226.0.0/16
    #IP-Network                    207.236.0.0/16
	NetAddr::IP->new("209.226.175.0/24"),
	NetAddr::IP->new("207.236.237.0/26"),

    # Craigslist
    #IP-Network                    208.82.236.0/22
	NetAddr::IP->new("208.82.236.0/22"),

    # Apple.com/mac.com - note only some IPs show mail-ish rDNS
    #IP-Network                    17.0.0.0/8
    # asmtpout0(11-30).mac.com
    # 17.148.16.	011 -> 86	030 -> 105
	NetAddr::IP->new("17.148.16.64/26"),

    # Vodafone - note only some IPs show mail-ish rDNS
    #route:        212.183.128.0/19
	NetAddr::IP->new("212.183.156.224/29"),

    # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
    #IP-Network                    24.222.0.0/16
	NetAddr::IP->new("24.222.0.30"),
	NetAddr::IP->new("24.224.136.8/30"),

    # Cogeco - only a few IPs observed with mail-ish rDNS
    #IP-Network                    216.221.64.0/19
	NetAddr::IP->new("216.221.81.192"),
	NetAddr::IP->new("216.221.81.96/30"),

    # UAlberta - only one IP observed with mail-ish rDNS
    #IP-Network                    129.128.0.0/16
	NetAddr::IP->new("129.128.5.19"),

    # seznam.cz
    #route:          77.75.72.0/23
	NetAddr::IP->new("77.75.72.44"),

    ); # done def for @dontlistme

MSG: for (my $i=0; $i<$msgcount; $i++) {
  my $msg = $imap->message_string($msgs[$i]);

  my $mail = $spamtest->parse($msg);
  my $status = $spamtest->check($mail);

# stolen from SA plugin bits
#sub parsed_metadata {
#  my ($self, $opts) = @_;
#  my $scanner = $status->{permsgstatus};

  my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};

  # Generate the full list of html-parsed domains.
  my $uris = $status->get_uri_detail_list();

#  print "$msgs[$i]\n";
  my %msguris;

  while (my($uri, $info) = each %{$uris}) {
    # we want to skip mailto: uris
    next if ($uri =~ /^mailto:/);

    # no domains were found via this uri, so skip
    next unless ($info->{domains});

    next if $info->{types}->{img};
    foreach ( keys %{ $info->{domains} } ) {
      if (exists $skip_domains->{$_}) {
        next;
      }
      $msguris{$_}++;
      #print "  ".$_."\n";
    }

  #  print "$uri ".$info->{anchor_text}."\n";
  }

  foreach (keys %msguris) {
#    print "  $_ $msguris{$_}\n";
    $urilist{$_}++;
  }

  # now, get the relay IP

  my $stmsg = $status->get_message();
  my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};

  my $relayip = new NetAddr::IP $untrusted[0]->{ip};

# sigh.  messages tagged as spam already make life difficult by hiding the
# old Received: headers.  We'll just handle them manually for now.
  if (!$relayip) {
    print "phtui: ";
    my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
    my $recvnum = 0;
    my $recv = $headerlist{'Received'}[$recvnum];
    print "$recv\n";
    print "   $headerlist{'Subject'}[0]\n";
    next MSG;
  }

  foreach my $block (@dontlistme) {
    next MSG if $relayip->within($block);
  }

  $iplist{$relayip->addr}++ if $relayip;

#  last if $i > 2;
  sleep 1;
} # IMAP message iteration

# mm.  don't really need times on the IP lists
#if ($opts{r}) {
#  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
#}
foreach my $ip (sort keys %iplist) {
  $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
  if ($opts{r}) {
    print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
  } else {
    print "$ip\t $iplist{$ip}\n";
  }
}

if ($opts{u}) {
  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
}
foreach my $uri (sort keys %urilist) {
  my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.uribl.company.com' };
  if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
    if ($opts{u}) {
      print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
    } else {
      # URI plus count
      print "$uri\t$urilist{$uri}\n";
    }
  }
}

$imap->close();

# Close IMAP connection cleanly.
$imap->logout();

# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
# irrelevant for SQL Bayes
#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
