#!/usr/bin/perl
# Extract relay IP, URI hosts from mail in folder
# Hack-n-patch from:
# Heavily reworked from original:
# dmz@dmzs.com - March 19, 2004
# http://www.dmzs.com/tools/files/spam.phtml
# LGPL
# by:
# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
##wrapver 0.1
##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
# $Id: extract-data 30 2011-01-25 22:10:03Z kdeugau $
# $URL$

use strict;
use warnings;
use Time::Local;
use POSIX qw(strftime);
use IO::Socket::SSL;
use Mail::IMAPClient;
use Getopt::Std;
use NetAddr::IP;

# hmm. can't make (enough) sense of the docs.  gonna have to parse headers for IPs myself...  :/
# but we did find enough to extract the URIs...
use lib '/opt/spamassassin/share/perl/5.10.0';
use Mail::SpamAssassin;
use Mail::SpamAssassin::PerMsgStatus;

my %opts;
getopts("druv", \%opts);

my $debug = ($opts{d} ? 1 : 0);
my $sadebug = 0;	# sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
my $imapdebug = 0;	# so does Mail::IMAPClient...  as in, the whole content of all the mail you look at.  O_o
my $delete_after_learning = 0;	# set to 1 if you want to delete mail right away after learning
				# - note this makes it rather harder to deep-scan the messages to create local rules
my $verbose = ($opts{v} ? 1 : 0);

my $tmpdir = '/var/tmp';
my $salearn = '/usr/local/bin/sa-learn';
my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';

die "eeep!  $salearn doesn't exist!\n" if ! -e $salearn;

my $folder = 'reported.needsextraction';
#my $folder = 'confirmed';

# non-SSL IMAP settings:
#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
#                                  User => 'imapspamuser',
#                                  Password => 'imapspamuserpassword',
#                                  Debug => $imapdebug);

my $imap = Mail::IMAPClient->new(
	User => 'junkmail', 
	Password => 'k3c86z2',
	Socket   => IO::Socket::SSL->new(
		Proto    => 'tcp',
		PeerAddr => 'mail.company.com',
		PeerPort => 993, # IMAP over SSL standard port
		),
	Debug => $imapdebug,
	)

	or die "IMAP login failed: $@\n";

if (!defined($imap)) { die "IMAP Login Failed"; }

my $msgcount = $imap->message_count($folder);

# If debugging, print out the total counts for each mailbox
print $msgcount, " message(s) to process\n" if $debug;

## Process the spam mailbox
$imap->select($folder);
my @msgs = $imap->search("ALL");

# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
#my $spamtmp = "$tmpdir/spam.".time.".$$";  # this should give us a suitably pseudorandom directory
#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";

my $spamtest = Mail::SpamAssassin->new();
  # don't keep dereferencing this

my %iplist;
my %urilist;

# put together an array of netblocks we won't/can't list for various reasons
my @dontlistme = (
    # Microsoft/Hotmail/Windows Live Mail
#IP-Network                    207.68.128.0/18
#IP-Network                    207.68.192.0/20
#IP-Network-Block              207.068.128.000 - 207.068.207.255
	NetAddr::IP->new("65.52.0.0/14"),
	NetAddr::IP->new("207.68.176.96/27"),
#IP-Network                    157.54.0.0/15
#IP-Network                    157.56.0.0/14
#IP-Network                    157.60.0.0/16
	NetAddr::IP->new("157.55.0.192/27"),
	NetAddr::IP->new("157.55.0.225"),
	NetAddr::IP->new("157.55.0.226"),
#IP-Network                    207.46.0.0/16
	NetAddr::IP->new("207.46.66.0/28"),

    # AOL - note only some IPs show mail-ish rDNS
    #IP-Network                    205.188.0.0/16
    #IP-Network                    64.12.0.0/16	
	NetAddr::IP->new("64.12.78.136/30"),
	NetAddr::IP->new("64.12.78.142"),
	NetAddr::IP->new("64.12.100.31"),
	NetAddr::IP->new("64.12.143.144/30"),
	NetAddr::IP->new("64.12.143.152/30"),
	NetAddr::IP->new("64.12.206.39"),
	NetAddr::IP->new("64.12.206.40/30"),
	NetAddr::IP->new("64.12.207.128/27"),
	NetAddr::IP->new("64.12.207.144/29"),
	NetAddr::IP->new("64.12.207.152/30"),
	NetAddr::IP->new("64.12.207.160/28"),
	NetAddr::IP->new("64.12.207.176/29"),
    #IP-Network                    205.188.0.0/16
	NetAddr::IP->new("205.188.91.94/31"),
	NetAddr::IP->new("205.188.91.96/31"),
	NetAddr::IP->new("205.188.105.143"),
	NetAddr::IP->new("205.188.105.144/30"),
	NetAddr::IP->new("205.188.169.196/29"),
	NetAddr::IP->new("205.188.249.128/29"),
	NetAddr::IP->new("205.188.249.64/29"),
	NetAddr::IP->new("205.188.169.200/23"),

    # Google/GMail
	NetAddr::IP->new("209.85.128.0/17"),
	NetAddr::IP->new("72.14.192.0/18"),
	NetAddr::IP->new("74.125.0.0/16"),

    # Yahoo!/Inktomi
	NetAddr::IP->new("98.136.0.0/14"),
	NetAddr::IP->new("66.196.64.0/18"),
	NetAddr::IP->new("67.195.0.0/16"),
	NetAddr::IP->new("69.147.64.0/18"),
	NetAddr::IP->new("206.190.32.0/18"),
	NetAddr::IP->new("68.142.192.0/18"),
	NetAddr::IP->new("216.252.96.0/19"),
	NetAddr::IP->new("124.83.128.0/17"),
	NetAddr::IP->new("217.146.184.0/21"),
	NetAddr::IP->new("124.108.96.0/20"),
	NetAddr::IP->new("76.13.0.0/16"),
	NetAddr::IP->new("68.180.128.0/17"),
	NetAddr::IP->new("209.191.64.0/18"),
	NetAddr::IP->new("212.82.104.0/21"),
	NetAddr::IP->new("66.163.160.0/19"),
#inetnum:        87.248.110.0 - 87.248.111.255
#route:          87.248.104.0/21
	NetAddr::IP->new("87.248.110.0/23"),
	NetAddr::IP->new("203.188.200.0/22"),
	NetAddr::IP->new("217.12.0.0/20"),
	NetAddr::IP->new("77.238.184.0/23"),
#IP-Network                    74.6.0.0/16
	NetAddr::IP->new("74.6.114.48/24"),
	NetAddr::IP->new("74.6.228.32/27"),
	NetAddr::IP->new("74.6.228.64/26"),
	NetAddr::IP->new("202.165.96.0/21"),
#route:          87.248.112.0/21
#inetnum:        87.248.114.0 - 87.248.115.255
	NetAddr::IP->new("87.248.114.0/24"),
#route:          116.214.0.0/20
	NetAddr::IP->new("116.214.12.0/24"),
	NetAddr::IP->new("202.86.4.0/22"),
	NetAddr::IP->new("77.238.188/23"),
	NetAddr::IP->new("217.146.182.0/23"),
	NetAddr::IP->new("114.111.64.0/18"),
	NetAddr::IP->new("115.178.12.0/23"),
	NetAddr::IP->new("121.101.151.212"),
#inetnum:      121.101.144.0 - 121.101.159.255
	NetAddr::IP->new("121.101.144.0/20"),
	NetAddr::IP->new("66.94.224.0/19"),
	NetAddr::IP->new("203.104.16.0/21"),

    # MessageLabs - may add these to trusted_networks instead
	NetAddr::IP->new("85.158.139.0/24"),
# observed: .35, .51
	NetAddr::IP->new("194.106.220.0/23"),
	NetAddr::IP->new("193.109.254.0/23"),
	NetAddr::IP->new("119.161.0.0/19"),
# buh? rDNS says Yahoo, but...
#inetnum:        203.209.224.0 - 203.209.255.255
#descr:          Alibaba (Beijing) Technology Co., Ltd.
	NetAddr::IP->new("203.209.230.22"),

    # Bell Canada - note only some IPs show mail-ish rDNS
    #IP-Network                    209.226.0.0/16
    #IP-Network                    207.236.0.0/16
	NetAddr::IP->new("209.226.175.0/24"),
	NetAddr::IP->new("207.236.237.0/26"),

    # Craigslist
    #IP-Network                    208.82.236.0/22
	NetAddr::IP->new("208.82.236.0/22"),

    # Apple.com/mac.com - note only some IPs show mail-ish rDNS
    #IP-Network                    17.0.0.0/8
    # asmtpout0(11-30).mac.com
    # 17.148.16.	011 -> 86	030 -> 105
	NetAddr::IP->new("17.148.16.64/26"),

    # Vodafone - note only some IPs show mail-ish rDNS
    #route:        212.183.128.0/19
	NetAddr::IP->new("212.183.156.224/29"),

    # Facebook - only exclude mail-ish hostnames
    #IP-Network                    69.63.176.0/20
	# .178.160/27 plus a few more to 199 show mailish rDNS
	NetAddr::IP->new("69.63.178.160/27"),
	NetAddr::IP->new("69.63.178.192/29"),
    #IP-Network                    66.220.144.0/20
	NetAddr::IP->new("66.220.144.128/26"),

    # IBM Lotus Live - rdns mostly mail
    #CIDR:       8.12.152.0/24
	NetAddr::IP->new("8.12.152.0/24"),

# ISPs
    # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
    #IP-Network                    24.222.0.0/16
	NetAddr::IP->new("24.222.0.30"),
	NetAddr::IP->new("24.224.136.0/27"),

    # Cogeco - only a few IPs observed with mail-ish rDNS
    #IP-Network                    216.221.64.0/19
	NetAddr::IP->new("216.221.81.192"),
	NetAddr::IP->new("216.221.81.25"),
	NetAddr::IP->new("216.221.81.28/30"),
	NetAddr::IP->new("216.221.81.96/30"),
    # seznam.cz
    #route:          77.75.72.0/23
	NetAddr::IP->new("77.75.72.44"),

    # T-Online - only mail-ish hostnames
#route:        194.25.0.0/16
#inetnum:      194.25.134.0 - 194.25.134.255
#netname:      DTOS-ULM-001
#descr:        www.t-online.de
#descr:        mail.t-online.de
#descr:        php.t-online.de
	NetAddr::IP->new("194.25.134.80/29"),

    # Telestra (Australia)
    #inetnum:      61.9.128.0 - 61.9.255.255
    # may miss one or two
	# .168.135-152
	NetAddr::IP->new("61.9.168.136/29"),
	NetAddr::IP->new("61.9.168.144/29"),
	# .189.132-152
	NetAddr::IP->new("61.9.189.132/30"),
	NetAddr::IP->new("61.9.189.136/29"),
	NetAddr::IP->new("61.9.189.144/29"),
    # "TelestraClear" - related to above?
    #inetnum:      203.97.0.0 - 203.97.127.255
	NetAddr::IP->new("203.97.33.64"),
	NetAddr::IP->new("203.97.33.68"),
	NetAddr::IP->new("203.97.37.64"),

    # Earthlink
    #IP-Network                    207.69.0.0/16
	NetAddr::IP->new("207.69.200.28"),

    # Sprint
    #IP-Network                    68.24.0.0/13
	# actually their PCS mail relay, so sez rDNS
	NetAddr::IP->new("68.28.27.84"),

    # Vodafone (New Zealand)
    # inetnum:        203.109.128.0 - 203.109.159.255
	NetAddr::IP->new("203.109.136/28"),

## Edumactional places

    # UAlberta - only one IP observed with mail-ish rDNS
    #IP-Network                    129.128.0.0/16
	NetAddr::IP->new("129.128.5.19"),

    # Queens University
    #IP-Network                    130.15.0.0/16
	NetAddr::IP->new("130.15.241.183"),

    # University of Maryland Baltimore County - mail servers only
    #CIDR:       130.85.0.0/16
	NetAddr::IP->new("130.85.25.76/30"),

    # University of Florida
    #IP-Network                    128.227.0.0/16
	NetAddr::IP->new("128.227.74.70"),
	NetAddr::IP->new("128.227.74.149"),
	NetAddr::IP->new("128.227.74.165"),

    # University of Texas
    #IP-Network                    129.109.0.0/16
	NetAddr::IP->new("129.109.195.0/28"),

    # Roxbury Community College
    #IP-Network                    209.104.233.0/24
	NetAddr::IP->new("209.104.233.247"),

    # Stanford University
    #IP-Network                    171.64.0.0/14
	NetAddr::IP->new("171.67.219.80/30"),

    # Cleveland State University
    #IP-Network                    137.148.0.0/16
	NetAddr::IP->new("137.148.18.13/32"),

    # University of Missouri - dba the Missouri Research and Education Network (MOREnet)
    #CIDR:       204.184.0.0/15
	NetAddr::IP->new("204.185.165.125"),

    # Yale University
    #IP-Network                    130.132.0.0/16
	NetAddr::IP->new("130.132.50.7"),

    # Rutgers University
    #IP-Network                    165.230.0.0/16
	NetAddr::IP->new("165.230.151.182"),

    # Clemson University
    #IP-Network                    130.127.0.0/16
	NetAddr::IP->new("130.127.235.21"),

    # Wilkes University
    #IP-Network                    146.94.0.0/16
	NetAddr::IP->new("146.94.192.152"),

    # Virginia Community College System
    #IP-Network                    164.106.0.0/16
	NetAddr::IP->new("164.106.130.251"),

    # University of Virginia
    #IP-Network                    199.111.0.0/16
    # Mary Washington College
    #IP-Network                    199.111.64.0/19
	NetAddr::IP->new("199.111.84.18"),

    #inetnum:        130.95.0.0 - 130.95.255.255
    #address:        The University of Western Australia
	NetAddr::IP->new("130.95.3.211"),

    #inetnum:      130.56.0.0 - 130.56.255.255
    #address:      IIS, Australian National University
	NetAddr::IP->new("130.56.64.134"),

    #IP-Network                    129.81.0.0/16
    #Org-Name                      Tulane University
	NetAddr::IP->new("129.81.224.84"),

# List servers
    # Debian listserver - relays spam, so we can't list it.
	NetAddr::IP->new("82.195.75.100"),

# ESPs
    # moderately abusable, but mostly legit
    # Org-Name	iContact (Broadwick Corp./Preation Inc.)
	NetAddr::IP->new("216.27.93.0/25"),

    # are these guys an ESP?  rDNS in many blocks shows secureserver.net, with SMTPish overtones
    #Org-Name                      GoDaddy.com, Inc.
    #IP-Network                    64.202.160.0/19
	NetAddr::IP->new("64.202.160.0/24"),
	NetAddr::IP->new("64.202.165.0/26"),
	NetAddr::IP->new("64.202.165.180/30"),
	NetAddr::IP->new("64.202.165.192/29"),
	NetAddr::IP->new("64.202.165.224/28"),
    #IP-Network                    72.167.0.0/16
	NetAddr::IP->new("72.167.82.80/29"),
	NetAddr::IP->new("72.167.82.90"),
    #IP-Network                    173.201.0.0/16
	NetAddr::IP->new("173.201.192.0/24"),

    # EmailBrain - note no actual netblocks of their own.  :(
	NetAddr::IP->new("66.100.171.162"),
#163.171.100.66.in-addr.arpa domain name pointer eb08.ebhost9.com.
#164.171.100.66.in-addr.arpa domain name pointer a.eb08.ebhost9.com.
#165.171.100.66.in-addr.arpa domain name pointer b.eb08.ebhost9.com.
#166.171.100.66.in-addr.arpa domain name pointer c.eb08.ebhost9.com.


# eBay/PayPal
	NetAddr::IP->new("66.211.160.0/19"),

# Misc legit
    # Texas Instruments
    #IP-Network                    192.94.94.0/24
	NetAddr::IP->new("192.94.94.40"),

    ); # done def for @dontlistme

MSG: for (my $i=0; $i<$msgcount; $i++) {
  my $msg = $imap->message_string($msgs[$i]);

  my $mail = $spamtest->parse($msg);
  my $status = $spamtest->check($mail);

# stolen from SA plugin bits
#sub parsed_metadata {
#  my ($self, $opts) = @_;
#  my $scanner = $status->{permsgstatus};

  my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};

  # Generate the full list of html-parsed domains.
  my $uris = $status->get_uri_detail_list();

#  print "$msgs[$i]\n";
  my %msguris;

  while (my($uri, $info) = each %{$uris}) {
    # we want to skip mailto: uris
    next if ($uri =~ /^mailto:/);

    # no domains were found via this uri, so skip
    next unless ($info->{domains});

    next if $info->{types}->{img};
    foreach ( keys %{ $info->{domains} } ) {
      if (exists $skip_domains->{$_}) {
        next;
      }
      $msguris{$_}++;
      #print "  ".$_."\n";
    }

  #  print "$uri ".$info->{anchor_text}."\n";
  }

  foreach (keys %msguris) {
#    print "  $_ $msguris{$_}\n";
    $urilist{$_}++;
  }

  # now, get the relay IP

  my $stmsg = $status->get_message();
  my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};

  my $relayip = new NetAddr::IP $untrusted[0]->{ip};

# sigh.  messages tagged as spam already make life difficult by hiding the
# old Received: headers.  We'll just handle them manually for now.
  if (!$relayip) {
    print "phtui: ";
    my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
    my $recvnum = 0;
    my $recv = $headerlist{'Received'}[$recvnum];
    print "$recv\n";
    print "   $headerlist{'Subject'}[0]\n";
    next MSG;
  }

  foreach my $block (@dontlistme) {
    next MSG if $relayip->within($block);
  }

  $iplist{$relayip->addr}++ if $relayip;

#  last if $i > 2;
  sleep 1;
} # IMAP message iteration

# mm.  don't really need times on the IP lists
#if ($opts{r}) {
#  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
#}
foreach my $ip (sort keys %iplist) {
  $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
  if ($opts{r}) {
    print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
  } else {
    print "$ip\t $iplist{$ip}\n";
  }
}

if ($opts{u}) {
  print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
}
foreach my $uri (sort keys %urilist) {
  my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.company.dnsbl.' };
  if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
    if ($opts{u}) {
      print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
    } else {
      # URI plus count
      print "$uri\t$urilist{$uri}\n";
    }
  }
}

$imap->close();

# Close IMAP connection cleanly.
$imap->logout();

# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
# irrelevant for SQL Bayes
#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
