Index: /trunk/dnsbl/extract-data
===================================================================
--- /trunk/dnsbl/extract-data	(revision 12)
+++ /trunk/dnsbl/extract-data	(revision 13)
@@ -87,5 +87,73 @@
 my %urilist;
 
-for (my $i=0; $i<$msgcount; $i++) {
+# Netblocks we won't/can't list (big mail providers and the like); each message's relay IP is checked against these
+my @dontlistme = (
+    # Hotmail/Windows Live Mail
+    NetAddr::IP->new("65.52.0.0/14"),
+
+    # AOL - note only some IPs show mail-ish rDNS
+    #IP-Network                    205.188.0.0/16
+    #IP-Network                    64.12.0.0/16
+    NetAddr::IP->new("205.188.105.140/29"),   # not written on the /29 boundary; masks to .136 - .143
+    NetAddr::IP->new("205.188.169.196/29"),   # not written on the /29 boundary; masks to .192 - .199
+    NetAddr::IP->new("205.188.249.128/29"),
+    NetAddr::IP->new("205.188.249.64/29"),
+
+    # Google/GMail
+    NetAddr::IP->new("209.85.128.0/17"),
+    NetAddr::IP->new("72.14.192.0/18"),
+
+    # Yahoo!/Inktomi
+    NetAddr::IP->new("98.136.0.0/14"),
+    NetAddr::IP->new("66.196.64.0/18"),
+    NetAddr::IP->new("67.195.0.0/16"),
+    NetAddr::IP->new("69.147.64.0/18"),
+    NetAddr::IP->new("206.190.32.0/19"),      # registry block is a /19; a /18 here would mask to 206.190.0.0/18
+    NetAddr::IP->new("68.142.192.0/18"),
+    NetAddr::IP->new("216.252.96.0/19"),
+    NetAddr::IP->new("124.83.128.0/17"),
+    NetAddr::IP->new("217.146.184.0/21"),
+    NetAddr::IP->new("124.108.96.0/20"),
+    NetAddr::IP->new("76.13.0.0/16"),
+    NetAddr::IP->new("68.180.128.0/17"),
+    NetAddr::IP->new("209.191.64.0/18"),
+    NetAddr::IP->new("212.82.104.0/21"),
+    NetAddr::IP->new("66.163.160.0/19"),
+
+    # Bell Canada - note only some IPs show mail-ish rDNS
+    #IP-Network                    209.226.0.0/16
+    #IP-Network                    207.236.0.0/16
+    NetAddr::IP->new("209.226.175.0/24"),
+    NetAddr::IP->new("207.236.237.0/26"),
+
+    # Craigslist
+    #IP-Network                    208.82.236.0/22
+    NetAddr::IP->new("208.82.236.0/22"),
+
+    # Apple.com/mac.com - note only some IPs show mail-ish rDNS
+    #IP-Network                    17.0.0.0/8
+    # asmtpout0(11-30).mac.com
+    # 17.148.16.x:  host 011 -> .86,  host 030 -> .105
+    NetAddr::IP->new("17.148.16.64/26"),
+
+    # Vodafone - note only some IPs show mail-ish rDNS
+    #route:        212.183.128.0/19
+    NetAddr::IP->new("212.183.156.224/29"),
+
+    # Eastlink (formerly Persona [Sudbury etc]) - only one IP observed with mail-ish rDNS
+    #IP-Network                    24.222.0.0/16
+    NetAddr::IP->new("24.222.0.30"),
+
+    # Cogeco - only a few IPs observed with mail-ish rDNS
+    #IP-Network                    216.221.64.0/19
+    NetAddr::IP->new("216.221.81.192"),
+    NetAddr::IP->new("216.221.81.96/30"),
+
+    # UAlberta - only one IP observed with mail-ish rDNS
+    #IP-Network                    129.128.0.0/16
+    NetAddr::IP->new("129.128.5.19"),
+); # done def for @dontlistme
+
+MSG: for (my $i=0; $i<$msgcount; $i++) {
   my $msg = $imap->message_string($msgs[$i]);
 
@@ -135,221 +203,13 @@
   my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
 
-  my $sa_intip = new NetAddr::IP $untrusted[0]->{ip};
-
-
-  my %headerlist = %{$imap->parse_headers($msgs[$i], "Received")};
-  my $recvnum = 0;
-  my $recv = $headerlist{'Received'}[$recvnum];
-  next if !$recv;
-
-  my $relayip;
-
-  #Received: from mail.company.com [ip.add.re.ss]
-  #        by localhost with POP3 (fetchmail-6.2.5)
-  #        for kdeugau@localhost (single-drop); Fri, 15 May 2009 11:45:10 -0400 (EDT)
-  if ($recv =~ /from mail\.company\.com \[ip\.add\.re\.ss\]\s*by localhost with POP3 \(fetchmail/) {
-    $recvnum += 1;
-    $recv = $headerlist{'Received'}[$recvnum];
-  }
-
-  if ($recv =~ /^by mx\d\.company\.com \(Postfix, from userid \d+\)/) {
-    $recvnum++;
-    $recv = $headerlist{'Received'}[$recvnum];
-  }
-
-# le sigh.  gotta bypass a message if we can't parse the headers.  Outlook
-# does an admirable job of mangling things for us.  >:(
-  if ($recv !~ /by mx\d\.company\.com \(Postfix\)/) {
-    print "phtui:  $recv\n";
-    next;
-  }
-
-##fixme
-# le sigh.  skip IP extraction on tagged spam reported as nonspam, since the real spam is a layer deeper.
-next if $recv =~ /from localhost by mfs\d with SpamAssassin/;
-
-# Postini puts the "real" received: header one layer further out - SA is configured to compensate for this so we do too
-  #IP-Network                    64.18.0.0/20
-  #IP-Network-Block              064.018.000.000 - 064.018.015.255
-  #Org-Name                      Postini, Inc.
-  if ($recv =~ /\[64\.18\.(?:[0-9]|1[0-5])\.\d+]\) by mx\d\.company\.com/) {
-    $recv = $recv = $headerlist{'Received'}[++$recvnum];
-#Received: from source ([208.95.48.65]) (using TLSv1) by
-# exprod5mx230.postini.com ([64.18.4.10]) with SMTP; Fri, 10 Jul 
-    my ($tmprelayip) = ($recv =~ /from source \(\[([\d.]+)\]\) (?:\(using TLSv1\) )?by (?:exprod\dm[xo]b?|chipmx)\d+\.postini\.com/);
-    $relayip = new NetAddr::IP $tmprelayip;
-
-  } elsif ($recv =~ /\[137\.82\.45\.(?:[0-9]|1[0-5])\]\) by mx\d\.company\.com/) {
-# Customer with (spam)forwarding from UBC - enough to justify this code
-  #IP-Network                    137.82.0.0/16
-  #IP-Network-Block              137.082.000.000 - 137.082.255.255
-  #Org-Name                      University of British Columbia
-  # only 137.82.45.0/28 or so seem to be outbound relays (duh)
-    $recv = $recv = $headerlist{'Received'}[++$recvnum];
-#Received: from bcnbib.gov.ar (200-42-22-14.dup.prima.net.ar [200.42.22.14])
-#  by mr4.mail-relay.ubc.ca (Postfix)
-    my ($tmprelayip) = ($recv =~ /from \[?[a-zA-Z0-9._-]+\]? \([a-zA-Z0-9._-]+ \[([\d+.]+)\]\) by mr\d\.mail-relay\.ubc\.ca \(Postfix\)/);
-    $relayip = new NetAddr::IP $tmprelayip;
-
-  } else {
-    my ($tmprelayip) = ($recv =~ /\[([\d+.]+)\]\) by mx\d\.company\.com/);
-    $relayip = new NetAddr::IP $tmprelayip;
-  }
-
-print "eep, no ip from manual extraction\n$recv\n" if !$relayip;
-print "SA vs manual extraction, relay IP mismatch: $sa_intip vs $relayip on\n\t$recv\n" if $sa_intip != $relayip;
-
-# Hotmail/Windows Live Mail may originate or relay spam, but we can't blacklist them
-  #Received: from blu0-omc4-s23.blu0.hotmail.com (blu0-omc4-s23.blu0.hotmail.com
-  # [65.55.111.162]) by mx2.company.com (Postfix)
-#  next if $recv =~ /from (?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com 
-\((?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \[65.5[2345].\d+\.\d+\]\) by mx\d\.company\.com/;
-  #IP-Network                    65.52.0.0/14
-  #IP-Network-Block              065.052.000.000 - 065.055.255.255
-  #Org-Name                      Microsoft Corp
-  my $hotmail1 = new NetAddr::IP "65.52.0.0/14";
-print "$.: $recv\n" if !defined ($relayip);
-  next if $relayip->within($hotmail1);
-
-# AOL may originate or relay spam, but we can't blacklist them
-  #Received: from omr-m33.mx.aol.com (omr-m33.mx.aol.com [64.12.143.145]) by
-  # mx1.company.com (Postfix) with ESMTP id 7B9431C3255 for <webmaster@tyenet.com>;
-  next if $recv =~ /from (?:omr|im[or])-[dm][ab]?\d+\.mx\.aol\.com \((?:omr|im[or])-[dm][ba]?\d+\.mx\.aol\.com \[[\d.]+\]\) by mx\d\.company\.com/;
-
-# Google may relay spam, GMail may originate it, but we can't blacklist them.
-  #IP-Network                    209.85.128.0/17
-  #IP-Network-Block              209.085.128.000 - 209.085.255.255
-  #Org-Name                      Google Inc.
-  next if $recv =~ /\[209\.85\.(?:1(?:2[89]|[3-9]\d)|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
-  #OrgName:    Google Inc.
-  #NetRange:   72.14.192.0 - 72.14.255.255
-  #CIDR:       72.14.192.0/18
-  next if $recv =~ /\[72\.14\.(?:19[2-9]|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
-
-# Yahoo! may ... yadda yadda yadda  (geeze they've got a whack of netblocks for mail...)
- #IP-Network                    98.136.0.0/14
-  my $yahoo1 = new NetAddr::IP "98.136.0.0/14";
-  next if $relayip->within($yahoo1);
- #IP-Network                    66.196.64.0/18
- #Org-Name                      Inktomi Corporation
- # Inktomi ~~ Yahoo!
-  my $yahoo2 = new NetAddr::IP "66.196.64.0/18";
-  next if $relayip->within($yahoo2);
- #IP-Network                    67.195.0.0/16
-  my $yahoo3 = new NetAddr::IP "67.195.0.0/16";
-  next if $relayip->within($yahoo3);
- #IP-Network                    69.147.64.0/18
-  my $yahoo4 = new NetAddr::IP "69.147.64.0/18";
-  next if $relayip->within($yahoo4);
- #IP-Network                    206.190.32.0/19
- #Org-Name                      Yahoo! Broadcast Services, Inc.
-  my $yahoo5 = new NetAddr::IP "206.190.32.0/18";
-  next if $relayip->within($yahoo5);
- #IP-Network                    68.142.192.0/18
- #Org-Name                      Inktomi Corporation
-  my $yahoo6 = new NetAddr::IP "68.142.192.0/18";
-  next if $relayip->within($yahoo6);
- #IP-Network                    216.252.96.0/19
-  my $yahoo7 = new NetAddr::IP "216.252.96.0/19";
-  next if $relayip->within($yahoo7);
- #inetnum:      124.83.128.0 - 124.83.255.255
-  my $yahoo8 = new NetAddr::IP "124.83.128.0/17";
-  next if $relayip->within($yahoo8);
- #inetnum:        217.146.184.0 - 217.146.191.47
-  my $yahoo9 = new NetAddr::IP "217.146.184.0/21";
-  next if $relayip->within($yahoo9);
- #inetnum:      124.108.96.0 - 124.108.111.255
-  my $yahoo10 = new NetAddr::IP "124.108.96.0/20";
-  next if $relayip->within($yahoo10);
- #IP-Network                    76.13.0.0/16
-  my $yahoo11 = new NetAddr::IP "76.13.0.0/16";
-  next if $relayip->within($yahoo11);
- #IP-Network                    68.180.128.0/17
-  my $yahoo12 = new NetAddr::IP "68.180.128.0/17";
-  next if $relayip->within($yahoo12);
- #IP-Network                    209.191.64.0/18
-  my $yahoo13 = new NetAddr::IP "209.191.64.0/18";
-  next if $relayip->within($yahoo13);
- #route:          212.82.104.0/21
-  my $yahoo14 = new NetAddr::IP "212.82.104.0/21";
-  next if $relayip->within($yahoo14);
- #IP-Network                    66.163.160.0/19
-  my $yahoo15 = new NetAddr::IP "66.163.160.0/19";
-  next if $relayip->within($yahoo15);
-
-# and the same goes for Bell Canada.  *le sigh*
-#IP-Network                    209.226.0.0/16
-#IP-Network-Block              209.226.000.000 - 209.226.255.255
-#Org-Name                      Bell Canada
-#Received: from tomts35-srv.bellnexxia.net (tomts35.bellnexxia.net
-# [209.226.175.109]) by mx2.company.com (Postfix) with ESMTP id B415C16752D for
-# <user@compnay.com>; Sat,  4 Jul 2009 10:48:24 -0400 (EDT)
-# hmm.  tomts\d(-srv)?.bellnexxia.net only seem to be in .175/24.  we'll just drop those ones for now...
-# especially since there appear to be hosted customers etc in the same ARIN allocation above.
-  my $bell1 = new NetAddr::IP "209.226.175.0/24";
-  next if $relayip->within($bell1);
-  #IP-Network                    207.236.0.0/16
-  # only listing a subsection - rDNS hosts look like Bell SMTP hardware
-  my $bell2 = new NetAddr::IP "207.236.237.0/26";
-  next if $relayip->within($bell2);
-
-# ... and your little dog too!
-#IP-Network                    208.82.236.0/22
-#IP-Network-Block              208.082.236.000 - 208.082.239.255
-#Org-Name                      Craigslist, Inc.
-  my $craigslist1 = new NetAddr::IP "208.82.236.0/22";
-  next if $relayip->within($craigslist1);
-
-# not gonna whitelist the whole enchilada... just the asmtpout0(11-30).mac.com
-# 17.148.16	011 -> 86	030 -> 105
-#IP-Network                    17.0.0.0/8
-#IP-Network-Block              017.000.000.000 - 017.255.255.255
-#Org-Name                      Apple Computer, Inc.
-  my $apple1 = new NetAddr::IP "17.148.16.64/26";
-  next if $relayip->within($apple1);
-
-# and Vodafone...
-# 212.183.156.227  (.227 through .230 have server rdns)
-#route:        212.183.128.0/19
-#descr:        Vodafone UK
-#inetnum:        212.183.156.0 - 212.183.156.255
-#descr:          Vodafone Limited
-  my $voda1 = new NetAddr::IP "212.183.156.224/29";
-  next if $relayip->within($voda1);
-
-# ooohhh, Eastlink wants to join the party
-#24.222.0.30
-#IP-Network                    24.222.0.0/16
-#IP-Network-Block              024.222.000.000 - 024.222.255.255
-#Org-Name                      Bragg Communications Incorporated
-  my $eastlink1 = new NetAddr::IP "24.222.0.30";
-  next if $relayip->within($eastlink1);
-
-# and now Cogeco
-#216.221.81.192
-#IP-Network                    216.221.64.0/19
-#IP-Network-Block              216.221.064.000 - 216.221.095.255
-#Org-Name                      Cogeco Telecom
-# only ignoring systems-looking IPs or blocks with mostly systems-looking IPs
-  my $cogeco1 = new NetAddr::IP "216.221.81.192";
-  next if $relayip->within($cogeco1);
-  my $cogeco2 = new NetAddr::IP "216.221.81.96/30";
-  next if $relayip->within($cogeco2);
-
-# and UAlberta
-#129.128.5.19
-#IP-Network                    129.128.0.0/16
-#IP-Network-Block              129.128.000.000 - 129.128.255.255
-#Org-Name                      University of Alberta
-  my $ualberta1 = new NetAddr::IP "129.128.5.19";
-  next if $relayip->within($ualberta1);
+  my $relayip_str = @untrusted ? $untrusted[0]->{ip} : undef;   # no untrusted relay => nothing to list
+  my $relayip = defined $relayip_str && NetAddr::IP->new($relayip_str) or next MSG;  # ->within() on undef would die
+  foreach my $block (@dontlistme) {
+    next MSG if $relayip->within($block);
+  }
 
   $iplist{$relayip->addr}++ if $relayip;
-#  print "$recv\n";
-#  print "$relayip\n\n";
-#  print $imap->get_header($msgs[$i], "From"); print "\n";
-
-
-#  last if $i > 15;
+
+#  last if $i > 2;
   sleep 1;
 } # IMAP message iteration
