source: trunk/dnsbl/extract-data@ 12

Last change on this file since 12 was 12, checked in by Kris Deugau, 14 years ago

/trunk/dnsbl

Update extract-data with changes from live version

  • Property svn:executable set to *
  • Property svn:keywords set to Date Rev Author Id
File size: 14.7 KB
Line 
1#!/usr/bin/perl
2# Extract relay IP, URI hosts from mail in folder
3# Hack-n-patch from:
4# Heavily reworked from original:
5# dmz@dmzs.com - March 19, 2004
6# http://www.dmzs.com/tools/files/spam.phtml
7# LGPL
8# by:
9# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
10# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
11##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
12##wrapver 0.1
13##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
14# $Id: extract-data 12 2010-01-12 16:20:24Z kdeugau $
15# $URL$
16
17use strict;
18use warnings;
19use Time::Local;
20use POSIX qw(strftime);
21use IO::Socket::SSL;
22use Mail::IMAPClient;
23use Getopt::Std;
24use NetAddr::IP;
25
26# hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
27# but we did find enough to extract the URIs...
28use lib '/opt/spamassassin/share/perl/5.8.8';
29use Mail::SpamAssassin;
30use Mail::SpamAssassin::PerMsgStatus;
31
32my %opts;
33getopts("druv", \%opts);
34
35my $debug = ($opts{d} ? 1 : 0);
36my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
37my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
38my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
39 # - note this makes it rather harder to deep-scan the messages to create local rules
40my $verbose = ($opts{v} ? 1 : 0);
41
42my $tmpdir = '/var/tmp';
43my $salearn = '/usr/local/bin/sa-learn';
44my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
45
46die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
47
48my $folder = 'reported.needsextraction';
49#my $folder = 'confirmed';
50
51# non-SSL IMAP settings:
52#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
53# User => 'imapspamuser',
54# Password => 'imapspamuserpassword',
55# Debug => $imapdebug);
56
57my $imap = Mail::IMAPClient->new(
58 User => 'junkmail',
59 Password => 'k3c86z2',
60 Socket => IO::Socket::SSL->new(
61 Proto => 'tcp',
62 PeerAddr => 'mail.company.com',
63 PeerPort => 993, # IMAP over SSL standard port
64 ),
65 Debug => $imapdebug,
66 );
67
68if (!defined($imap)) { die "IMAP Login Failed"; }
69
70my $msgcount = $imap->message_count($folder);
71
72# If debugging, print out the total counts for each mailbox
73print $msgcount, " message(s) to process\n" if $debug;
74
75## Process the spam mailbox
76$imap->select($folder);
77my @msgs = $imap->search("ALL");
78
79# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
80#my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
81#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
82
83my $spamtest = Mail::SpamAssassin->new();
84 # don't keep dereferencing this
85
86my %iplist;
87my %urilist;
88
89for (my $i=0; $i<$msgcount; $i++) {
90 my $msg = $imap->message_string($msgs[$i]);
91
92 my $mail = $spamtest->parse($msg);
93 my $status = $spamtest->check($mail);
94
95# stolen from SA plugin bits
96#sub parsed_metadata {
97# my ($self, $opts) = @_;
98# my $scanner = $status->{permsgstatus};
99
100 my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
101
102 # Generate the full list of html-parsed domains.
103 my $uris = $status->get_uri_detail_list();
104
105# print "$msgs[$i]\n";
106 my %msguris;
107
108 while (my($uri, $info) = each %{$uris}) {
109 # we want to skip mailto: uris
110 next if ($uri =~ /^mailto:/);
111
112 # no domains were found via this uri, so skip
113 next unless ($info->{domains});
114
115 next if $info->{types}->{img};
116 foreach ( keys %{ $info->{domains} } ) {
117 if (exists $skip_domains->{$_}) {
118 next;
119 }
120 $msguris{$_}++;
121 #print " ".$_."\n";
122 }
123
124 # print "$uri ".$info->{anchor_text}."\n";
125 }
126
127 foreach (keys %msguris) {
128# print " $_ $msguris{$_}\n";
129 $urilist{$_}++;
130 }
131
132 # now, get the relay IP
133
134 my $stmsg = $status->get_message();
135 my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
136
137 my $sa_intip = new NetAddr::IP $untrusted[0]->{ip};
138
139
140 my %headerlist = %{$imap->parse_headers($msgs[$i], "Received")};
141 my $recvnum = 0;
142 my $recv = $headerlist{'Received'}[$recvnum];
143 next if !$recv;
144
145 my $relayip;
146
147 #Received: from mail.company.com [ip.add.re.ss]
148 # by localhost with POP3 (fetchmail-6.2.5)
149 # for kdeugau@localhost (single-drop); Fri, 15 May 2009 11:45:10 -0400 (EDT)
150 if ($recv =~ /from mail\.company\.com \[ip\.add\.re\.ss\]\s*by localhost with POP3 \(fetchmail/) {
151 $recvnum += 1;
152 $recv = $headerlist{'Received'}[$recvnum];
153 }
154
155 if ($recv =~ /^by mx\d\.company\.com \(Postfix, from userid \d+\)/) {
156 $recvnum++;
157 $recv = $headerlist{'Received'}[$recvnum];
158 }
159
160# le sigh. gotta bypass a message if we can't parse the headers. Outlook
161# does an admirable job of mangling things for us. >:(
162 if ($recv !~ /by mx\d\.company\.com \(Postfix\)/) {
163 print "phtui: $recv\n";
164 next;
165 }
166
167##fixme
168# le sigh. skip IP extraction on tagged spam reported as nonspam, since the real spam is a layer deeper.
169next if $recv =~ /from localhost by mfs\d with SpamAssassin/;
170
171# Postini puts the "real" received: header one layer further out - SA is configured to compensate for this so we do too
172 #IP-Network 64.18.0.0/20
173 #IP-Network-Block 064.018.000.000 - 064.018.015.255
174 #Org-Name Postini, Inc.
175 if ($recv =~ /\[64\.18\.(?:[0-9]|1[0-5])\.\d+]\) by mx\d\.company\.com/) {
176 $recv = $recv = $headerlist{'Received'}[++$recvnum];
177#Received: from source ([208.95.48.65]) (using TLSv1) by
178# exprod5mx230.postini.com ([64.18.4.10]) with SMTP; Fri, 10 Jul
179 my ($tmprelayip) = ($recv =~ /from source \(\[([\d.]+)\]\) (?:\(using TLSv1\) )?by (?:exprod\dm[xo]b?|chipmx)\d+\.postini\.com/);
180 $relayip = new NetAddr::IP $tmprelayip;
181
182 } elsif ($recv =~ /\[137\.82\.45\.(?:[0-9]|1[0-5])\]\) by mx\d\.company\.com/) {
183# Customer with (spam)forwarding from UBC - enough to justify this code
184 #IP-Network 137.82.0.0/16
185 #IP-Network-Block 137.082.000.000 - 137.082.255.255
186 #Org-Name University of British Columbia
187 # only 137.82.45.0/28 or so seem to be outbound relays (duh)
188 $recv = $recv = $headerlist{'Received'}[++$recvnum];
189#Received: from bcnbib.gov.ar (200-42-22-14.dup.prima.net.ar [200.42.22.14])
190# by mr4.mail-relay.ubc.ca (Postfix)
191 my ($tmprelayip) = ($recv =~ /from \[?[a-zA-Z0-9._-]+\]? \([a-zA-Z0-9._-]+ \[([\d+.]+)\]\) by mr\d\.mail-relay\.ubc\.ca \(Postfix\)/);
192 $relayip = new NetAddr::IP $tmprelayip;
193
194 } else {
195 my ($tmprelayip) = ($recv =~ /\[([\d+.]+)\]\) by mx\d\.company\.com/);
196 $relayip = new NetAddr::IP $tmprelayip;
197 }
198
199print "eep, no ip from manual extraction\n$recv\n" if !$relayip;
200print "SA vs manual extraction, relay IP mismatch: $sa_intip vs $relayip on\n\t$recv\n" if $sa_intip != $relayip;
201
202# Hotmail/Windows Live Mail may originate or relay spam, but we can't blacklist them
203 #Received: from blu0-omc4-s23.blu0.hotmail.com (blu0-omc4-s23.blu0.hotmail.com
204 # [65.55.111.162]) by mx2.company.com (Postfix)
205# next if $recv =~ /from (?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com
206\((?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \[65.5[2345].\d+\.\d+\]\) by mx\d\.company\.com/;
207 #IP-Network 65.52.0.0/14
208 #IP-Network-Block 065.052.000.000 - 065.055.255.255
209 #Org-Name Microsoft Corp
210 my $hotmail1 = new NetAddr::IP "65.52.0.0/14";
211print "$.: $recv\n" if !defined ($relayip);
212 next if $relayip->within($hotmail1);
213
214# AOL may originate or relay spam, but we can't blacklist them
215 #Received: from omr-m33.mx.aol.com (omr-m33.mx.aol.com [64.12.143.145]) by
216 # mx1.company.com (Postfix) with ESMTP id 7B9431C3255 for <webmaster@tyenet.com>;
217 next if $recv =~ /from (?:omr|im[or])-[dm][ab]?\d+\.mx\.aol\.com \((?:omr|im[or])-[dm][ba]?\d+\.mx\.aol\.com \[[\d.]+\]\) by mx\d\.company\.com/;
218
219# Google may relay spam, GMail may originate it, but we can't blacklist them.
220 #IP-Network 209.85.128.0/17
221 #IP-Network-Block 209.085.128.000 - 209.085.255.255
222 #Org-Name Google Inc.
223 next if $recv =~ /\[209\.85\.(?:1(?:2[89]|[3-9]\d)|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
224 #OrgName: Google Inc.
225 #NetRange: 72.14.192.0 - 72.14.255.255
226 #CIDR: 72.14.192.0/18
227 next if $recv =~ /\[72\.14\.(?:19[2-9]|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
228
229# Yahoo! may ... yadda yadda yadda (geeze they've got a whack of netblocks for mail...)
230 #IP-Network 98.136.0.0/14
231 my $yahoo1 = new NetAddr::IP "98.136.0.0/14";
232 next if $relayip->within($yahoo1);
233 #IP-Network 66.196.64.0/18
234 #Org-Name Inktomi Corporation
235 # Inktomi ~~ Yahoo!
236 my $yahoo2 = new NetAddr::IP "66.196.64.0/18";
237 next if $relayip->within($yahoo2);
238 #IP-Network 67.195.0.0/16
239 my $yahoo3 = new NetAddr::IP "67.195.0.0/16";
240 next if $relayip->within($yahoo3);
241 #IP-Network 69.147.64.0/18
242 my $yahoo4 = new NetAddr::IP "69.147.64.0/18";
243 next if $relayip->within($yahoo4);
244 #IP-Network 206.190.32.0/19
245 #Org-Name Yahoo! Broadcast Services, Inc.
246 my $yahoo5 = new NetAddr::IP "206.190.32.0/18";
247 next if $relayip->within($yahoo5);
248 #IP-Network 68.142.192.0/18
249 #Org-Name Inktomi Corporation
250 my $yahoo6 = new NetAddr::IP "68.142.192.0/18";
251 next if $relayip->within($yahoo6);
252 #IP-Network 216.252.96.0/19
253 my $yahoo7 = new NetAddr::IP "216.252.96.0/19";
254 next if $relayip->within($yahoo7);
255 #inetnum: 124.83.128.0 - 124.83.255.255
256 my $yahoo8 = new NetAddr::IP "124.83.128.0/17";
257 next if $relayip->within($yahoo8);
258 #inetnum: 217.146.184.0 - 217.146.191.47
259 my $yahoo9 = new NetAddr::IP "217.146.184.0/21";
260 next if $relayip->within($yahoo9);
261 #inetnum: 124.108.96.0 - 124.108.111.255
262 my $yahoo10 = new NetAddr::IP "124.108.96.0/20";
263 next if $relayip->within($yahoo10);
264 #IP-Network 76.13.0.0/16
265 my $yahoo11 = new NetAddr::IP "76.13.0.0/16";
266 next if $relayip->within($yahoo11);
267 #IP-Network 68.180.128.0/17
268 my $yahoo12 = new NetAddr::IP "68.180.128.0/17";
269 next if $relayip->within($yahoo12);
270 #IP-Network 209.191.64.0/18
271 my $yahoo13 = new NetAddr::IP "209.191.64.0/18";
272 next if $relayip->within($yahoo13);
273 #route: 212.82.104.0/21
274 my $yahoo14 = new NetAddr::IP "212.82.104.0/21";
275 next if $relayip->within($yahoo14);
276 #IP-Network 66.163.160.0/19
277 my $yahoo15 = new NetAddr::IP "66.163.160.0/19";
278 next if $relayip->within($yahoo15);
279
280# and the same goes for Bell Canada. *le sigh*
281#IP-Network 209.226.0.0/16
282#IP-Network-Block 209.226.000.000 - 209.226.255.255
283#Org-Name Bell Canada
284#Received: from tomts35-srv.bellnexxia.net (tomts35.bellnexxia.net
285# [209.226.175.109]) by mx2.company.com (Postfix) with ESMTP id B415C16752D for
286# <user@compnay.com>; Sat, 4 Jul 2009 10:48:24 -0400 (EDT)
287# hmm. tomts\d(-srv)?.bellnexxia.net only seem to be in .175/24. we'll just drop those ones for now...
288# especially since there appear to be hosted customers etc in the same ARIN allocation above.
289 my $bell1 = new NetAddr::IP "209.226.175.0/24";
290 next if $relayip->within($bell1);
291 #IP-Network 207.236.0.0/16
292 # only listing a subsection - rDNS hosts look like Bell SMTP hardware
293 my $bell2 = new NetAddr::IP "207.236.237.0/26";
294 next if $relayip->within($bell2);
295
296# ... and your little dog too!
297#IP-Network 208.82.236.0/22
298#IP-Network-Block 208.082.236.000 - 208.082.239.255
299#Org-Name Craigslist, Inc.
300 my $craigslist1 = new NetAddr::IP "208.82.236.0/22";
301 next if $relayip->within($craigslist1);
302
303# not gonna whitelist the whole enchilada... just the asmtpout0(11-30).mac.com
304# 17.148.16 011 -> 86 030 -> 105
305#IP-Network 17.0.0.0/8
306#IP-Network-Block 017.000.000.000 - 017.255.255.255
307#Org-Name Apple Computer, Inc.
308 my $apple1 = new NetAddr::IP "17.148.16.64/26";
309 next if $relayip->within($apple1);
310
311# and Vodafone...
312# 212.183.156.227 (.227 through .230 have server rdns)
313#route: 212.183.128.0/19
314#descr: Vodafone UK
315#inetnum: 212.183.156.0 - 212.183.156.255
316#descr: Vodafone Limited
317 my $voda1 = new NetAddr::IP "212.183.156.224/29";
318 next if $relayip->within($voda1);
319
320# ooohhh, Eastlink wants to join the party
321#24.222.0.30
322#IP-Network 24.222.0.0/16
323#IP-Network-Block 024.222.000.000 - 024.222.255.255
324#Org-Name Bragg Communications Incorporated
325 my $eastlink1 = new NetAddr::IP "24.222.0.30";
326 next if $relayip->within($eastlink1);
327
328# and now Cogeco
329#216.221.81.192
330#IP-Network 216.221.64.0/19
331#IP-Network-Block 216.221.064.000 - 216.221.095.255
332#Org-Name Cogeco Telecom
333# only ignoring systems-looking IPs or blocks with mostly systems-looking IPs
334 my $cogeco1 = new NetAddr::IP "216.221.81.192";
335 next if $relayip->within($cogeco1);
336 my $cogeco2 = new NetAddr::IP "216.221.81.96/30";
337 next if $relayip->within($cogeco2);
338
339# and UAlberta
340#129.128.5.19
341#IP-Network 129.128.0.0/16
342#IP-Network-Block 129.128.000.000 - 129.128.255.255
343#Org-Name University of Alberta
344 my $ualberta1 = new NetAddr::IP "129.128.5.19";
345 next if $relayip->within($ualberta1);
346
347 $iplist{$relayip->addr}++ if $relayip;
348# print "$recv\n";
349# print "$relayip\n\n";
350# print $imap->get_header($msgs[$i], "From"); print "\n";
351
352
353# last if $i > 15;
354 sleep 1;
355} # IMAP message iteration
356
357# mm. don't really need times on the IP lists
358#if ($opts{r}) {
359# print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
360#}
361foreach my $ip (sort keys %iplist) {
362 $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
363 if ($opts{r}) {
364 print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
365 } else {
366 print "$ip\t $iplist{$ip}\n";
367 }
368}
369
370if ($opts{u}) {
371 print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
372}
373foreach my $uri (sort keys %urilist) {
374 my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.uribl.company.com' };
375 if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
376 if ($opts{u}) {
377 print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
378 } else {
379 # URI plus count
380 print "$uri\t$urilist{$uri}\n";
381 }
382 }
383}
384
385$imap->close();
386
387# Close IMAP connection cleanly.
388$imap->logout();
389
390# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
391# irrelevant for SQL Bayes
392#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
393#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
Note: See TracBrowser for help on using the repository browser.