source: trunk/dnsbl/extract-data@ 32

Last change on this file since 32 was 30, checked in by Kris Deugau, 13 years ago

/trunk/dnsbl

Commit added entries in IP skiplist

  • Property svn:executable set to *
  • Property svn:keywords set to Date Rev Author Id
File size: 15.9 KB
Line 
1#!/usr/bin/perl
2# Extract relay IP, URI hosts from mail in folder
3# Hack-n-patch from:
4# Heavily reworked from original:
5# dmz@dmzs.com - March 19, 2004
6# http://www.dmzs.com/tools/files/spam.phtml
7# LGPL
8# by:
9# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
10# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
11##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
12##wrapver 0.1
13##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
14# $Id: extract-data 30 2011-01-25 22:10:03Z kdeugau $
15# $URL$
16
17use strict;
18use warnings;
19use Time::Local;
20use POSIX qw(strftime);
21use IO::Socket::SSL;
22use Mail::IMAPClient;
23use Getopt::Std;
24use NetAddr::IP;
25
26# hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
27# but we did find enough to extract the URIs...
28use lib '/opt/spamassassin/share/perl/5.10.0';
29use Mail::SpamAssassin;
30use Mail::SpamAssassin::PerMsgStatus;
31
32my %opts;
33getopts("druv", \%opts);
34
35my $debug = ($opts{d} ? 1 : 0);
36my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
37my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
38my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
39 # - note this makes it rather harder to deep-scan the messages to create local rules
40my $verbose = ($opts{v} ? 1 : 0);
41
42my $tmpdir = '/var/tmp';
43my $salearn = '/usr/local/bin/sa-learn';
44my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
45
46die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
47
48my $folder = 'reported.needsextraction';
49#my $folder = 'confirmed';
50
51# non-SSL IMAP settings:
52#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
53# User => 'imapspamuser',
54# Password => 'imapspamuserpassword',
55# Debug => $imapdebug);
56
57my $imap = Mail::IMAPClient->new(
58 User => 'junkmail',
59 Password => 'k3c86z2',
60 Socket => IO::Socket::SSL->new(
61 Proto => 'tcp',
62 PeerAddr => 'mail.company.com',
63 PeerPort => 993, # IMAP over SSL standard port
64 ),
65 Debug => $imapdebug,
66 )
67
68 or die "IMAP login failed: $@\n";
69
70if (!defined($imap)) { die "IMAP Login Failed"; }
71
72my $msgcount = $imap->message_count($folder);
73
74# If debugging, print out the total counts for each mailbox
75print $msgcount, " message(s) to process\n" if $debug;
76
77## Process the spam mailbox
78$imap->select($folder);
79my @msgs = $imap->search("ALL");
80
81# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
82#my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
83#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
84
85my $spamtest = Mail::SpamAssassin->new();
86 # don't keep dereferencing this
87
88my %iplist;
89my %urilist;
90
91# put together an array of netblocks we won't/can't list for various reasons
92my @dontlistme = (
93 # Microsoft/Hotmail/Windows Live Mail
94#IP-Network 207.68.128.0/18
95#IP-Network 207.68.192.0/20
96#IP-Network-Block 207.068.128.000 - 207.068.207.255
97 NetAddr::IP->new("65.52.0.0/14"),
98 NetAddr::IP->new("207.68.176.96/27"),
99#IP-Network 157.54.0.0/15
100#IP-Network 157.56.0.0/14
101#IP-Network 157.60.0.0/16
102 NetAddr::IP->new("157.55.0.192/27"),
103 NetAddr::IP->new("157.55.0.225"),
104 NetAddr::IP->new("157.55.0.226"),
105#IP-Network 207.46.0.0/16
106 NetAddr::IP->new("207.46.66.0/28"),
107
108 # AOL - note only some IPs show mail-ish rDNS
109 #IP-Network 205.188.0.0/16
110 #IP-Network 64.12.0.0/16
111 NetAddr::IP->new("64.12.78.136/30"),
112 NetAddr::IP->new("64.12.78.142"),
113 NetAddr::IP->new("64.12.100.31"),
114 NetAddr::IP->new("64.12.143.144/30"),
115 NetAddr::IP->new("64.12.143.152/30"),
116 NetAddr::IP->new("64.12.206.39"),
117 NetAddr::IP->new("64.12.206.40/30"),
118 NetAddr::IP->new("64.12.207.128/27"),
119 NetAddr::IP->new("64.12.207.144/29"),
120 NetAddr::IP->new("64.12.207.152/30"),
121 NetAddr::IP->new("64.12.207.160/28"),
122 NetAddr::IP->new("64.12.207.176/29"),
123 #IP-Network 205.188.0.0/16
124 NetAddr::IP->new("205.188.91.94/31"),
125 NetAddr::IP->new("205.188.91.96/31"),
126 NetAddr::IP->new("205.188.105.143"),
127 NetAddr::IP->new("205.188.105.144/30"),
128 NetAddr::IP->new("205.188.169.196/29"),
129 NetAddr::IP->new("205.188.249.128/29"),
130 NetAddr::IP->new("205.188.249.64/29"),
131 NetAddr::IP->new("205.188.169.200/23"),
132
133 # Google/GMail
134 NetAddr::IP->new("209.85.128.0/17"),
135 NetAddr::IP->new("72.14.192.0/18"),
136 NetAddr::IP->new("74.125.0.0/16"),
137
138 # Yahoo!/Inktomi
139 NetAddr::IP->new("98.136.0.0/14"),
140 NetAddr::IP->new("66.196.64.0/18"),
141 NetAddr::IP->new("67.195.0.0/16"),
142 NetAddr::IP->new("69.147.64.0/18"),
143 NetAddr::IP->new("206.190.32.0/18"),
144 NetAddr::IP->new("68.142.192.0/18"),
145 NetAddr::IP->new("216.252.96.0/19"),
146 NetAddr::IP->new("124.83.128.0/17"),
147 NetAddr::IP->new("217.146.184.0/21"),
148 NetAddr::IP->new("124.108.96.0/20"),
149 NetAddr::IP->new("76.13.0.0/16"),
150 NetAddr::IP->new("68.180.128.0/17"),
151 NetAddr::IP->new("209.191.64.0/18"),
152 NetAddr::IP->new("212.82.104.0/21"),
153 NetAddr::IP->new("66.163.160.0/19"),
154#inetnum: 87.248.110.0 - 87.248.111.255
155#route: 87.248.104.0/21
156 NetAddr::IP->new("87.248.110.0/23"),
157 NetAddr::IP->new("203.188.200.0/22"),
158 NetAddr::IP->new("217.12.0.0/20"),
159 NetAddr::IP->new("77.238.184.0/23"),
160#IP-Network 74.6.0.0/16
161 NetAddr::IP->new("74.6.114.48/24"),
162 NetAddr::IP->new("74.6.228.32/27"),
163 NetAddr::IP->new("74.6.228.64/26"),
164 NetAddr::IP->new("202.165.96.0/21"),
165#route: 87.248.112.0/21
166#inetnum: 87.248.114.0 - 87.248.115.255
167 NetAddr::IP->new("87.248.114.0/24"),
168#route: 116.214.0.0/20
169 NetAddr::IP->new("116.214.12.0/24"),
170 NetAddr::IP->new("202.86.4.0/22"),
171 NetAddr::IP->new("77.238.188/23"),
172 NetAddr::IP->new("217.146.182.0/23"),
173 NetAddr::IP->new("114.111.64.0/18"),
174 NetAddr::IP->new("115.178.12.0/23"),
175 NetAddr::IP->new("121.101.151.212"),
176#inetnum: 121.101.144.0 - 121.101.159.255
177 NetAddr::IP->new("121.101.144.0/20"),
178 NetAddr::IP->new("66.94.224.0/19"),
179 NetAddr::IP->new("203.104.16.0/21"),
180
181 # MessageLabs - may add these to trusted_networks instead
182 NetAddr::IP->new("85.158.139.0/24"),
183# observed: .35, .51
184 NetAddr::IP->new("194.106.220.0/23"),
185 NetAddr::IP->new("193.109.254.0/23"),
186 NetAddr::IP->new("119.161.0.0/19"),
187# buh? rDNS says Yahoo, but...
188#inetnum: 203.209.224.0 - 203.209.255.255
189#descr: Alibaba (Beijing) Technology Co., Ltd.
190 NetAddr::IP->new("203.209.230.22"),
191
192 # Bell Canada - note only some IPs show mail-ish rDNS
193 #IP-Network 209.226.0.0/16
194 #IP-Network 207.236.0.0/16
195 NetAddr::IP->new("209.226.175.0/24"),
196 NetAddr::IP->new("207.236.237.0/26"),
197
198 # Craigslist
199 #IP-Network 208.82.236.0/22
200 NetAddr::IP->new("208.82.236.0/22"),
201
202 # Apple.com/mac.com - note only some IPs show mail-ish rDNS
203 #IP-Network 17.0.0.0/8
204 # asmtpout0(11-30).mac.com
205 # 17.148.16. 011 -> 86 030 -> 105
206 NetAddr::IP->new("17.148.16.64/26"),
207
208 # Vodafone - note only some IPs show mail-ish rDNS
209 #route: 212.183.128.0/19
210 NetAddr::IP->new("212.183.156.224/29"),
211
212 # Facebook - only exclude mail-ish hostnames
213 #IP-Network 69.63.176.0/20
214 # .178.160/27 plus a few more to 199 show mailish rDNS
215 NetAddr::IP->new("69.63.178.160/27"),
216 NetAddr::IP->new("69.63.178.192/29"),
217 #IP-Network 66.220.144.0/20
218 NetAddr::IP->new("66.220.144.128/26"),
219
220 # IBM Lotus Live - rdns mostly mail
221 #CIDR: 8.12.152.0/24
222 NetAddr::IP->new("8.12.152.0/24"),
223
224# ISPs
225 # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
226 #IP-Network 24.222.0.0/16
227 NetAddr::IP->new("24.222.0.30"),
228 NetAddr::IP->new("24.224.136.0/27"),
229
230 # Cogeco - only a few IPs observed with mail-ish rDNS
231 #IP-Network 216.221.64.0/19
232 NetAddr::IP->new("216.221.81.192"),
233 NetAddr::IP->new("216.221.81.25"),
234 NetAddr::IP->new("216.221.81.28/30"),
235 NetAddr::IP->new("216.221.81.96/30"),
236 # seznam.cz
237 #route: 77.75.72.0/23
238 NetAddr::IP->new("77.75.72.44"),
239
240 # T-Online - only mail-ish hostnames
241#route: 194.25.0.0/16
242#inetnum: 194.25.134.0 - 194.25.134.255
243#netname: DTOS-ULM-001
244#descr: www.t-online.de
245#descr: mail.t-online.de
246#descr: php.t-online.de
247 NetAddr::IP->new("194.25.134.80/29"),
248
249 # Telestra (Australia)
250 #inetnum: 61.9.128.0 - 61.9.255.255
251 # may miss one or two
252 # .168.135-152
253 NetAddr::IP->new("61.9.168.136/29"),
254 NetAddr::IP->new("61.9.168.144/29"),
255 # .189.132-152
256 NetAddr::IP->new("61.9.189.132/30"),
257 NetAddr::IP->new("61.9.189.136/29"),
258 NetAddr::IP->new("61.9.189.144/29"),
259 # "TelestraClear" - related to above?
260 #inetnum: 203.97.0.0 - 203.97.127.255
261 NetAddr::IP->new("203.97.33.64"),
262 NetAddr::IP->new("203.97.33.68"),
263 NetAddr::IP->new("203.97.37.64"),
264
265 # Earthlink
266 #IP-Network 207.69.0.0/16
267 NetAddr::IP->new("207.69.200.28"),
268
269 # Sprint
270 #IP-Network 68.24.0.0/13
271 # actually their PCS mail relay, so sez rDNS
272 NetAddr::IP->new("68.28.27.84"),
273
274 # Vodafone (New Zealand)
275 # inetnum: 203.109.128.0 - 203.109.159.255
276 NetAddr::IP->new("203.109.136/28"),
277
278## Edumactional places
279
280 # UAlberta - only one IP observed with mail-ish rDNS
281 #IP-Network 129.128.0.0/16
282 NetAddr::IP->new("129.128.5.19"),
283
284 # Queens University
285 #IP-Network 130.15.0.0/16
286 NetAddr::IP->new("130.15.241.183"),
287
288 # University of Maryland Baltimore County - mail servers only
289 #CIDR: 130.85.0.0/16
290 NetAddr::IP->new("130.85.25.76/30"),
291
292 # University of Florida
293 #IP-Network 128.227.0.0/16
294 NetAddr::IP->new("128.227.74.70"),
295 NetAddr::IP->new("128.227.74.149"),
296 NetAddr::IP->new("128.227.74.165"),
297
298 # University of Texas
299 #IP-Network 129.109.0.0/16
300 NetAddr::IP->new("129.109.195.0/28"),
301
302 # Roxbury Community College
303 #IP-Network 209.104.233.0/24
304 NetAddr::IP->new("209.104.233.247"),
305
306 # Stanford University
307 #IP-Network 171.64.0.0/14
308 NetAddr::IP->new("171.67.219.80/30"),
309
310 # Cleveland State University
311 #IP-Network 137.148.0.0/16
312 NetAddr::IP->new("137.148.18.13/32"),
313
314 # University of Missouri - dba the Missouri Research and Education Network (MOREnet)
315 #CIDR: 204.184.0.0/15
316 NetAddr::IP->new("204.185.165.125"),
317
318 # Yale University
319 #IP-Network 130.132.0.0/16
320 NetAddr::IP->new("130.132.50.7"),
321
322 # Rutgers University
323 #IP-Network 165.230.0.0/16
324 NetAddr::IP->new("165.230.151.182"),
325
326 # Clemson University
327 #IP-Network 130.127.0.0/16
328 NetAddr::IP->new("130.127.235.21"),
329
330 # Wilkes University
331 #IP-Network 146.94.0.0/16
332 NetAddr::IP->new("146.94.192.152"),
333
334 # Virginia Community College System
335 #IP-Network 164.106.0.0/16
336 NetAddr::IP->new("164.106.130.251"),
337
338 # University of Virginia
339 #IP-Network 199.111.0.0/16
340 # Mary Washington College
341 #IP-Network 199.111.64.0/19
342 NetAddr::IP->new("199.111.84.18"),
343
344 #inetnum: 130.95.0.0 - 130.95.255.255
345 #address: The University of Western Australia
346 NetAddr::IP->new("130.95.3.211"),
347
348 #inetnum: 130.56.0.0 - 130.56.255.255
349 #address: IIS, Australian National University
350 NetAddr::IP->new("130.56.64.134"),
351
352 #IP-Network 129.81.0.0/16
353 #Org-Name Tulane University
354 NetAddr::IP->new("129.81.224.84"),
355
356# List servers
357 # Debian listserver - relays spam, so we can't list it.
358 NetAddr::IP->new("82.195.75.100"),
359
360# ESPs
361 # moderately abusable, but mostly legit
362 # Org-Name iContact (Broadwick Corp./Preation Inc.)
363 NetAddr::IP->new("216.27.93.0/25"),
364
365 # are these guys an ESP? rDNS in many blocks shows secureserver.net, with SMTPish overtones
366 #Org-Name GoDaddy.com, Inc.
367 #IP-Network 64.202.160.0/19
368 NetAddr::IP->new("64.202.160.0/24"),
369 NetAddr::IP->new("64.202.165.0/26"),
370 NetAddr::IP->new("64.202.165.180/30"),
371 NetAddr::IP->new("64.202.165.192/29"),
372 NetAddr::IP->new("64.202.165.224/28"),
373 #IP-Network 72.167.0.0/16
374 NetAddr::IP->new("72.167.82.80/29"),
375 NetAddr::IP->new("72.167.82.90"),
376 #IP-Network 173.201.0.0/16
377 NetAddr::IP->new("173.201.192.0/24"),
378
379 # EmailBrain - note no actual netblocks of their own. :(
380 NetAddr::IP->new("66.100.171.162"),
381#163.171.100.66.in-addr.arpa domain name pointer eb08.ebhost9.com.
382#164.171.100.66.in-addr.arpa domain name pointer a.eb08.ebhost9.com.
383#165.171.100.66.in-addr.arpa domain name pointer b.eb08.ebhost9.com.
384#166.171.100.66.in-addr.arpa domain name pointer c.eb08.ebhost9.com.
385
386
387# eBay/PayPal
388 NetAddr::IP->new("66.211.160.0/19"),
389
390# Misc legit
391 # Texas Instruments
392 #IP-Network 192.94.94.0/24
393 NetAddr::IP->new("192.94.94.40"),
394
395 ); # done def for @dontlistme
396
397MSG: for (my $i=0; $i<$msgcount; $i++) {
398 my $msg = $imap->message_string($msgs[$i]);
399
400 my $mail = $spamtest->parse($msg);
401 my $status = $spamtest->check($mail);
402
403# stolen from SA plugin bits
404#sub parsed_metadata {
405# my ($self, $opts) = @_;
406# my $scanner = $status->{permsgstatus};
407
408 my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
409
410 # Generate the full list of html-parsed domains.
411 my $uris = $status->get_uri_detail_list();
412
413# print "$msgs[$i]\n";
414 my %msguris;
415
416 while (my($uri, $info) = each %{$uris}) {
417 # we want to skip mailto: uris
418 next if ($uri =~ /^mailto:/);
419
420 # no domains were found via this uri, so skip
421 next unless ($info->{domains});
422
423 next if $info->{types}->{img};
424 foreach ( keys %{ $info->{domains} } ) {
425 if (exists $skip_domains->{$_}) {
426 next;
427 }
428 $msguris{$_}++;
429 #print " ".$_."\n";
430 }
431
432 # print "$uri ".$info->{anchor_text}."\n";
433 }
434
435 foreach (keys %msguris) {
436# print " $_ $msguris{$_}\n";
437 $urilist{$_}++;
438 }
439
440 # now, get the relay IP
441
442 my $stmsg = $status->get_message();
443 my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
444
445 my $relayip = new NetAddr::IP $untrusted[0]->{ip};
446
447# sigh. messages tagged as spam already make life difficult by hiding the
448# old Received: headers. We'll just handle them manually for now.
449 if (!$relayip) {
450 print "phtui: ";
451 my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
452 my $recvnum = 0;
453 my $recv = $headerlist{'Received'}[$recvnum];
454 print "$recv\n";
455 print " $headerlist{'Subject'}[0]\n";
456 next MSG;
457 }
458
459 foreach my $block (@dontlistme) {
460 next MSG if $relayip->within($block);
461 }
462
463 $iplist{$relayip->addr}++ if $relayip;
464
465# last if $i > 2;
466 sleep 1;
467} # IMAP message iteration
468
469# mm. don't really need times on the IP lists
470#if ($opts{r}) {
471# print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
472#}
473foreach my $ip (sort keys %iplist) {
474 $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
475 if ($opts{r}) {
476 print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
477 } else {
478 print "$ip\t $iplist{$ip}\n";
479 }
480}
481
482if ($opts{u}) {
483 print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
484}
485foreach my $uri (sort keys %urilist) {
486 my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.company.dnsbl.' };
487 if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
488 if ($opts{u}) {
489 print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
490 } else {
491 # URI plus count
492 print "$uri\t$urilist{$uri}\n";
493 }
494 }
495}
496
497$imap->close();
498
499# Close IMAP connection cleanly.
500$imap->logout();
501
502# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
503# irrelevant for SQL Bayes
504#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
505#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
Note: See TracBrowser for help on using the repository browser.