source: trunk/dnsbl/extract-data@ 21

Last change on this file since 21 was 21, checked in by Kris Deugau, 14 years ago

/trunk/dnsbl

Update extract-data with current live skiplist

  • Property svn:executable set to *
  • Property svn:keywords set to Date Rev Author Id
File size: 13.1 KB
Line 
1#!/usr/bin/perl
2# Extract relay IP, URI hosts from mail in folder
3# Hack-n-patch from:
4# Heavily reworked from original:
5# dmz@dmzs.com - March 19, 2004
6# http://www.dmzs.com/tools/files/spam.phtml
7# LGPL
8# by:
9# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
10# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
11##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
12##wrapver 0.1
13##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
14# $Id: extract-data 21 2010-08-30 18:16:14Z kdeugau $
15# $URL$
16
17use strict;
18use warnings;
19use Time::Local;
20use POSIX qw(strftime);
21use IO::Socket::SSL;
22use Mail::IMAPClient;
23use Getopt::Std;
24use NetAddr::IP;
25
26# hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
27# but we did find enough to extract the URIs...
28use lib '/opt/spamassassin/share/perl/5.10.0';
29use Mail::SpamAssassin;
30use Mail::SpamAssassin::PerMsgStatus;
31
32my %opts;
33getopts("druv", \%opts);
34
35my $debug = ($opts{d} ? 1 : 0);
36my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
37my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
38my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
39 # - note this makes it rather harder to deep-scan the messages to create local rules
40my $verbose = ($opts{v} ? 1 : 0);
41
42my $tmpdir = '/var/tmp';
43my $salearn = '/usr/local/bin/sa-learn';
44my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
45
46die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
47
48my $folder = 'reported.needsextraction';
49#my $folder = 'confirmed';
50
51# non-SSL IMAP settings:
52#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
53# User => 'imapspamuser',
54# Password => 'imapspamuserpassword',
55# Debug => $imapdebug);
56
57my $imap = Mail::IMAPClient->new(
58 User => 'junkmail',
59 Password => 'k3c86z2',
60 Socket => IO::Socket::SSL->new(
61 Proto => 'tcp',
62 PeerAddr => 'mail.company.com',
63 PeerPort => 993, # IMAP over SSL standard port
64 ),
65 Debug => $imapdebug,
66 )
67
68 or die "IMAP login failed: $@\n";
69
70if (!defined($imap)) { die "IMAP Login Failed"; }
71
72my $msgcount = $imap->message_count($folder);
73
74# If debugging, print out the total counts for each mailbox
75print $msgcount, " message(s) to process\n" if $debug;
76
77## Process the spam mailbox
78$imap->select($folder);
79my @msgs = $imap->search("ALL");
80
81# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
82#my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
83#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
84
85my $spamtest = Mail::SpamAssassin->new();
86 # don't keep dereferencing this
87
88my %iplist;
89my %urilist;
90
91# put together an array of netblocks we won't/can't list for various reasons
92my @dontlistme = (
93 # Hotmail/Windows Live Mail
94#IP-Network 207.68.128.0/18
95#IP-Network 207.68.192.0/20
96#IP-Network-Block 207.068.128.000 - 207.068.207.255
97 NetAddr::IP->new("65.52.0.0/14"),
98 NetAddr::IP->new("207.68.176.96/27"),
99
100 # AOL - note only some IPs show mail-ish rDNS
101 #IP-Network 205.188.0.0/16
102 #IP-Network 64.12.0.0/16
103 NetAddr::IP->new("205.188.91.96/29"),
104 NetAddr::IP->new("205.188.105.143"),
105 NetAddr::IP->new("205.188.105.144/30"),
106 NetAddr::IP->new("205.188.169.196/29"),
107 NetAddr::IP->new("205.188.249.128/29"),
108 NetAddr::IP->new("205.188.249.64/29"),
109 NetAddr::IP->new("64.12.78.136/30"),
110 NetAddr::IP->new("64.12.78.142"),
111 NetAddr::IP->new("64.12.143.144/30"),
112 NetAddr::IP->new("64.12.143.152/30"),
113 NetAddr::IP->new("64.12.206.39"),
114 NetAddr::IP->new("64.12.206.40/30"),
115 NetAddr::IP->new("64.12.207.128/27"),
116 NetAddr::IP->new("64.12.207.144/29"),
117 NetAddr::IP->new("64.12.207.152/30"),
118 NetAddr::IP->new("64.12.207.160/28"),
119 NetAddr::IP->new("64.12.207.176/29"),
120 #IP-Network 205.188.0.0/16
121 NetAddr::IP->new("205.188.169.200/23"),
122
123 # Google/GMail
124 NetAddr::IP->new("209.85.128.0/17"),
125 NetAddr::IP->new("72.14.192.0/18"),
126 NetAddr::IP->new("74.125.0.0/16"),
127
128 # Yahoo!/Inktomi
129 NetAddr::IP->new("98.136.0.0/14"),
130 NetAddr::IP->new("66.196.64.0/18"),
131 NetAddr::IP->new("67.195.0.0/16"),
132 NetAddr::IP->new("69.147.64.0/18"),
133 NetAddr::IP->new("206.190.32.0/18"),
134 NetAddr::IP->new("68.142.192.0/18"),
135 NetAddr::IP->new("216.252.96.0/19"),
136 NetAddr::IP->new("124.83.128.0/17"),
137 NetAddr::IP->new("217.146.184.0/21"),
138 NetAddr::IP->new("124.108.96.0/20"),
139 NetAddr::IP->new("76.13.0.0/16"),
140 NetAddr::IP->new("68.180.128.0/17"),
141 NetAddr::IP->new("209.191.64.0/18"),
142 NetAddr::IP->new("212.82.104.0/21"),
143 NetAddr::IP->new("66.163.160.0/19"),
144#inetnum: 87.248.110.0 - 87.248.111.255
145#route: 87.248.104.0/21
146 NetAddr::IP->new("87.248.110.0/23"),
147 NetAddr::IP->new("203.188.200.0/22"),
148 NetAddr::IP->new("217.12.0.0/20"),
149 NetAddr::IP->new("77.238.184.0/23"),
150#IP-Network 74.6.0.0/16
151 NetAddr::IP->new("74.6.114.48/24"),
152 NetAddr::IP->new("74.6.228.32/27"),
153 NetAddr::IP->new("74.6.228.64/26"),
154 NetAddr::IP->new("202.165.96.0/21"),
155#route: 87.248.112.0/21
156#inetnum: 87.248.114.0 - 87.248.115.255
157 NetAddr::IP->new("87.248.114.0/24"),
158#route: 116.214.0.0/20
159 NetAddr::IP->new("116.214.12.0/24"),
160
161 # MessageLabs - may add these to trusted_networks instead
162 NetAddr::IP->new("85.158.139.0/24"),
163# observed: .35, .51
164 NetAddr::IP->new("194.106.220.0/23"),
165
166 # Bell Canada - note only some IPs show mail-ish rDNS
167 #IP-Network 209.226.0.0/16
168 #IP-Network 207.236.0.0/16
169 NetAddr::IP->new("209.226.175.0/24"),
170 NetAddr::IP->new("207.236.237.0/26"),
171
172 # Craigslist
173 #IP-Network 208.82.236.0/22
174 NetAddr::IP->new("208.82.236.0/22"),
175
176 # Apple.com/mac.com - note only some IPs show mail-ish rDNS
177 #IP-Network 17.0.0.0/8
178 # asmtpout0(11-30).mac.com
179 # 17.148.16. 011 -> 86 030 -> 105
180 NetAddr::IP->new("17.148.16.64/26"),
181
182 # Vodafone - note only some IPs show mail-ish rDNS
183 #route: 212.183.128.0/19
184 NetAddr::IP->new("212.183.156.224/29"),
185
186 # Facebook - only exclude mail-ish hostnames
187 #IP-Network 69.63.176.0/20
188 # .178.160/27 plus a few more to 199 show mailish rDNS
189 NetAddr::IP->new("69.63.178.160/27"),
190 NetAddr::IP->new("69.63.178.192/29"),
191 #IP-Network 66.220.144.0/20
192 NetAddr::IP->new("66.220.144.128/26"),
193
194 # IBM Lotus Live - rdns mostly mail
195 #CIDR: 8.12.152.0/24
196 NetAddr::IP->new("8.12.152.0/24"),
197
198# ISPs
199 # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
200 #IP-Network 24.222.0.0/16
201 NetAddr::IP->new("24.222.0.30"),
202 NetAddr::IP->new("24.224.136.8/30"),
203
204 # Cogeco - only a few IPs observed with mail-ish rDNS
205 #IP-Network 216.221.64.0/19
206 NetAddr::IP->new("216.221.81.192"),
207 NetAddr::IP->new("216.221.81.28/30"),
208 NetAddr::IP->new("216.221.81.96/30"),
209 # seznam.cz
210 #route: 77.75.72.0/23
211 NetAddr::IP->new("77.75.72.44"),
212
213 # T-Online - only mail-ish hostnames
214#route: 194.25.0.0/16
215#inetnum: 194.25.134.0 - 194.25.134.255
216#netname: DTOS-ULM-001
217#descr: www.t-online.de
218#descr: mail.t-online.de
219#descr: php.t-online.de
220 NetAddr::IP->new("194.25.134.80/29"),
221
222 # Telestra (Australia)
223 #inetnum: 61.9.128.0 - 61.9.255.255
224 # may miss one or two
225 NetAddr::IP->new("61.9.168.136/29"),
226 NetAddr::IP->new("61.9.168.144/29"),
227
228## Edumactional places
229
230 # UAlberta - only one IP observed with mail-ish rDNS
231 #IP-Network 129.128.0.0/16
232 NetAddr::IP->new("129.128.5.19"),
233
234 # Queens University
235 #IP-Network 130.15.0.0/16
236 NetAddr::IP->new("130.15.241.183"),
237
238 # University of Maryland Baltimore County - mail servers only
239 #CIDR: 130.85.0.0/16
240 NetAddr::IP->new("130.85.25.76/30"),
241
242 # University of Florida
243 #IP-Network 128.227.0.0/16
244 NetAddr::IP->new("128.227.74.149"),
245 NetAddr::IP->new("128.227.74.165"),
246
247 # University of Texas
248 #IP-Network 129.109.0.0/16
249 NetAddr::IP->new("129.109.195.0/28"),
250
251 # Roxbury Community College
252 #IP-Network 209.104.233.0/24
253 NetAddr::IP->new("209.104.233.247"),
254
255 # Stanford University
256 #IP-Network 171.64.0.0/14
257 NetAddr::IP->new("171.67.219.80/30"),
258
259 # Cleveland State University
260 #IP-Network 137.148.0.0/16
261 NetAddr::IP->new("137.148.18.13/32"),
262
263 # University of Missouri - dba the Missouri Research and Education Network (MOREnet)
264 #CIDR: 204.184.0.0/15
265 NetAddr::IP->new("204.185.165.125"),
266
267 # Yale University
268 #IP-Network 130.132.0.0/16
269 NetAddr::IP->new("130.132.50.7"),
270
271 # Rutgers University
272 #IP-Network 165.230.0.0/16
273 NetAddr::IP->new("165.230.151.182"),
274
275 # Clemson University
276 #IP-Network 130.127.0.0/16
277 NetAddr::IP->new("130.127.235.21"),
278
279 # Wilkes University
280 #IP-Network 146.94.0.0/16
281 NetAddr::IP->new("146.94.192.152"),
282
283 # Virginia Community College System
284 #IP-Network 164.106.0.0/16
285 NetAddr::IP->new("164.106.130.251"),
286
287 # University of Virginia
288 #IP-Network 199.111.0.0/16
289 # Mary Washington College
290 #IP-Network 199.111.64.0/19
291 NetAddr::IP->new("199.111.84.18"),
292
293# List servers
294 # Debian listserver - relays spam, so we can't list it.
295 NetAddr::IP->new("82.195.75.100"),
296
297# ESPs
298 # moderately abusable, but mostly legit
299 # Org-Name iContact (Broadwick Corp./Preation Inc.)
300 NetAddr::IP->new("216.27.93.0/25"),
301
302 # are these guys an ESP? rDNS in many blocks shows secureserver.net, with SMTPish overtones
303 #Org-Name GoDaddy.com, Inc.
304 #IP-Network 64.202.160.0/19
305 NetAddr::IP->new("64.202.160.0/24"),
306 NetAddr::IP->new("64.202.165.0/26"),
307 NetAddr::IP->new("64.202.165.180/30"),
308 NetAddr::IP->new("64.202.165.192/29"),
309 NetAddr::IP->new("64.202.165.224/28"),
310 #IP-Network 72.167.0.0/16
311 NetAddr::IP->new("72.167.82.80/29"),
312 #IP-Network 173.201.0.0/16
313 NetAddr::IP->new("173.201.192.0/24"),
314
315 ); # done def for @dontlistme
316
317MSG: for (my $i=0; $i<$msgcount; $i++) {
318 my $msg = $imap->message_string($msgs[$i]);
319
320 my $mail = $spamtest->parse($msg);
321 my $status = $spamtest->check($mail);
322
323# stolen from SA plugin bits
324#sub parsed_metadata {
325# my ($self, $opts) = @_;
326# my $scanner = $status->{permsgstatus};
327
328 my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
329
330 # Generate the full list of html-parsed domains.
331 my $uris = $status->get_uri_detail_list();
332
333# print "$msgs[$i]\n";
334 my %msguris;
335
336 while (my($uri, $info) = each %{$uris}) {
337 # we want to skip mailto: uris
338 next if ($uri =~ /^mailto:/);
339
340 # no domains were found via this uri, so skip
341 next unless ($info->{domains});
342
343 next if $info->{types}->{img};
344 foreach ( keys %{ $info->{domains} } ) {
345 if (exists $skip_domains->{$_}) {
346 next;
347 }
348 $msguris{$_}++;
349 #print " ".$_."\n";
350 }
351
352 # print "$uri ".$info->{anchor_text}."\n";
353 }
354
355 foreach (keys %msguris) {
356# print " $_ $msguris{$_}\n";
357 $urilist{$_}++;
358 }
359
360 # now, get the relay IP
361
362 my $stmsg = $status->get_message();
363 my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
364
365 my $relayip = new NetAddr::IP $untrusted[0]->{ip};
366
367# sigh. messages tagged as spam already make life difficult by hiding the
368# old Received: headers. We'll just handle them manually for now.
369 if (!$relayip) {
370 print "phtui: ";
371 my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
372 my $recvnum = 0;
373 my $recv = $headerlist{'Received'}[$recvnum];
374 print "$recv\n";
375 print " $headerlist{'Subject'}[0]\n";
376 next MSG;
377 }
378
379 foreach my $block (@dontlistme) {
380 next MSG if $relayip->within($block);
381 }
382
383 $iplist{$relayip->addr}++ if $relayip;
384
385# last if $i > 2;
386 sleep 1;
387} # IMAP message iteration
388
389# mm. don't really need times on the IP lists
390#if ($opts{r}) {
391# print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
392#}
393foreach my $ip (sort keys %iplist) {
394 $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
395 if ($opts{r}) {
396 print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
397 } else {
398 print "$ip\t $iplist{$ip}\n";
399 }
400}
401
402if ($opts{u}) {
403 print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
404}
405foreach my $uri (sort keys %urilist) {
406 my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.uribl.company.com' };
407 if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
408 if ($opts{u}) {
409 print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
410 } else {
411 # URI plus count
412 print "$uri\t$urilist{$uri}\n";
413 }
414 }
415}
416
417$imap->close();
418
419# Close IMAP connection cleanly.
420$imap->logout();
421
422# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
423# irrelevant for SQL Bayes
424#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
425#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
Note: See TracBrowser for help on using the repository browser.