source: trunk/dnsbl/extract-data@ 18

Last change on this file since 18 was 18, checked in by Kris Deugau, 14 years ago

/trunk/dnsbl

Update extract-data based on live version

  • Property svn:executable set to *
  • Property svn:keywords set to Date Rev Author Id
File size: 9.6 KB
Line 
1#!/usr/bin/perl
2# Extract relay IP, URI hosts from mail in folder
3# Hack-n-patch from:
4# Heavily reworked from original:
5# dmz@dmzs.com - March 19, 2004
6# http://www.dmzs.com/tools/files/spam.phtml
7# LGPL
8# by:
9# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
10# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
11##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
12##wrapver 0.1
13##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
14# $Id: extract-data 18 2010-03-04 22:10:13Z kdeugau $
15# $URL$
16
17use strict;
18use warnings;
19use Time::Local;
20use POSIX qw(strftime);
21use IO::Socket::SSL;
22use Mail::IMAPClient;
23use Getopt::Std;
24use NetAddr::IP;
25
26# hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
27# but we did find enough to extract the URIs...
28use lib '/opt/spamassassin/share/perl/5.10.0';
29use Mail::SpamAssassin;
30use Mail::SpamAssassin::PerMsgStatus;
31
32my %opts;
33getopts("druv", \%opts);
34
35my $debug = ($opts{d} ? 1 : 0);
36my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
37my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
38my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
39 # - note this makes it rather harder to deep-scan the messages to create local rules
40my $verbose = ($opts{v} ? 1 : 0);
41
42my $tmpdir = '/var/tmp';
43my $salearn = '/usr/local/bin/sa-learn';
44my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
45
46die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
47
48my $folder = 'reported.needsextraction';
49#my $folder = 'confirmed';
50
51# non-SSL IMAP settings:
52#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
53# User => 'imapspamuser',
54# Password => 'imapspamuserpassword',
55# Debug => $imapdebug);
56
57my $imap = Mail::IMAPClient->new(
58 User => 'junkmail',
59 Password => 'k3c86z2',
60 Socket => IO::Socket::SSL->new(
61 Proto => 'tcp',
62 PeerAddr => 'mail.company.com',
63 PeerPort => 993, # IMAP over SSL standard port
64 ),
65 Debug => $imapdebug,
66 );
67
68if (!defined($imap)) { die "IMAP Login Failed"; }
69
70my $msgcount = $imap->message_count($folder);
71
72# If debugging, print out the total counts for each mailbox
73print $msgcount, " message(s) to process\n" if $debug;
74
75## Process the spam mailbox
76$imap->select($folder);
77my @msgs = $imap->search("ALL");
78
79# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
80#my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
81#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
82
83my $spamtest = Mail::SpamAssassin->new();
84 # don't keep dereferencing this
85
86my %iplist;
87my %urilist;
88
89# put together an array of netblocks we won't/can't list for various reasons
90my @dontlistme = (
91 # Hotmail/Windows Live Mail
92#IP-Network 207.68.128.0/18
93#IP-Network 207.68.192.0/20
94#IP-Network-Block 207.068.128.000 - 207.068.207.255
95 NetAddr::IP->new("65.52.0.0/14"),
96 NetAddr::IP->new("207.68.176.96/27"),
97
98 # AOL - note only some IPs show mail-ish rDNS
99 #IP-Network 205.188.0.0/16
100 #IP-Network 64.12.0.0/16
101 NetAddr::IP->new("205.188.91.96/29"),
102 NetAddr::IP->new("205.188.105.143"),
103 NetAddr::IP->new("205.188.105.144/30"),
104 NetAddr::IP->new("205.188.169.196/29"),
105 NetAddr::IP->new("205.188.249.128/29"),
106 NetAddr::IP->new("205.188.249.64/29"),
107 NetAddr::IP->new("64.12.78.136/30"),
108 NetAddr::IP->new("64.12.78.142"),
109 NetAddr::IP->new("64.12.143.144/30"),
110 NetAddr::IP->new("64.12.143.152/30"),
111 NetAddr::IP->new("64.12.206.40/30"),
112
113 # Google/GMail
114 NetAddr::IP->new("209.85.128.0/17"),
115 NetAddr::IP->new("72.14.192.0/18"),
116 NetAddr::IP->new("74.125.0.0/16"),
117
118 # Yahoo!/Inktomi
119 NetAddr::IP->new("98.136.0.0/14"),
120 NetAddr::IP->new("66.196.64.0/18"),
121 NetAddr::IP->new("67.195.0.0/16"),
122 NetAddr::IP->new("69.147.64.0/18"),
123 NetAddr::IP->new("206.190.32.0/18"),
124 NetAddr::IP->new("68.142.192.0/18"),
125 NetAddr::IP->new("216.252.96.0/19"),
126 NetAddr::IP->new("124.83.128.0/17"),
127 NetAddr::IP->new("217.146.184.0/21"),
128 NetAddr::IP->new("124.108.96.0/20"),
129 NetAddr::IP->new("76.13.0.0/16"),
130 NetAddr::IP->new("68.180.128.0/17"),
131 NetAddr::IP->new("209.191.64.0/18"),
132 NetAddr::IP->new("212.82.104.0/21"),
133 NetAddr::IP->new("66.163.160.0/19"),
134#inetnum: 87.248.110.0 - 87.248.111.255
135#route: 87.248.104.0/21
136 NetAddr::IP->new("87.248.110.0/23"),
137 NetAddr::IP->new("203.188.200.0/22"),
138 NetAddr::IP->new("217.12.0.0/20"),
139
140 # MessageLabs - may add these to trusted_networks instead
141 NetAddr::IP->new("85.158.139.0/24"),
142
143 # Bell Canada - note only some IPs show mail-ish rDNS
144 #IP-Network 209.226.0.0/16
145 #IP-Network 207.236.0.0/16
146 NetAddr::IP->new("209.226.175.0/24"),
147 NetAddr::IP->new("207.236.237.0/26"),
148
149 # Craigslist
150 #IP-Network 208.82.236.0/22
151 NetAddr::IP->new("208.82.236.0/22"),
152
153 # Apple.com/mac.com - note only some IPs show mail-ish rDNS
154 #IP-Network 17.0.0.0/8
155 # asmtpout0(11-30).mac.com
156 # 17.148.16. 011 -> 86 030 -> 105
157 NetAddr::IP->new("17.148.16.64/26"),
158
159 # Vodafone - note only some IPs show mail-ish rDNS
160 #route: 212.183.128.0/19
161 NetAddr::IP->new("212.183.156.224/29"),
162
163 # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
164 #IP-Network 24.222.0.0/16
165 NetAddr::IP->new("24.222.0.30"),
166 NetAddr::IP->new("24.224.136.8/30"),
167
168 # Cogeco - only a few IPs observed with mail-ish rDNS
169 #IP-Network 216.221.64.0/19
170 NetAddr::IP->new("216.221.81.192"),
171 NetAddr::IP->new("216.221.81.96/30"),
172
173 # seznam.cz
174 #route: 77.75.72.0/23
175 NetAddr::IP->new("77.75.72.44"),
176
177 # Facebook - only exclude mail-ish hostnames
178 #IP-Network 69.63.176.0/20
179 # .178.160/27 plus a few more to 199 show mailish rDNS
180 NetAddr::IP->new("69.63.178.160/27"),
181 NetAddr::IP->new("69.63.178.192/29"),
182
183 # T-Online - only mail-ish hostnames
184#route: 194.25.0.0/16
185#inetnum: 194.25.134.0 - 194.25.134.255
186#netname: DTOS-ULM-001
187#descr: www.t-online.de
188#descr: mail.t-online.de
189#descr: php.t-online.de
190 NetAddr::IP->new("194.25.134.80/29"),
191
192## Edumactional places
193
194 # UAlberta - only one IP observed with mail-ish rDNS
195 #IP-Network 129.128.0.0/16
196 NetAddr::IP->new("129.128.5.19"),
197
198 # University of Maryland Baltimore County - mail servers only
199 #CIDR: 130.85.0.0/16
200 NetAddr::IP->new("130.85.25.76/30"),
201
202 # University of Florida
203 #IP-Network 128.227.0.0/16
204 NetAddr::IP->new("128.227.74.165"),
205
206 ); # done def for @dontlistme
207
208MSG: for (my $i=0; $i<$msgcount; $i++) {
209 my $msg = $imap->message_string($msgs[$i]);
210
211 my $mail = $spamtest->parse($msg);
212 my $status = $spamtest->check($mail);
213
214# stolen from SA plugin bits
215#sub parsed_metadata {
216# my ($self, $opts) = @_;
217# my $scanner = $status->{permsgstatus};
218
219 my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
220
221 # Generate the full list of html-parsed domains.
222 my $uris = $status->get_uri_detail_list();
223
224# print "$msgs[$i]\n";
225 my %msguris;
226
227 while (my($uri, $info) = each %{$uris}) {
228 # we want to skip mailto: uris
229 next if ($uri =~ /^mailto:/);
230
231 # no domains were found via this uri, so skip
232 next unless ($info->{domains});
233
234 next if $info->{types}->{img};
235 foreach ( keys %{ $info->{domains} } ) {
236 if (exists $skip_domains->{$_}) {
237 next;
238 }
239 $msguris{$_}++;
240 #print " ".$_."\n";
241 }
242
243 # print "$uri ".$info->{anchor_text}."\n";
244 }
245
246 foreach (keys %msguris) {
247# print " $_ $msguris{$_}\n";
248 $urilist{$_}++;
249 }
250
251 # now, get the relay IP
252
253 my $stmsg = $status->get_message();
254 my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
255
256 my $relayip = new NetAddr::IP $untrusted[0]->{ip};
257
258# sigh. messages tagged as spam already make life difficult by hiding the
259# old Received: headers. We'll just handle them manually for now.
260 if (!$relayip) {
261 print "phtui: ";
262 my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
263 my $recvnum = 0;
264 my $recv = $headerlist{'Received'}[$recvnum];
265 print "$recv\n";
266 print " $headerlist{'Subject'}[0]\n";
267 next MSG;
268 }
269
270 foreach my $block (@dontlistme) {
271 next MSG if $relayip->within($block);
272 }
273
274 $iplist{$relayip->addr}++ if $relayip;
275
276# last if $i > 2;
277 sleep 1;
278} # IMAP message iteration
279
280# mm. don't really need times on the IP lists
281#if ($opts{r}) {
282# print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
283#}
284foreach my $ip (sort keys %iplist) {
285 $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
286 if ($opts{r}) {
287 print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
288 } else {
289 print "$ip\t $iplist{$ip}\n";
290 }
291}
292
293if ($opts{u}) {
294 print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
295}
296foreach my $uri (sort keys %urilist) {
297 my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.uribl.company.com' };
298 if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
299 if ($opts{u}) {
300 print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
301 } else {
302 # URI plus count
303 print "$uri\t$urilist{$uri}\n";
304 }
305 }
306}
307
308$imap->close();
309
310# Close IMAP connection cleanly.
311$imap->logout();
312
313# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
314# irrelevant for SQL Bayes
315#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
316#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
Note: See TracBrowser for help on using the repository browser.