1 | #!/usr/bin/perl
|
---|
2 | # Extract relay IP, URI hosts from mail in folder
|
---|
3 | # Hack-n-patch from:
|
---|
4 | # Heavily reworked from original:
|
---|
5 | # dmz@dmzs.com - March 19, 2004
|
---|
6 | # http://www.dmzs.com/tools/files/spam.phtml
|
---|
7 | # LGPL
|
---|
8 | # by:
|
---|
9 | # Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
|
---|
10 | # Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
|
---|
11 | ##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
|
---|
12 | ##wrapver 0.1
|
---|
13 | ##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
|
---|
14 | # $Id: extract-data 2 2009-09-01 21:27:00Z kdeugau $
|
---|
15 | # $URL$
|
---|
16 |
|
---|
17 | use strict;
|
---|
18 | use warnings;
|
---|
19 | use Time::Local;
|
---|
20 | use POSIX qw(strftime);
|
---|
21 | use IO::Socket::SSL;
|
---|
22 | use Mail::IMAPClient;
|
---|
23 | use Getopt::Std;
|
---|
24 | use NetAddr::IP;
|
---|
25 |
|
---|
26 | # hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
|
---|
27 | # but we did find enough to extract the URIs...
|
---|
28 | use lib '/opt/spamassassin/share/perl/5.8.8';
|
---|
29 | use Mail::SpamAssassin;
|
---|
30 | use Mail::SpamAssassin::PerMsgStatus;
|
---|
31 |
|
---|
32 | my %opts;
|
---|
33 | getopts("druv", \%opts);
|
---|
34 |
|
---|
35 | my $debug = ($opts{d} ? 1 : 0);
|
---|
36 | my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
|
---|
37 | my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
|
---|
38 | my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
|
---|
39 | # - note this makes it rather harder to deep-scan the messages to create local rules
|
---|
40 | my $verbose = ($opts{v} ? 1 : 0);
|
---|
41 |
|
---|
42 | my $tmpdir = '/var/tmp';
|
---|
43 | my $salearn = '/usr/local/bin/sa-learn';
|
---|
44 | my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
|
---|
45 |
|
---|
46 | die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
|
---|
47 |
|
---|
48 | my $folder = 'reported.needsextraction';
|
---|
49 | #my $folder = 'confirmed';
|
---|
50 |
|
---|
51 | # non-SSL IMAP settings:
|
---|
52 | #my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
|
---|
53 | # User => 'imapspamuser',
|
---|
54 | # Password => 'imapspamuserpassword',
|
---|
55 | # Debug => $imapdebug);
|
---|
56 |
|
---|
57 | my $imap = Mail::IMAPClient->new(
|
---|
58 | User => 'junkmail',
|
---|
59 | Password => 'k3c86z2',
|
---|
60 | Socket => IO::Socket::SSL->new(
|
---|
61 | Proto => 'tcp',
|
---|
62 | PeerAddr => 'mail.company.com',
|
---|
63 | PeerPort => 993, # IMAP over SSL standard port
|
---|
64 | ),
|
---|
65 | Debug => $imapdebug,
|
---|
66 | );
|
---|
67 |
|
---|
68 | if (!defined($imap)) { die "IMAP Login Failed"; }
|
---|
69 |
|
---|
70 | my $msgcount = $imap->message_count($folder);
|
---|
71 |
|
---|
72 | # If debugging, print out the total counts for each mailbox
|
---|
73 | print $msgcount, " message(s) to process\n" if $debug;
|
---|
74 |
|
---|
75 | ## Process the spam mailbox
|
---|
76 | $imap->select($folder);
|
---|
77 | my @msgs = $imap->search("ALL");
|
---|
78 |
|
---|
79 | # Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
|
---|
80 | #my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
|
---|
81 | #mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
|
---|
82 |
|
---|
83 | my $spamtest = Mail::SpamAssassin->new();
|
---|
84 | # don't keep dereferencing this
|
---|
85 |
|
---|
86 | my %iplist;
|
---|
87 | my %urilist;
|
---|
88 |
|
---|
89 | for (my $i=0; $i<$msgcount; $i++) {
|
---|
90 | my $msg = $imap->message_string($msgs[$i]);
|
---|
91 |
|
---|
92 | my $mail = $spamtest->parse($msg);
|
---|
93 | my $status = $spamtest->check($mail);
|
---|
94 |
|
---|
95 | # stolen from SA plugin bits
|
---|
96 | #sub parsed_metadata {
|
---|
97 | # my ($self, $opts) = @_;
|
---|
98 | # my $scanner = $status->{permsgstatus};
|
---|
99 |
|
---|
100 | my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
|
---|
101 |
|
---|
102 | # Generate the full list of html-parsed domains.
|
---|
103 | my $uris = $status->get_uri_detail_list();
|
---|
104 |
|
---|
105 | # print "$msgs[$i]\n";
|
---|
106 | my %msguris;
|
---|
107 |
|
---|
108 | while (my($uri, $info) = each %{$uris}) {
|
---|
109 | # we want to skip mailto: uris
|
---|
110 | next if ($uri =~ /^mailto:/);
|
---|
111 |
|
---|
112 | # no domains were found via this uri, so skip
|
---|
113 | next unless ($info->{domains});
|
---|
114 |
|
---|
115 | next if $info->{types}->{img};
|
---|
116 | foreach ( keys %{ $info->{domains} } ) {
|
---|
117 | if (exists $skip_domains->{$_}) {
|
---|
118 | next;
|
---|
119 | }
|
---|
120 | $msguris{$_}++;
|
---|
121 | #print " ".$_."\n";
|
---|
122 | }
|
---|
123 |
|
---|
124 | # print "$uri ".$info->{anchor_text}."\n";
|
---|
125 | }
|
---|
126 |
|
---|
127 | foreach (keys %msguris) {
|
---|
128 | # print " $_ $msguris{$_}\n";
|
---|
129 | $urilist{$_}++;
|
---|
130 | }
|
---|
131 |
|
---|
132 | # now, get the relay IP
|
---|
133 |
|
---|
134 | my %headerlist = %{$imap->parse_headers($msgs[$i], "Received")};
|
---|
135 | my $recvnum = 0;
|
---|
136 | my $recv = $headerlist{'Received'}[$recvnum];
|
---|
137 | next if !$recv;
|
---|
138 |
|
---|
139 | my $relayip;
|
---|
140 |
|
---|
141 | #Received: from mail.company.com [ip.add.re.ss]
|
---|
142 | # by localhost with POP3 (fetchmail-6.2.5)
|
---|
143 | # for kdeugau@localhost (single-drop); Fri, 15 May 2009 11:45:10 -0400 (EDT)
|
---|
144 | if ($recv =~ /from mail\.company\.com \[ip\.add\.re\.ss\]\s*by localhost with POP3 \(fetchmail/) {
|
---|
145 | $recvnum += 1;
|
---|
146 | $recv = $headerlist{'Received'}[$recvnum];
|
---|
147 | }
|
---|
148 |
|
---|
149 | if ($recv =~ /^by mx\d\.company\.com \(Postfix, from userid \d+\)/) {
|
---|
150 | $recvnum++;
|
---|
151 | $recv = $headerlist{'Received'}[$recvnum];
|
---|
152 | }
|
---|
153 |
|
---|
154 | # le sigh. gotta bypass a message if we can't parse the headers. Outlook
|
---|
155 | # does an admirable job of mangling things for us. >:(
|
---|
156 | if ($recv !~ /by mx\d\.company\.com \(Postfix\)/) {
|
---|
157 | print "phtui: $recv\n";
|
---|
158 | next;
|
---|
159 | }
|
---|
160 |
|
---|
161 | ##fixme
|
---|
162 | # le sigh. skip IP extraction on tagged spam reported as nonspam, since the real spam is a layer deeper.
|
---|
163 | next if $recv =~ /from localhost by mfs\d with SpamAssassin/;
|
---|
164 |
|
---|
165 | # Postini puts the "real" received: header one layer further out - SA is configured to compensate for this so we do too
|
---|
166 | #IP-Network 64.18.0.0/20
|
---|
167 | #IP-Network-Block 064.018.000.000 - 064.018.015.255
|
---|
168 | #Org-Name Postini, Inc.
|
---|
169 | if ($recv =~ /\[64\.18\.(?:[0-9]|1[0-5])\.\d+]\) by mx\d\.company\.com/) {
|
---|
170 | $recv = $recv = $headerlist{'Received'}[++$recvnum];
|
---|
171 | #Received: from source ([208.95.48.65]) (using TLSv1) by
|
---|
172 | # exprod5mx230.postini.com ([64.18.4.10]) with SMTP; Fri, 10 Jul
|
---|
173 | my ($tmprelayip) = ($recv =~ /from source \(\[([\d.]+)\]\) (?:\(using TLSv1\) )?by exprod\dm[xo]b?\d+\.postini\.com/);
|
---|
174 | $relayip = new NetAddr::IP $tmprelayip;
|
---|
175 | } else {
|
---|
176 | my ($tmprelayip) = ($recv =~ /\[([\d+.]+)\]\) by mx\d\.company\.com/);
|
---|
177 | $relayip = new NetAddr::IP $tmprelayip;
|
---|
178 | }
|
---|
179 |
|
---|
180 | # Hotmail/Windows Live Mail may originate or relay spam, but we can't blacklist them
|
---|
181 | #Received: from blu0-omc4-s23.blu0.hotmail.com (blu0-omc4-s23.blu0.hotmail.com
|
---|
182 | # [65.55.111.162]) by mx2.company.com (Postfix)
|
---|
183 | # next if $recv =~ /from (?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com
|
---|
184 | \((?:bay|blu|col|snt)0-omc\d+-s\d+\.(?:bay|blu|col|snt)0\.hotmail\.com \[65.5[2345].\d+\.\d+\]\) by mx\d\.company\.com/;
|
---|
185 | #IP-Network 65.52.0.0/14
|
---|
186 | #IP-Network-Block 065.052.000.000 - 065.055.255.255
|
---|
187 | #Org-Name Microsoft Corp
|
---|
188 | my $hotmail1 = new NetAddr::IP "65.52.0.0/14";
|
---|
189 | print "$.: $recv\n" if !defined ($relayip);
|
---|
190 | next if $relayip->within($hotmail1);
|
---|
191 |
|
---|
192 | # AOL may originate or relay spam, but we can't blacklist them
|
---|
193 | #Received: from omr-m33.mx.aol.com (omr-m33.mx.aol.com [64.12.143.145]) by
|
---|
194 | # mx1.company.com (Postfix) with ESMTP id 7B9431C3255 for <webmaster@tyenet.com>;
|
---|
195 | next if $recv =~ /from (?:omr|imo)-[dm]\d+\.mx\.aol\.com \((?:omr|imo)-[dm]\d+\.mx\.aol\.com \[[\d.]+\]\) by mx\d\.company\.com/;
|
---|
196 |
|
---|
197 | # Google may relay spam, GMail may originate it, but we can't blacklist them.
|
---|
198 | #IP-Network 209.85.128.0/17
|
---|
199 | #IP-Network-Block 209.085.128.000 - 209.085.255.255
|
---|
200 | #Org-Name Google Inc.
|
---|
201 | next if $recv =~ /\[209\.85\.(?:1(?:2[89]|[3-9]\d)|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
|
---|
202 | #OrgName: Google Inc.
|
---|
203 | #NetRange: 72.14.192.0 - 72.14.255.255
|
---|
204 | #CIDR: 72.14.192.0/18
|
---|
205 | next if $recv =~ /\[72\.14\.(?:19[2-9]|2(?:[0-4]\d|5[0-5]))\.\d+\]\) by mx\d\.company\.com/;
|
---|
206 |
|
---|
207 | # Yahoo! may ... yadda yadda yadda (geeze they've got a whack of netblocks for mail...)
|
---|
208 | #IP-Network 98.136.0.0/14
|
---|
209 | #IP-Network-Block 098.136.000.000 - 098.139.255.255
|
---|
210 | #Org-Name Yahoo! Inc.
|
---|
211 | my $yahoo1 = new NetAddr::IP "98.136.0.0/14";
|
---|
212 | next if $relayip->within($yahoo1);
|
---|
213 | #IP-Network 66.196.64.0/18
|
---|
214 | #IP-Network-Block 066.196.064.000 - 066.196.127.255
|
---|
215 | #Org-Name Inktomi Corporation
|
---|
216 | # Inktomi ~~ Yahoo!
|
---|
217 | my $yahoo2 = new NetAddr::IP "66.196.64.0/18";
|
---|
218 | next if $relayip->within($yahoo2);
|
---|
219 | #IP-Network 67.195.0.0/16
|
---|
220 | #IP-Network-Block 067.195.000.000 - 067.195.255.255
|
---|
221 | #Org-Name Yahoo! Inc.
|
---|
222 | my $yahoo3 = new NetAddr::IP "67.195.0.0/16";
|
---|
223 | next if $relayip->within($yahoo3);
|
---|
224 | #IP-Network 69.147.64.0/18
|
---|
225 | #IP-Network-Block 069.147.064.000 - 069.147.127.255
|
---|
226 | #Org-Name Yahoo! Inc.
|
---|
227 | my $yahoo4 = new NetAddr::IP "69.147.64.0/18";
|
---|
228 | next if $relayip->within($yahoo4);
|
---|
229 | #IP-Network 206.190.32.0/19
|
---|
230 | #IP-Network-Block 206.190.032.000 - 206.190.063.255
|
---|
231 | #Org-Name Yahoo! Broadcast Services, Inc.
|
---|
232 | my $yahoo5 = new NetAddr::IP "206.190.32.0/18";
|
---|
233 | next if $relayip->within($yahoo5);
|
---|
234 | #IP-Network 68.142.192.0/18
|
---|
235 | #IP-Network-Block 068.142.192.000 - 068.142.255.255
|
---|
236 | #Org-Name Inktomi Corporation
|
---|
237 | my $yahoo6 = new NetAddr::IP "68.142.192.0/18";
|
---|
238 | next if $relayip->within($yahoo6);
|
---|
239 | #IP-Network 216.252.96.0/19
|
---|
240 | #IP-Network-Block 216.252.096.000 - 216.252.127.255
|
---|
241 | #Org-Name Yahoo! Inc.
|
---|
242 | my $yahoo7 = new NetAddr::IP "216.252.96.0/19";
|
---|
243 | next if $relayip->within($yahoo7);
|
---|
244 | #inetnum: 124.83.128.0 - 124.83.255.255
|
---|
245 | #descr: Yahoo! Japan
|
---|
246 | my $yahoo8 = new NetAddr::IP "124.83.128.0/17";
|
---|
247 | next if $relayip->within($yahoo8);
|
---|
248 |
|
---|
249 | # and the same goes for Bell Canada. *le sigh*
|
---|
250 | #IP-Network 209.226.0.0/16
|
---|
251 | #IP-Network-Block 209.226.000.000 - 209.226.255.255
|
---|
252 | #Org-Name Bell Canada
|
---|
253 | #Received: from tomts35-srv.bellnexxia.net (tomts35.bellnexxia.net
|
---|
254 | # [209.226.175.109]) by mx2.company.com (Postfix) with ESMTP id B415C16752D for
|
---|
255 | # <user@compnay.com>; Sat, 4 Jul 2009 10:48:24 -0400 (EDT)
|
---|
256 | # hmm. tomts\d(-srv)?.bellnexxia.net only seem to be in .175/24. we'll just drop those ones for now...
|
---|
257 | # especially since there appear to be hosted customers etc in the same ARIN allocation above.
|
---|
258 | my $bell1 = new NetAddr::IP "209.226.175.0/24";
|
---|
259 | next if $relayip->within($bell1);
|
---|
260 | # next if $recv =~ /\[209\.226\.175\.\d+\] by mx\d\.company\.com/;
|
---|
261 |
|
---|
262 | # ... and your little dog too!
|
---|
263 | #IP-Network 208.82.236.0/22
|
---|
264 | #IP-Network-Block 208.082.236.000 - 208.082.239.255
|
---|
265 | #Org-Name Craigslist, Inc.
|
---|
266 | my $craigslist1 = new NetAddr::IP "208.82.236.0/22";
|
---|
267 | next if $relayip->within($craigslist1);
|
---|
268 |
|
---|
269 | $iplist{$relayip->addr}++ if $relayip;
|
---|
270 | # print "$recv\n";
|
---|
271 | # print "$relayip\n\n";
|
---|
272 | # print $imap->get_header($msgs[$i], "From"); print "\n";
|
---|
273 |
|
---|
274 |
|
---|
275 | # last if $i > 15;
|
---|
276 | sleep 1;
|
---|
277 | } # IMAP message iteration
|
---|
278 |
|
---|
279 | # mm. don't really need times on the IP lists
|
---|
280 | #if ($opts{r}) {
|
---|
281 | # print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
|
---|
282 | #}
|
---|
283 | foreach my $ip (sort keys %iplist) {
|
---|
284 | $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
|
---|
285 | if ($opts{r}) {
|
---|
286 | print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
|
---|
287 | } else {
|
---|
288 | print "$ip\t $iplist{$ip}\n";
|
---|
289 | }
|
---|
290 | }
|
---|
291 |
|
---|
292 | if ($opts{u}) {
|
---|
293 | print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
|
---|
294 | }
|
---|
295 | foreach my $uri (sort keys %urilist) {
|
---|
296 | if ($opts{u}) {
|
---|
297 | my @hout = qx { host $uri.multi.uribl.com; host $uri.uribl.company.com };
|
---|
298 | if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
|
---|
299 | print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
|
---|
300 | }
|
---|
301 | } else {
|
---|
302 | # URI plus count
|
---|
303 | print "$uri: $urilist{$uri}\n";
|
---|
304 | }
|
---|
305 | }
|
---|
306 |
|
---|
307 | $imap->close();
|
---|
308 |
|
---|
309 | # Close IMAP connection cleanly.
|
---|
310 | $imap->logout();
|
---|
311 |
|
---|
312 | # integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
|
---|
313 | # irrelevant for SQL Bayes
|
---|
314 | #my $sarebuild = `/usr/bin/sa-learn --rebuild`;
|
---|
315 | #print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
|
---|