source: trunk/dnsbl/extract-data@ 16

Last change on this file since 16 was 16, checked in by Kris Deugau, 14 years ago

/trunk/dnsbl

Changes from live copy of extract-data

  • new IP blocks to ignore for listing
  • spit out refs for messages already tagged as spam so they can be manually handled; SA libs don't strip SA wrapper message when used this way
  • Property svn:executable set to *
  • Property svn:keywords set to Date Rev Author Id
File size: 8.1 KB
Line 
1#!/usr/bin/perl
2# Extract relay IP, URI hosts from mail in folder
3# Hack-n-patch from:
4# Heavily reworked from original:
5# dmz@dmzs.com - March 19, 2004
6# http://www.dmzs.com/tools/files/spam.phtml
7# LGPL
8# by:
9# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
10# Kris Deugau <kdeugau@deepnet.cx> 2009/05/19
11##wrapreq libmail-imapclient-perl (>=3.11), libio-socket-ssl-perl, install-sa
12##wrapver 0.1
13##wrapsum Extract relay IP and URI hosts from mail in IMAP folder
14# $Id: extract-data 16 2010-01-20 22:35:34Z kdeugau $
15# $URL$
16
17use strict;
18use warnings;
19use Time::Local;
20use POSIX qw(strftime);
21use IO::Socket::SSL;
22use Mail::IMAPClient;
23use Getopt::Std;
24use NetAddr::IP;
25
26# hmm. can't make (enough) sense of the docs. gonna have to parse headers for IPs myself... :/
27# but we did find enough to extract the URIs...
28use lib '/opt/spamassassin/share/perl/5.8.8';
29use Mail::SpamAssassin;
30use Mail::SpamAssassin::PerMsgStatus;
31
32my %opts;
33getopts("druv", \%opts);
34
35my $debug = ($opts{d} ? 1 : 0);
36my $sadebug = 0; # sa-learn -D spits out a LOT of useless crap - better to only activate if specifically needed
37my $imapdebug = 0; # so does Mail::IMAPClient... as in, the whole content of all the mail you look at. O_o
38my $delete_after_learning = 0; # set to 1 if you want to delete mail right away after learning
39 # - note this makes it rather harder to deep-scan the messages to create local rules
40my $verbose = ($opts{v} ? 1 : 0);
41
42my $tmpdir = '/var/tmp';
43my $salearn = '/usr/local/bin/sa-learn';
44my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';
45
46die "eeep! $salearn doesn't exist!\n" if ! -e $salearn;
47
48my $folder = 'reported.needsextraction';
49#my $folder = 'confirmed';
50
51# non-SSL IMAP settings:
52#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
53# User => 'imapspamuser',
54# Password => 'imapspamuserpassword',
55# Debug => $imapdebug);
56
57my $imap = Mail::IMAPClient->new(
58 User => 'junkmail',
59 Password => 'k3c86z2',
60 Socket => IO::Socket::SSL->new(
61 Proto => 'tcp',
62 PeerAddr => 'mail.company.com',
63 PeerPort => 993, # IMAP over SSL standard port
64 ),
65 Debug => $imapdebug,
66 );
67
68if (!defined($imap)) { die "IMAP Login Failed"; }
69
70my $msgcount = $imap->message_count($folder);
71
72# If debugging, print out the total counts for each mailbox
73print $msgcount, " message(s) to process\n" if $debug;
74
75## Process the spam mailbox
76$imap->select($folder);
77my @msgs = $imap->search("ALL");
78
79# Since the data goes into files anyway, why not make a mockery of a maildir and let sa-learn iterate over it?
80#my $spamtmp = "$tmpdir/spam.".time.".$$"; # this should give us a suitably pseudorandom directory
81#mkdir $spamtmp or die "couldn't create temporary pen for spam: $!";
82
83my $spamtest = Mail::SpamAssassin->new();
84 # don't keep dereferencing this
85
86my %iplist;
87my %urilist;
88
89# put together an array of netblocks we won't/can't list for various reasons
90my @dontlistme = (
91 # Hotmail/Windows Live Mail
92 NetAddr::IP->new("65.52.0.0/14"),
93
94 # AOL - note only some IPs show mail-ish rDNS
95 #IP-Network 205.188.0.0/16
96 #IP-Network 64.12.0.0/16
97 NetAddr::IP->new("205.188.105.140/29"),
98 NetAddr::IP->new("205.188.169.196/29"),
99 NetAddr::IP->new("205.188.249.128/29"),
100 NetAddr::IP->new("205.188.249.64/29"),
101 NetAddr::IP->new("64.12.143.144/30"),
102
103 # Google/GMail
104 NetAddr::IP->new("209.85.128.0/17"),
105 NetAddr::IP->new("72.14.192.0/18"),
106
107 # Yahoo!/Inktomi
108 NetAddr::IP->new("98.136.0.0/14"),
109 NetAddr::IP->new("66.196.64.0/18"),
110 NetAddr::IP->new("67.195.0.0/16"),
111 NetAddr::IP->new("69.147.64.0/18"),
112 NetAddr::IP->new("206.190.32.0/18"),
113 NetAddr::IP->new("68.142.192.0/18"),
114 NetAddr::IP->new("216.252.96.0/19"),
115 NetAddr::IP->new("124.83.128.0/17"),
116 NetAddr::IP->new("217.146.184.0/21"),
117 NetAddr::IP->new("124.108.96.0/20"),
118 NetAddr::IP->new("76.13.0.0/16"),
119 NetAddr::IP->new("68.180.128.0/17"),
120 NetAddr::IP->new("209.191.64.0/18"),
121 NetAddr::IP->new("212.82.104.0/21"),
122 NetAddr::IP->new("66.163.160.0/19"),
123
124 # Bell Canada - note only some IPs show mail-ish rDNS
125 #IP-Network 209.226.0.0/16
126 #IP-Network 207.236.0.0/16
127 NetAddr::IP->new("209.226.175.0/24"),
128 NetAddr::IP->new("207.236.237.0/26"),
129
130 # Craigslist
131 #IP-Network 208.82.236.0/22
132 NetAddr::IP->new("208.82.236.0/22"),
133
134 # Apple.com/mac.com - note only some IPs show mail-ish rDNS
135 #IP-Network 17.0.0.0/8
136 # asmtpout0(11-30).mac.com
137 # 17.148.16. 011 -> 86 030 -> 105
138 NetAddr::IP->new("17.148.16.64/26"),
139
140 # Vodafone - note only some IPs show mail-ish rDNS
141 #route: 212.183.128.0/19
142 NetAddr::IP->new("212.183.156.224/29"),
143
144 # Eastlink (formerly Persona [Sudbury etc]) - only a few IPs observed with mail-ish rDNS
145 #IP-Network 24.222.0.0/16
146 NetAddr::IP->new("24.222.0.30"),
147 NetAddr::IP->new("24.224.136.8/30"),
148
149 # Cogeco - only a few IPs observed with mail-ish rDNS
150 #IP-Network 216.221.64.0/19
151 NetAddr::IP->new("216.221.81.192"),
152 NetAddr::IP->new("216.221.81.96/30"),
153
154 # UAlberta - only one IP observed with mail-ish rDNS
155 #IP-Network 129.128.0.0/16
156 NetAddr::IP->new("129.128.5.19"),
157
158 # seznam.cz
159 #route: 77.75.72.0/23
160 NetAddr::IP->new("77.75.72.44"),
161
162 ); # done def for @dontlistme
163
164MSG: for (my $i=0; $i<$msgcount; $i++) {
165 my $msg = $imap->message_string($msgs[$i]);
166
167 my $mail = $spamtest->parse($msg);
168 my $status = $spamtest->check($mail);
169
170# stolen from SA plugin bits
171#sub parsed_metadata {
172# my ($self, $opts) = @_;
173# my $scanner = $status->{permsgstatus};
174
175 my $skip_domains = $status->{main}->{conf}->{uridnsbl_skip_domains};
176
177 # Generate the full list of html-parsed domains.
178 my $uris = $status->get_uri_detail_list();
179
180# print "$msgs[$i]\n";
181 my %msguris;
182
183 while (my($uri, $info) = each %{$uris}) {
184 # we want to skip mailto: uris
185 next if ($uri =~ /^mailto:/);
186
187 # no domains were found via this uri, so skip
188 next unless ($info->{domains});
189
190 next if $info->{types}->{img};
191 foreach ( keys %{ $info->{domains} } ) {
192 if (exists $skip_domains->{$_}) {
193 next;
194 }
195 $msguris{$_}++;
196 #print " ".$_."\n";
197 }
198
199 # print "$uri ".$info->{anchor_text}."\n";
200 }
201
202 foreach (keys %msguris) {
203# print " $_ $msguris{$_}\n";
204 $urilist{$_}++;
205 }
206
207 # now, get the relay IP
208
209 my $stmsg = $status->get_message();
210 my @untrusted = @{$stmsg->{metadata}->{relays_untrusted}};
211
212 my $relayip = new NetAddr::IP $untrusted[0]->{ip};
213
214# sigh. messages tagged as spam already make life difficult by hiding the
215# old Received: headers. We'll just handle them manually for now.
216 if (!$relayip) {
217 print "phtui: ";
218 my %headerlist = %{$imap->parse_headers($msgs[$i], "Received", "Subject")};
219 my $recvnum = 0;
220 my $recv = $headerlist{'Received'}[$recvnum];
221 print "$recv\n";
222 print " $headerlist{'Subject'}[0]\n";
223 next MSG;
224 }
225
226 foreach my $block (@dontlistme) {
227 next MSG if $relayip->within($block);
228 }
229
230 $iplist{$relayip->addr}++ if $relayip;
231
232# last if $i > 2;
233 sleep 1;
234} # IMAP message iteration
235
236# mm. don't really need times on the IP lists
237#if ($opts{r}) {
238# print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
239#}
240foreach my $ip (sort keys %iplist) {
241 $ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
242 if ($opts{r}) {
243 print "+$4.$3.$2.$1.spamhosts.company.com:127.0.0.2:900:::\n";
244 } else {
245 print "$ip\t $iplist{$ip}\n";
246 }
247}
248
249if ($opts{u}) {
250 print strftime("# %Y/%m/%d %H:%M", localtime())."\n";
251}
252foreach my $uri (sort keys %urilist) {
253 my @hout = qx { host '$uri.multi.uribl.com'; host '$uri.uribl.company.com' };
254 if ($hout[0] =~ /NXDOMAIN/ && $hout[1] =~ /NXDOMAIN/) {
255 if ($opts{u}) {
256 print "+$uri.uribl.company.com:127.0.0.2:900:::\n";
257 } else {
258 # URI plus count
259 print "$uri\t$urilist{$uri}\n";
260 }
261 }
262}
263
264$imap->close();
265
266# Close IMAP connection cleanly.
267$imap->logout();
268
269# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
270# irrelevant for SQL Bayes
271#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
272#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;
Note: See TracBrowser for help on using the repository browser.