#!/usr/bin/perl -Tw

sub homedir(){
	$ENV{HOME}=~/(.*)/;
	$::homedir=$1;
}
use lib '/usr/local/lib';
use lib homedir."/.lib";
use lib '/usr/local/lib/mime_parse_scan';
use lib "$::homedir/.lib/mime_parse_scan";

use lib '/home/7eggert/l/my/spam/';

use File::Glob ':glob';
use Digest::MD5;
use BE::Lib;
use Socket;

delete $ENV{"PATH"};

my $DOMAINLIST="/home/7eggert/.config/fsmtpd/spammers.domain";
my $DOMAINLIST2="/home/7eggert/.config/fsmtpd/goodurl.domain";
my $NXDOMAIN="/home/7eggert/.config/fsmtpd/nxdomain";
my $NEUTRAL="/home/7eggert/.config/fsmtpd/neutralfiles.md5";
my $TLTLD="/home/7eggert/.config/fsmtpd/two-level-tlds";
my $GOOGLESPAMMERS="/home/7eggert/.config/fsmtpd/spammers.google";
my $SITEMD5="/home/7eggert/.config/fsmtpd/spammers.sitemd5";

if(open(MD5,"<",$SITEMD5)){
	@::md5s=<MD5>;
	map(chomp,@::md5s);
} else{die "open($SITEMD5): $!"};

if(open(SD,">>",$DOMAINLIST)){
}else{die $!}

open(DOMAINLIST,'<',$DOMAINLIST) || die "$DOMAINLIST: $!";
@::DOMAINLIST=<DOMAINLIST>;
close(DOMAINLIST) || die $!;
map({chomp;s/^\[(.*)\]$/$1/}@::DOMAINLIST);
@::DOMAINLIST=grep(!/^$|^#/,@::DOMAINLIST);

open(DOMAINLIST,'<',$DOMAINLIST2) || die "$DOMAINLIST2: $!";
open(NXDOMAIN,'<',$NXDOMAIN) || die "$NXDOMAIN: $!";
@::DOMAINLIST2=(<DOMAINLIST>,<NXDOMAIN>);
close(DOMAINLIST) || die $!;
close(NXDOMAIN) || die $!;
map({chomp;s/^\[(.*)\]$/$1/}@::DOMAINLIST2);
@::DOMAINLIST2=grep(!/^$|^#/,@::DOMAINLIST2);

open(NEUTRAL,'<',$NEUTRAL) || die $!;
@::NEUTRAL=<NEUTRAL>;
close(NEUTRAL) || die $!;
map({chomp;s/^\[(.*)\]$/$1/}@::NEUTRAL);
@::NEUTRAL=grep(!/^$|^#/,@::NEUTRAL);


if(open(SD,">>",$DOMAINLIST)){
}else{die $!}
if(open(NXDOMAIN,">>",$NXDOMAIN)){
}else{die $!}


%::googlespammers=read_iplist($GOOGLESPAMMERS);


$recursions=1;
$::verbose=0;
$::linkex=0;
$::recurse=1;
$::nodomain=0;
@::deescape=("correct");
%::done=();
%::done_domain=();

push(@ARGV,'-r');

$::base=undef;

for my $dir (("tmp","files","gif")){
	if(!-d $dir){mkdir $dir}
	if(!-d $dir){die "Cannot create $dir: $!\n"}
}

url_loadtldomain($TLTLD);

sub is_tainted {
	my $var = shift;
	my $blank = substr( $var, 0, 0 );
	return not eval { eval "1 || $blank" || 1 };
}

#chdir("tmp");
url:while(@ARGV){
	my $url=shift(@ARGV);
#	print STDERR "doing $url, rest is ",join(', ',@ARGV),"\n";
	if($url=~/^-(\d+)$/){$recursions=$1;next}
	if($url eq "-v"){$::verbose=1;next}
	if($url eq "-r"){$::recurse=0;$::linkex=0;next}
	if($url eq "-l"){$::linkex=1;next}
	if($url eq "-b"){$::base=undef;next}
	if($url eq "-D"){$::nodomain=1;next}
	if($url eq "-R"){push(@::deescape,"redir");next}

	if(!($url=~/^https?:/i)){
		if(-f $url){
			open(F,'-|','/home/7eggert/l/my/spam/mime_parse_scan/mime-parse-scan.pl','-onlylink',$url);
			my @a=<F>;
			close(F);
			map(chomp,@a);
			@a=grep(!/^-|^$/,@a);
			unshift(@ARGV,@a);
#			print STDERR "unshifted <",join(', ',@a),">\n";
			next;
		}
		print STDERR "skipping $url\n";
		next;
	}
#	print STDERR "continuing $url";

	$url=url_deescape($url,@deescape);
	$url=~s/\?[0-9a-z.]+$/?08154711/i
	|| $url=~s/\?.*//;
	
	if(grep($_ eq $url,keys(%::done))){next}
	$::done{$url}=1;

	my @domain;
	if(!(@domain=url_getdomain($url))){next}
	my $domain=$domain[0];

	if($::done_domain{$domain}){next}
	if($domain[1] && $::done_domain{$domain[1]}){next}

	my @d;
	if(@d=grep($domain=~/(^|\.)\Q$_\E$/,@::DOMAINLIST2)){next}
	if(@d=grep($domain=~/(^|\.)\Q$_\E$/,@::DOMAINLIST)){
		if($::base && grep($domain=~/(^|\.)\Q$_\E$/,@::DOMAINLIST)){
			$domain=url_getdomain($base);
			$::done_domain{$domain}=1;
			if($domain=~/^[\w-]+\.\w+$|^(\d+\.){3}\d+$/){
				if($domain=~/^(\d+\.){3}\d+$/){$domain="[$domain]"}
				print SD "$domain\n";
				$domain="($domain)";
			}
			print "$domain refers_done $::base\n";
			while(shift(@ARGV) ne "-b"){}
			$::base=undef;
			next url;
		}
		if($::verbose){print "\n\n"}
		print "($domain) done $d[0]\n";
		next url;
	}
	
	if($::verbose){
		print STDERR "$url $domain ".($domain[1]?$domain[1]:"")."\e[K\r";
	}
	
	if($domain[1]){
		if(!(my $ip=gethostbyname($domain[1]))){
			if($? == 2){$ip=gethostbyname($domain[1])}
			if($? == 1){
				print NXDOMAIN "$domain[1]\n";
				$::done_domain{$domain[1]}=1;
				if($::verbose){print "\n\n"}
				print "($domain[1]) nxdomain\n";
				next url;
			}
		}
		if(my $ip=inet_aton($domain[1].".multi.surbl.org")){
			print SD "$domain[1]\n";
			$::done_domain{$domain[1]}=1;
			if($::verbose){print "\n\n"}
			my $mask=unpack('N',$ip);
			my @bls=();
			for(my $i=1;$i<=5;$i++){
				if($mask&(1<<$i)){
					push(@bls,('n/a','sc','ws','ph','ob','ab')[$i]);
				}
			}
			print "($domain[1]) surbl.org(".join(',',@bls).") $url\n";
			next url;
		}
		my $a=ipinlist($domain[1],%::googlespammers);
		if($a){
			print SD "$domain[1]\n";
			$::done_domain{$domain[1]}=1;
			if($::verbose){print "\n\n"}
			print "($domain[1]) googlespammer $a\n";
			next url;
		}
	}elsif($domain=~/^(\d+\.){0,3}\d+$/){
		my $ip=inet_aton($domain);
		if($ip){
			my $a=unpack("N",$ip);
			$a=($a&0xff)<<24|($a&0xff00)<<8|($a&0xff0000)>>8|($a&0xff000000)>>24;
			if(my $ip=inet_aton(inet_ntoa(pack("N",$a)).".sc.surbl.org")) {
				print SD "[$domain]\n";
				$::done_domain{"[$domain]"}=1;
				if($::verbose){print "\n\n"}
				my $mask=unpack('N',$ip);
				my @bls=();
				for(my $i=1;$i<=5;$i++){
					if($mask&(1<<$i)){
						push(@bls,('n/a','sc','ws','ph','ob','ab')[$i]);
					}
				}
				print "([$domain]) surbl.org(".join(',',@bls).") $url\n";
				next url;
	}	}	}
	my @files=grep(!/^tmp\/\.\.?$/,
	          bsd_glob('tmp/{*,.*}',GLOB_NOSORT|GLOB_BRACE));
	for $a (@files){
		$a=~/(.*)/;
		if(!is_tainted($1)){unlink($1)}
		else{print "\n\nfoobar $a\n\n"}
	}
	for(my $x=4;$x>=0;){
		@files=grep(!/^tmp\/\.\.?$/,
		       bsd_glob('tmp/{*,.*}',GLOB_NOSORT|GLOB_BRACE));
		for $a (@files){
			$a=~/(.*)/;
			if(!is_tainted($1)){unlink($1)}
			else{print "\n\nfoobar $a\n\n"}
		}
		@files=grep(!/^tmp\/\.\.?$/,
			   bsd_glob('tmp/{*,.*}',GLOB_NOSORT|GLOB_BRACE));
		--$x;
		if(($x>0) && @files){
#			if(fork){exit}
			sleep 4;print STDERR "waiting for stale files ($x) ... \r"
		}
	}
	@files=grep(!/^tmp\/\.\.?$/,
	       bsd_glob('tmp/{*,.*}',GLOB_NOSORT|GLOB_BRACE));
	if(@files){die "\nCould not erase files: ".join(", ",@files)}
	
	$url=~/(.*)/;

	if(!$::linkex){
		chdir('tmp') || die;
		system("/usr/bin/wget","-nd","-T8","-t1","-Q4m","-rl$recursions",
			   "--timeout=30", "-e","robots=off","-o","/dev/null","$1");
		chdir('..') || die;
		@files=bsd_glob('tmp/{*,.*}',GLOB_NOSORT|GLOB_BRACE);
		@files=grep(!/^tmp\/\.\.?$/,@files);
		for $file (@files){
			if(open(FILE,'<',$file)){
				$ctx = Digest::MD5->new;
				$ctx->addfile(*FILE);
				my $digest = $ctx->hexdigest;
				if(my @x=grep(/^\Q$digest\E\b/,@::md5s)){
					my $desc=$x[0];
					$desc=~s/.*?\s+//;
					if($::base){
						my $domain2=url_getdomain($::base);
						if($domain2 ne $domain){
							push(@::DOMAINLIST,$domain);
							if($domain2=~/^[\w-]+\.\w+$|^(\d+\.){3}\d+$/){
								if($domain2=~/^(\d+\.){3}\d+$/){$domain="[$domain2]"}
								print SD "$domain2\n";
								$::done_domain{$domain2}=1;
								$domain2="($domain2)";
							}
							if($::verbose){print "\n\n"}
							print "$domain2 refers_$desc $base $file\n";
					}	}
					push(@::DOMAINLIST,$domain);
					if($domain=~/^[\w-]+\.\w+$|^(\d+\.){3}\d+$/){
						if($domain=~/^(\d+\.){3}\d+$/){$domain="[$domain]"}
						print SD "$domain\n";
						$::done_domain{$domain}=1;
						$domain="($domain)";
					}
					if($::verbose&&!$::base){print "\n\n"}
					print "$domain $desc $url $file\n";
					if($::base){
						while(shift(@ARGV) ne "-b"){}
						$::base=undef;
					}
					next url;
				}elsif(!grep($_ =~ /\Q$digest\E$/,@::NEUTRAL)){
					my $ext=".dat";
					my $nfile;
					if(($file=~/unsub|remov|head|alprazolam|ambien|cialis|diazepam|fioricet|
					levitra|phent|prozac|slogan|soma|valium|viagra|xanax/x)
					&&!($file=~/\.php$/)){
						if($file=~/(\.[^.]+)$/){$ext=$1}
						$nfile="files/$digest md5_$domain$ext";
						$file=~/(.*)/;
						if(!-e $nfile){rename($1,$nfile)}
					}elsif($file=~/\.(?:gif|jpg|png)$/i){
						if($file=~/(\.[^.]+)$/){$ext=$1}
						$nfile="gif/$digest md5_$domain$ext";
						$file=~/(.*)/;
						if(!-e $nfile){
							rename($1,$nfile)
							|| print " $1 -> $nfile: $!\n";
		}	}	}	}	}
	}else{@files=($1)}
	close(FILE);
	if(!$::base&&(@files==1)&&!$::nodomain){
		$files[0]=~/(.*)/;
		open(LINKS,'-|',"/home/7eggert/bin/link-extract.pl",$url)
		|| die "link-extract.pl: $!";
		my @l=<LINKS>;
		close(LINKS);
		map(s/[\x0d\x0a]+$//,@l);
		map($_=url_deescape($_,"redir"),@l);
		if(@l){
			map(chomp($_),@l);
			unshift(@ARGV,@l,'-b');
			$::base=$url;
			next url;
	}	}
	my $oldurl=$url;
	if(!$::base&&!$::nodomain&&($url=~
	s~^(https?:/+?(www\.)?([0-9a-z]+\@)?([^/\?:]+)(:\d+)?)[/\?]\w.*~$1~)){
		if(($url ne $oldurl)
		&& ("$url/" ne $oldurl)){
			$::base=$oldurl;
			unshift(@ARGV,$url,'-b');
}	}	}
#print STDERR "normal exit\n";