#!/usr/bin/perl -T
use LWP::UserAgent;

use lib '/usr/local/lib/mime_parse_scan/';
use lib '/home/7eggert/l/my/spam/';
use BE::Lib;

# The script runs under -T (see the shebang line); take PATH out of the
# environment before the external command below is started.
delete $ENV{PATH};

# Directory the script was started in, as reported by the external `pwd`
# command.  The argument loop further down prepends it to relative file names.
$cwd = `pwd`;
chomp $cwd;

# Tag name => attribute that carries a link for that tag.
%::links_match = (
	a    => 'href',
	img  => 'src',
	form => 'action',
);

# Per-document state: links found so far (stored as hash keys) and the text
# collected by the parser's default handler.
%::links = ();
$::text  = '';

# Start-tag callback for the HTML parser.
#   $tag  - tag name, as delivered by the 'tag' argspec
#   $attr - hash reference with the tag's attributes ('attr' argspec)
# When %::links_match names a link attribute for this tag and the tag carries
# that attribute, its value is passed through url_deescape() and stored as a
# key of %::links.
# NOTE(review): @::deescape is not handed to url_deescape() here, unlike the
# text scans in the argument loop -- confirm that this is intended.
sub cb_start {
	my ($tag, $attr) = @_;

	my $wanted = $::links_match{$tag};
	return unless $wanted && exists $attr->{$wanted};

	my $link = url_deescape($attr->{$wanted});
	$::links{$link} = 1;
}

use HTML::Parser;
# One parser object, used for every document handled by the argument loop.
#   default_h - appends the 'dtext' value of each event that has no handler
#               of its own to $::text
#   start_h   - passes tag name and attribute hash to cb_start()
#   comment_h, end_h - empty-string handlers, i.e. these events are ignored
$::HTML_Parser = HTML::Parser->new(
	default_h => [ sub { $::text .= $_[0] }, 'dtext' ],
	start_h   => [ \&cb_start, 'tag, attr' ],
	comment_h => [ "" ],
	end_h     => [ "" ],
);

# Option list handed to url_deescape(); filled by the -R and -C switches.
@::deescape = ();

# Set by -v.
$::verbose = 0;

# Unbuffered output on the currently selected handle.
$| = 1;

# Main driver: consume the command line from left to right.
#   -R   append "redir" to @::deescape
#   -C   append "correct" to @::deescape
#   -v   print each URL to the terminal before it is fetched
# Every other argument is a URL, or a local file name that is turned into a
# file:// URL.  The document is fetched, parsed and scanned, and the links
# found in it are printed one per line in sorted order.  Switches only affect
# the arguments that follow them.
# The `defined` test keeps an argument such as "0" from ending the loop early.
while (defined(my $url = shift(@ARGV))) {

	if ($url eq "-R") { push(@::deescape, "redir");   next; }
	if ($url eq "-C") { push(@::deescape, "correct"); next; }
	if ($url eq "-v") { $::verbose = 1;               next; }

	# An empty argument names nothing that could be fetched.
	next if $url eq '';

	# Reset the per-document state filled in by the parser callbacks.
	%::links = ();
	$::text  = "";
	my $ua = LWP::UserAgent->new(env_proxy => 1);

	# Anything without a scheme is taken as a local file; relative names are
	# resolved against the start directory.
	if ($url !~ m~^\w+://~) {
		$url = "$cwd/$url" unless $url =~ m{^/};
		$url = "file://localhost$url";
	}

	print "$url\e[K\r" if $::verbose;

	# Collect the whole body through the content callback, then parse it.
	my $text = '';
	my $response = $ua->request(
		HTTP::Request->new(GET => $url),
		sub { $text .= $_[0] },
	);
	# Report a failed fetch on STDERR; whatever body arrived is still scanned.
	unless ($response->is_success) {
		warn "$url: ", $response->status_line, "\n";
	}
	$::HTML_Parser->parse($text);
	$::HTML_Parser->eof;

	if ($::text) {
		# Copies its argument before calling url_deescape(), so regex work
		# inside the helper cannot disturb a capture variable passed in.
		my $record = sub {
			my ($candidate) = @_;
			$::links{ url_deescape($candidate, @::deescape) } = 1;
		};

		# Treat non-breaking spaces like ordinary spaces.
		$::text =~ s/\xa0/ /g;

		# Decoded text collected by the parser: explicit http(s) URLs, then
		# bare "www." names after whitespace or at the start of a line.  All
		# of them go into %::links, the hash that is printed below.
		while ($::text =~ m,(^|[<="'\s])(https?:/+[^\s"'>]+),gi) { $record->($2); }
		while ($::text =~ m,\s(www\.[^\s"'>]+),gi)                { $record->("http://$1"); }
		while ($::text =~ m,^(www\.[^\s"'>]+),gmi)                { $record->("http://$1"); }

		# The same scans over the raw, unparsed document.
		while ($text =~ m,(^|[="\s])(https?:/+[^\s"'>]+),gi) { $record->($2); }
		while ($text =~ m,\s(www\.[^\s"'>]+),gi)             { $record->("http://$1"); }
		while ($text =~ m,^(www\.[^\s"'>]+),gmi)             { $record->("http://$1"); }
	}

	my @found = keys(%::links);
	if (@found) {
		# Pass every link through url_addbase() together with the document URL.
		$_ = url_addbase($_, $url) for @found;
		print(join("\n", sort(@found)) . "\n");
	}
}

