#!/usr/bin/perl
#
# eg5a: grab a webpage, parse the links in it
#       and fetch and store linked .ps, .pdf and .tgz files
#
use strict;
use warnings;

use Function::Parameters;
use LWP::Simple;
use HTML::Parser;
use URI::URL;

# state shared between the main program and the findlinks() callback
my @links;	# absolute link URLs collected by findlinks()
my $url;	# URL of the page being scanned; set by the main program

#
# findlinks( $tag, $attr ): start-tag handler for HTML::Parser.
#	$tag  is the tag name, $attr a hashref of the tag's attributes.
#	For every <a> tag that has an href attribute, the href is made
#	absolute relative to the global $url and appended to the global
#	@links.  All other tags are ignored.
#
fun findlinks( $tag, $attr )
{
	# only anchors carrying an href are of interest
	return if $tag ne "a";
	my $href = $attr->{href};
	return if !defined $href;

	# resolve the (possibly relative) href against the page's own URL
	push @links, url( $href, $url )->abs;
}

# main program: fetch the page named on the command line, collect its links,
# then download every linked .ps, .pdf and .tgz file into destdir.
# NOTE(review): the messages below call this script "eg7" while the file
# header says "eg5a" - confirm which name is intended.
die "Usage: eg7 url destdir\n" unless @ARGV == 2;

$url = shift @ARGV;
my $destdir = shift @ARGV;

# get() returns undef on failure; test definedness so that an empty (but
# successfully fetched) page is not reported as a fetch error.
my $contents = get( $url ) // die "eg7: can't fetch URL $url\n";

# findlinks() is called for every start tag and appends absolute link
# URLs to @links.
my $parser = HTML::Parser->new(
	start_h => [ \&findlinks, 'tagname,attr'] );

$parser->parse( $contents );
$parser->eof;	# the whole document has been supplied: flush buffered input

# now @links contains the links - fetch the interesting ones into $destdir.

unless( -d $destdir )
{
	mkdir( $destdir, 0755 ) || die "can't create $destdir: $!\n";
}
chdir( $destdir ) || die "can't cd into $destdir\n";

foreach my $link (@links)
{
	# keep only links starting with "http" whose last path component
	# ends in .ps, .pdf or .tgz; that component becomes the local filename
	next unless $link =~ m#^http# && $link =~ m#/([^/]+\.(?:ps|pdf|tgz))$#;
	my $filename = $1;
	print "fetching $link -> $destdir/$filename\n";

	# getstore() returns the HTTP status code, which is always a true
	# value, so it has to be tested with is_success() rather than for truth.
	my $status = getstore( $link, $filename );
	warn "can't fetch $link\n" unless is_success( $status );
}
