#!/usr/bin/perl -w
#
# Scrapes image URLs out of a web page and prints them, one per line.
#
# Usage: getimages [-ailmRuv] url [html]
#   -a  accept everything: all hrefs, all image sizes.
#   -i  thumbnail mode: only follow links whose entire content is an <img>.
#   -l  local only: ignore links that leave the starting page's domain.
#   -m  enable the special case for .Mac slideshow pages.
#   -R  recurse: also collect images linked from this page's sub-pages.
#   -u  unique the output (-R: print only images found on exactly one page).
#   -v  verbose diagnostics on stderr.
# If [html] is given, it is used as the page source instead of fetching url.

use strict;

use Getopt::Std;

our $opt_a = 0;
our $opt_i = 0;
our $opt_l = 0;
our $opt_m = 0;
our $opt_R = 0;
our $opt_u = 0;
our $opt_v = 0;

### Uniquing functions.  Unlike uniq(1), these do not care if the input is
### sorted first.

# Eliminates redundant values.  NOTE: hash-based, so output order is
# unspecified.
sub unique(@)
{
    my %count;
    ++$count{$_} foreach (@_);
    return keys %count;
}

# Eliminates values mentioned more than once.  Same ordering caveat as
# unique().
sub distinct(@)
{
    my %count;
    ++$count{$_} foreach (@_);
    return grep { $count{$_} == 1 } keys %count;
}

### Composing, decomposing, inspecting, and otherwise munging urls.

# Splits a url into (scheme, host, domain, path, dirname, basename).
# Pieces that cannot be extracted come back as "".
sub decompose_url($)
{
    my $scheme   = $_[0] =~ m{^([^:]+):} ? $1 : "";
    my $host     = $_[0] =~ m{^$scheme:/[^/]*/([^/]+)} ? $1 : "";
    my $domain   = $host =~ m{([^.]+\.[^.]+)$} ? $1 : "";
    my $path     = $_[0] =~ m{^(?:$scheme:/[^/]*/$host/)?(.*)} ? $1 : "";
    my $dirname  = $path =~ m{(.*)/[^/]*$} ? $1 : "";
    my $basename = $path =~ m{([^/]+)$} ? $1 : "";
    return ($scheme, $host, $domain, $path, $dirname, $basename);
}

# Resolves $href relative to the page url $url, returning an absolute url
# with "/./" and "dir/../" sequences collapsed.
sub complete_url($$)
{
    my $url  = shift;
    my $href = shift;
    my ($scheme, $host, $dirname) = (decompose_url($url))[0, 1, 4];
    my $root = "$scheme://$host";
    my $base = $dirname
        ? ($dirname =~ m{^/} ? "$root$dirname/" : "$root/$dirname/")
        : "$root/";
    local $_ = $href;
    my $furl = m{^[a-z]+:} ? $_        # fully qualified url, return as is.
             : m{^/}       ? $root.$_  # absolute to base.
             :               $base.$_  # relative to base.
             ;
    while ($furl =~ m{/\./})         { $furl =~ s{/\./}{/}; }
    while ($furl =~ m{/[^/]+/\.\./}) { $furl =~ s{/[^/]+/\.\./}{/}; }
    return $furl;
}

sub domain($)
{
    return (decompose_url($_[0]))[2];
}

# True if url $_[0] has no domain or is in domain $_[1] -- or if -l is off,
# in which case everything is "local".
sub is_local($$)
{
    return 1 if !$opt_l;
    my $d = domain($_[0]);
    my $result = !$d || $d eq $_[1];
    # BUGFIX: diagnostic used to print $_, which is only the url by accident
    # of the grep callers; name the actual argument.
    print STDERR "$0: ignoring $_[0] (not local).\n" if ($opt_v && !$result);
    return $result;
}

sub is_image($)
{
    return $_[0] =~ m/\.(gif|jpe?g|png|tiff?)$/i;
}

# Heuristic: true if url $_[0] plausibly points at an HTML page worth
# scanning for images.  -a accepts everything.
sub is_html($)
{
    return 1 if ($opt_a);
    my ($scheme, $dirname, $basename) = (decompose_url($_[0]))[0, 4, 5];
    if ($opt_v) {
        if ($scheme && $scheme ne 'http') {
            print STDERR "$0: ignoring href $_[0] (not http).\n";
        }
        elsif (!$dirname && !$basename) {
            print STDERR "$0: ignoring href $_[0] (top-level page).\n";
        }
    }
    return 0 if $scheme && $scheme ne 'http'; # no: explicit scheme that's not http.
    return 0 if !$dirname && !$basename;      # no: looks like a top-level page.
    return 1 if !$basename;                   # yes: trivial basename.
    return 1 if $basename =~ m/\.html?$/i;    # yes: has HTML extension.
    return 1 if $basename =~ m/^[^.]+$/;      # yes: no extension.
    return 1 if $basename =~ /\?/;            # yes: looks like a query.
    return 1 if $basename =~ /\.(php3?)$/i;   # yes: looks like a CGI.
    print STDERR "$0: ignoring href $_[0] (not HTML).\n" if ($opt_v);
    return 0;
}

### A very trimmed-down curl(1) as a function.

# Fetches $url with curl(1).  If $required, any failure (curl error or an
# HTTP status >= 400) exits the program; otherwise failures return "".
# If $strip, the result is passed through strip() first.
sub curl($;$$)
{
    my $url      = shift;
    my $required = shift;
    my $strip    = shift;
    # SECURITY: list-form pipe open -- no shell is involved, so
    # metacharacters in $url cannot inject commands (the old backtick
    # version interpolated $url into a shell command line).
    open my $fh, '-|', 'curl', '-LisS', '--', $url
        or die "$0: cannot run curl: $!\n";
    my $data = do { local $/; <$fh> };  # slurp everything curl produced.
    $data = "" if !defined $data;
    close $fh;                          # sets $? to curl's exit status.
    exit 1 if $required && $?;
    # -i puts the response headers first; reject error statuses.
    if ($data =~ m/^HTTP\/([0-9.]+) (\d+) (.*)/ && $2 >= 400) {
        if ($required) {
            print STDERR "request for $url failed: $2 $3\n";
            exit 1;
        }
        else {
            $data = "";
        }
    }
    $data = strip($data) if $strip;
    return $data;
}

# Removes HTML comments and <script> blocks, so that commented-out or
# script-generated markup is not scraped.
sub strip($)
{
    my $html = shift;
    $html =~ s/<!--.*?-->//sg;
    $html =~ s{<script[^>]*>.*?</script>}{}isg;
    return $html;
}

### Sniffing tags out of HTML.

# Returns every start tag in the document.
sub tags($)
{
    return ( $_[0] =~ m{<[^/][^>]+>}sg );  # this skips end tags.
}

# Returns every <img> tag in the document.
sub imgs($)
{
    return ( $_[0] =~ m{<img[^>]*>}isg );
}

# Extracts href values from every <a>/<area> link in $html.  With
# $thumbnails_only set, keeps only links whose entire content is a single
# <img> -- and, unless -a, skips images 48 pixels or smaller on a side.
sub hrefs($;$)
{
    my $html = shift;
    my $thumbnails_only = shift;
    my @result;
    foreach ($html =~ m{(<(?:a|area)[^>]*>.*?</a>)}isg) {
        if ($thumbnails_only) {
            next if !m{^<[^>]*>\s*(<img[^>]*>)\s*$}is;
            my $img = $1;  # save before attribute() runs its own matches.
            if (!$opt_a) {
                # Don't follow tiny images -- they're probably navigation links.
                my $w = attribute($img, "width");
                next if $w && $w <= 48;
                my $h = attribute($img, "height");
                next if $h && $h <= 48;
            }
        }
        push @result, $1 if m{\bhref=["']?([^ "'>]*)"?}is;
    }
    return @result;
}

# Returns the value of attribute $_[1] within tag $_[0], or "".
sub attribute($$)
{
    # \Q...\E: treat the attribute name literally, never as regex syntax.
    return $_[0] =~ m/\b\Q$_[1]\E=["']?([^ >"']+)"?/i ? $1 : "";
}

### The top-level bits.

# Returns the absolute urls of all local <img src=...>s on page $url.
# $source is the page's HTML; if omitted, the page is fetched.  $required
# propagates to curl() (die on fetch failure).
sub images($$;$$)
{
    my $url      = shift;
    my $domain   = shift;
    my $required = shift;
    my $source   = shift;
    # BUGFIX: the recursive caller passes no source; without this fetch,
    # sub-page scanning silently produced nothing.
    $source = curl($url, $required) if !defined $source;
    $source = strip($source);
    print STDERR "$0: checking page $url\n" if ($opt_v);
    return map complete_url($url, $_),
           grep is_local($_, $domain),
           map attribute($_, 'src'),
           imgs $source;
}

getopts('ailmRuv');

my ($B, $data) = @ARGV;
die "usage: $0 [-ailmRuv] url [html]\n" if !defined $B;
my $D = domain($B);

if (!$opt_R) {
    # Images on this page only.
    $data = curl($B, 1, 1) if !$data;
    my @images = images($B, $D, 1, $data);
    @images = distinct @images if $opt_u;
    print "$_\n" foreach @images;
}
else {
    # Get linked images and images on sub-pages.
    $data = curl($B, 1) if !$data;
    if ($opt_m && $B =~ m|/homepage\.mac\.com/| && $data =~ /new Slide\(/) {
        # Special case for .Mac slideshow pages.  DON'T strip comments
        # first; .Mac puts the script inside an HTML comment.
        my @urls = $data =~ /new Slide\('([^'\\]*(?:\\.[^'\\]*)*)'/g;
        s/\\(.)/$1/g foreach @urls;  # undo JavaScript backslash escapes.
        print "$_\n" foreach @urls;
    }
    else {
        # Just a regular page...
        $data = strip($data);
        my @images = map complete_url($B, $_),
                     grep is_local($_, $D),
                     grep is_image($_),
                     hrefs($data);
        if (@images) {
            # If we find any direct image links, we're done.
            print "$_\n" foreach $opt_u ? unique @images : @images;
        }
        else {
            # Otherwise, scan for images on sub-pages.
            my @pages = map complete_url($B, $_),
                        grep is_html($_),
                        hrefs($data, $opt_i);
            my @subimages;
            push @subimages, images($_, $D)
                foreach (grep is_local($_, $D), unique @pages);
            print "$_\n" foreach $opt_u ? distinct @subimages : @subimages;
        }
    }
}