while ( ($user, $homedir)= each %home_dir ) {
($dev,$inode)= stat("$homedir/.") ;
$inode_to_user{$dev}{$inode}= $user
unless $inode_to_user{$dev}{$inode} ne '' ;
}
}
#----- Extracting URLs from HTML ------------------------------------
# Parse an SGML tag, and return a hash structure with a "name" scalar and
# an "attrib" hash.
# Parses first tag in string, ignoring all surrounding text.
# Results are canonicalized to lower case wherever case-insensitive.
sub parse_tag {
my($tag)= @_ ; # will be mangled
my($tagname,%attrib) ;
# only parse first tag in string
($tag)= split(/>/, $tag) ; # remove all after >
$tag=~ s/^([^<]*<)?\s*// ; # remove pre-<, <, and leading blanks
($tagname,$tag)= split(/\s+/, $tag, 2) ; # split out tag name
# Extract name/value (possibly quoted), lowercase name, set $attrib{}.
# If quoted, is delimited by quotes; if not, delimited by whitespace.
$attrib{lc($1)}= &HTMLunescape($+)
while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;
# now, get remaining non-valued (boolean) attributes
$tag=~ s/^\s*|\s*$//g ; # skip leading/trailing blanks
foreach (split(/\s+/, $tag)) {
$_= lc($_) ;
$attrib{$_}= $_ ; # booleans have values equal to their name
}
return { 'name' => lc($tagname),
'attrib' => \%attrib } ;
}
# Unescape any HTML character references and return resulting string.
# Support entity character references in %e_to_ch (which is incomplete),
# plus "$#ddd;" and "Ý" forms for values<256.
# Note that not decoding a valid character is erroneous, in that a
# subsequent re-escaping will not return the original string, because
# of the ampersand. Nonetheless, that's preferable to losing the data.
# Q: Is there an appropriate general way to represent an unescaped string?
sub HTMLunescape {
my($s)= @_ ;
# Try alpha, decimal, and hex representations, only substituting if valid
$s=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
length($2) ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2} : "&$1" )
: length($3) ? ( $3 < 256 ? chr($3) : "&$1" )
: length($4) ? ( hex($4) < 256 ? chr(hex($4)) : "&$1" )
: "&$1"
/ge ;
return $s ;
}
# Given a block of HTML, extracts all URLs referenced in it, and adds them
# to our data structures to be downloaded or checked (i.e. calls
# &add_url()).
# Note that %html_urls and %non_html_urls are set at the start of the
# program for efficiency, but are an integral part of this routine.
# Currently, this extracts all <.*?> patterns, which may not be valid if
# "<" or ">" characters are e.g. inside a <script> element.
sub extract_urls {
my($HTML, $baseurl, $referer, $depth)= @_ ;
my(@tags) ;
# Remove comments before extracting links, as pointed out by Tim Hunter.
$HTML=~ s/<!--.*?--.*?>//gs ;
# We must look for <base> tag before all the work, so we must parse
# all tags first. :( Therefore, we save this large array of
# structures for efficiency, hoping we don't run out of memory.
my($i)= -1 ; # to start at array element 0
foreach ($HTML=~ /(<.*?>)/gs) {
$tags[++$i]= &parse_tag($_) ;
$baseurl= $tags[$i]{'attrib'}{'href'}
if ($tags[$i]{'name'} eq 'base')
and (length($tags[$i]{'attrib'}{'href'})) ;
}
# For each tag, call &add_url() for each URL in the tag
foreach my $tag (@tags) {
next if $tag->{'name'}=~ m#^/# ;
# Handle the "regular" tag-attributes, in %html_urls and %non_html_urls
foreach (@{$html_urls{$tag->{'name'}}}) {
&add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl),
$referer, $depth)
if length($tag->{'attrib'}{$_}) ;
=16= |