PROXY  WHOIS  RQUOTE  TEXTS  SOFT  FOREX  BBOARD
 Music  Philosophy  Code  Literature  Russian

= ROOT|Technical|Proxy_Docs|_Perl_code|cl-1.0.1.pl =

page 16 of 22



    while ( ($user, $homedir)= each %home_dir ) {
        ($dev,$inode)= stat("$homedir/.") ;
        $inode_to_user{$dev}{$inode}= $user 
            unless $inode_to_user{$dev}{$inode} ne '' ;
    }
}



#----- Extracting URLs from HTML ------------------------------------


# Parse an SGML tag, and return a hash structure with a "name" scalar and
#   an "attrib" hash.
# Parses first tag in string, ignoring all surrounding text.
# Results are canonicalized to lower case wherever case-insensitive.
sub parse_tag {
    my($tag)= @_ ;       # will be mangled
    my($tagname,%attrib) ;

    # only parse first tag in string
    ($tag)= split(/>/, $tag) ;  # remove all after >
    $tag=~ s/^([^<]*<)?\s*// ;  # remove pre-<, <, and leading blanks
    ($tagname,$tag)= split(/\s+/, $tag, 2) ;        # split out tag name

    # Extract name/value (possibly quoted), lowercase name, set $attrib{}.
    # If quoted, is delimited by quotes; if not, delimited by whitespace.
    $attrib{lc($1)}= &HTMLunescape($+)
        while ($tag=~ s/\s*(\w+)\s*=\s*(([^"']\S*)|"([^"]*)"?|'([^']*)'?)//) ;

    # now, get remaining non-valued (boolean) attributes
    $tag=~ s/^\s*|\s*$//g ;             # skip leading/trailing blanks
    foreach (split(/\s+/, $tag)) {
        $_= lc($_) ;
        $attrib{$_}= $_ ;   # booleans have values equal to their name
    }

    return { 'name'   => lc($tagname), 
             'attrib' => \%attrib       } ;
}


# Unescape any HTML character references and return resulting string.
# Support entity character references in %e_to_ch (which is incomplete),
#   plus "$#ddd;" and "Ý" forms for values<256.
# Note that not decoding a valid character is erroneous, in that a
#   subsequent re-escaping will not return the original string, because
#   of the ampersand.  Nonetheless, that's preferable to losing the data.
#   Q:  Is there an appropriate general way to represent an unescaped string?
sub HTMLunescape {
    my($s)= @_ ;

    # Try alpha, decimal, and hex representations, only substituting if valid
    $s=~ s/&(([a-zA-Z][a-zA-Z0-9.-]*);?|#([0-9]+);?|#[Xx]([0-9a-fA-F]+);?)/
              length($2)   ? ( defined($e_to_ch{$2}) ? $e_to_ch{$2}  : "&$1" )
            : length($3)   ? ( $3 < 256              ? chr($3)       : "&$1" )
            : length($4)   ? ( hex($4) < 256         ? chr(hex($4))  : "&$1" )
                           :   "&$1"
          /ge ;

    return $s ;
}


# Given a block of HTML, extracts all URLs referenced in it, and adds them
#   to our data structures to be downloaded or checked (i.e. calls 
#   &add_url()).
# Note that %html_urls and %non_html_urls are set at the start of the
#   program for efficiency, but are an integral part of this routine.
# Currently, this extracts all <.*?> patterns, which may not be valid if
#   "<" or ">" characters are e.g. inside a <script> element.
sub extract_urls {
    my($HTML, $baseurl, $referer, $depth)= @_ ;
    my(@tags) ;

    # Remove comments before extracting links, as pointed out by Tim Hunter.
    $HTML=~ s/<!--.*?--.*?>//gs ;

    # We must look for <base> tag before all the work, so we must parse
    #   all tags first.  :(  Therefore, we save this large array of
    #   structures for efficiency, hoping we don't run out of memory.
    my($i)= -1 ;  # to start at array element 0

    foreach ($HTML=~ /(<.*?>)/gs) {
        $tags[++$i]= &parse_tag($_) ;
        $baseurl= $tags[$i]{'attrib'}{'href'}
            if ($tags[$i]{'name'} eq 'base') 
                and (length($tags[$i]{'attrib'}{'href'})) ;
    }

    # For each tag, call &add_url() for each URL in the tag
    foreach my $tag (@tags) {
        next if $tag->{'name'}=~ m#^/# ;

        # Handle the "regular" tag-attributes, in %html_urls and %non_html_urls

        foreach (@{$html_urls{$tag->{'name'}}}) {
            &add_url(&absolute_url($tag->{'attrib'}{$_}, $baseurl), 
                     $referer, $depth) 
                if length($tag->{'attrib'}{$_}) ;
=16=

1.10|11|12|13|14|15| < PREV = PAGE 16 = NEXT > |17|18|19|20|21|22

UP TO ROOT | UP TO DIR | TO FIRST PAGE

Google
 


E-mail Facebook Google Digg del.icio.us BlinkList Fark Furl Ma.gnolia Netscape NewsVine Reddit Slashdot Spurl StumbleUpon Technorati YahooMyWeb LiveJournal Blogmarks TwitThis Live News2.ru BobrDobr.ru Memori.ru MoeMesto.ru

0.013166 wallclock secs ( 0.01 usr + 0.00 sys = 0.01 CPU)