PROXY  WHOIS  RQUOTE  TEXTS  SOFT  FOREX  BBOARD
 Music  Philosophy  Code  Literature  Russian

= ROOT|Technical|Code_Examples|Perl|Proxy|cl-1.0.1.pl =

page 6 of 22




    # This is a translation from entity character references to characters, 
    #   used in &HTMLunescape().
    # This simplified version only supports " < > &, but that
    #   should be enough for URL-type attributes.
    # See http://www.w3.org/TR/REC-html40/sgml/entities.html for full entity 
    #   list.

    %e_to_ch= (quot => '"', 
               'lt' => '<', 
               'gt' => '>', 
               amp  => '&') ;

}


#----------------------------------------------------------------------

# Add the URL to our data structures; specifically, to %url and @urlstoget.
# Returns a pointer to the structure in %url, or undef if already defined
#   or on error.
# Currently, this always receives the URL with the host name lowercase,
#   either from &absolute_url() or from using $LOCAL_HOST.
sub add_url {
    my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;

    # Allow the user to restrict URL patterns:  URLs must be in 
    #   @INCLUDE_PATTERNS but not in @EXCLUDE_PATTERNS (but only restrict 
    #   by @INCLUDE_PATTERNS if it's not empty).
    return undef if @INCLUDE_PATTERNS &&
                    !grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
    return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;

    # Canonicalize URL, so we don't get a page multiple times
    $URL= &canonicalize($URL) ;
    
    # for obscure case involving a <form action=___.cgi>-extracted URL being
    #   overwritten by <a href=___.cgi> extraction (don't fret over this)
    $url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;

    # Don't add the record a second time!  Or will infinitely traverse.
    return undef if $url{$URL} ;  # or add to @referers, for 301 correction...?

    # Only HTTP URLs are currently supported
    return undef unless $URL=~ /^http:/i ;

    # Any self-referral here indicates a bug in the program.  It's happened.
    die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
        if $referer eq $URL ;

    my(%u) ;
    @u{qw(URL referer depth ishtml iscgi dontfollow)}= 
        ($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
    $u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ; # make length>0
    if ($u{'islocal'}) {
#        $u{'filename'}= &url_to_filename($URL) ;
        @u{'filename', 'location'}= &url_to_filename($URL) ;
        $u{'iscgi'}= &is_cgi($u{'filename'}, $URL)  if $u{'iscgi'} eq '' ;

	# 2-27-00 JSM:  Detect ishtml by filename, not -T test.
	if ( $u{'ishtml'} eq '' ) {
	    $u{'ishtml'}=  $HTML_BY_NAME
		?  ( !$u{'iscgi'} && -e $u{'filename'} && 
		     $u{'filename'}=~ /\.html?$/i ) + 0
		:  (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
	}
#        $u{'ishtml'}= (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0
#            unless length($u{'ishtml'}) ;

    }

    #  If we're only doing a file check, don't add URLs that require HTTP
    return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;

    push(@urlstoget, \%u) ;
    $url{$URL}= \%u ;

    # return \%u ;   # unneeded because of previous statement
}


# Guess if a file is a CGI script or not.  Returns true if the (regular) file
#   is executable, has one of @CGI_EXTENSIONS, or if the URL is in a 
#   ScriptAlias'ed directory.
# $fname must be absolute path, but $URL is optional (saves time if available).
# Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
#   previously extracted filename is tested for CGI-ness, while the URL is
#   checked for ScriptAlias matching (which is unaffected by final query
#   strings or PATH_INFO).
sub is_cgi {
    my($fname, $URL)= @_ ;
    return 1 if (-x $fname && ! -d _ ) ;   # should we really do this?
    foreach (@CGI_EXTENSIONS) { return 1 if $fname=~ /\Q$_\E$/i }

    $URL= &filename_to_url($fname) unless length($URL) ;  # currently unused
    my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
    foreach (keys %SCRIPT_ALIAS)        { return 1 if $URLpath=~ /^\Q$_\E/ }
    foreach (keys %SCRIPT_ALIAS_MATCH)  { return 1 if $URLpath=~ /^$_/ }

    return 0 ;
=6=

1|2|3|4|5| < PREV = PAGE 6 = NEXT > |7|8|9|10|11|12|13|14|15.22

UP TO ROOT | UP TO DIR | TO FIRST PAGE

Google
 


E-mail Facebook Google Digg del.icio.us BlinkList Fark Furl Ma.gnolia Netscape NewsVine Reddit Slashdot Spurl StumbleUpon Technorati YahooMyWeb LiveJournal Blogmarks TwitThis Live News2.ru BobrDobr.ru Memori.ru MoeMesto.ru

0.0119381 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)