# This is a translation from entity character references to characters,
# used in &HTMLunescape().
# This simplified version only supports " < > &, but that
# should be enough for URL-type attributes.
# See http://www.w3.org/TR/REC-html40/sgml/entities.html for full entity
# list.
%e_to_ch= (quot => '"',
'lt' => '<',
'gt' => '>',
amp => '&') ;
}
#----------------------------------------------------------------------
# Add the URL to our data structures; specifically, to %url and @urlstoget.
# Returns a pointer to the structure in %url, or undef if already defined
# or on error.
# Currently, this always receives the URL with the host name lowercase,
# either from &absolute_url() or from using $LOCAL_HOST.
sub add_url {
my($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow)= @_ ;
# Allow the user to restrict URL patterns: URLs must be in
# @INCLUDE_PATTERNS but not in @EXCLUDE_PATTERNS (but only restrict
# by @INCLUDE_PATTERNS if it's not empty).
return undef if @INCLUDE_PATTERNS &&
!grep( $URL=~ /$_/, @INCLUDE_PATTERNS ) ;
return undef if grep( $URL=~ /$_/, @EXCLUDE_PATTERNS ) ;
# Canonicalize URL, so we don't get a page multiple times
$URL= &canonicalize($URL) ;
# for obscure case involving a <form action=___.cgi>-extracted URL being
# overwritten by <a href=___.cgi> extraction (don't fret over this)
$url{$URL}{'dontfollow'}&&= $dontfollow if $url{$URL} ;
# Don't add the record a second time! Or will infinitely traverse.
return undef if $url{$URL} ; # or add to @referers, for 301 correction...?
# Only HTTP URLs are currently supported
return undef unless $URL=~ /^http:/i ;
# Any self-referral here indicates a bug in the program. It's happened.
die "PROGRAM ERROR: $URL shows its first referer as itself.\n"
if $referer eq $URL ;
my(%u) ;
@u{qw(URL referer depth ishtml iscgi dontfollow)}=
($URL, $referer, $depth, $ishtml, $iscgi, $dontfollow) ;
$u{'islocal'}= ($URL=~ m#^http://\Q$LOCAL_HOST\E/#io) + 0 ; # make length>0
if ($u{'islocal'}) {
# $u{'filename'}= &url_to_filename($URL) ;
@u{'filename', 'location'}= &url_to_filename($URL) ;
$u{'iscgi'}= &is_cgi($u{'filename'}, $URL) if $u{'iscgi'} eq '' ;
# 2-27-00 JSM: Detect ishtml by filename, not -T test.
if ( $u{'ishtml'} eq '' ) {
$u{'ishtml'}= $HTML_BY_NAME
? ( !$u{'iscgi'} && -e $u{'filename'} &&
$u{'filename'}=~ /\.html?$/i ) + 0
: (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0 ;
}
# $u{'ishtml'}= (!$u{'iscgi'} && -e $u{'filename'} && -T _) + 0
# unless length($u{'ishtml'}) ;
}
# If we're only doing a file check, don't add URLs that require HTTP
return undef if ($file_check and (!$u{'islocal'} or $u{'iscgi'}) ) ;
push(@urlstoget, \%u) ;
$url{$URL}= \%u ;
# return \%u ; # unneeded because of previous statement
}
# Guess if a file is a CGI script or not. Returns true if the (regular) file
# is executable, has one of @CGI_EXTENSIONS, or if the URL is in a
# ScriptAlias'ed directory.
# $fname must be absolute path, but $URL is optional (saves time if available).
# Note that URLs like "/path/script.cgi?a=b" are handled correctly-- the
# previously extracted filename is tested for CGI-ness, while the URL is
# checked for ScriptAlias matching (which is unaffected by final query
# strings or PATH_INFO).
sub is_cgi {
my($fname, $URL)= @_ ;
return 1 if (-x $fname && ! -d _ ) ; # should we really do this?
foreach (@CGI_EXTENSIONS) { return 1 if $fname=~ /\Q$_\E$/i }
$URL= &filename_to_url($fname) unless length($URL) ; # currently unused
my($URLpath)= $URL=~ m#^http://[^/]*(.*)#i ;
foreach (keys %SCRIPT_ALIAS) { return 1 if $URLpath=~ /^\Q$_\E/ }
foreach (keys %SCRIPT_ALIAS_MATCH) { return 1 if $URLpath=~ /^$_/ }
return 0 ;
=6= |