}
# Put the URL in such a form that two URLs that point to the same resource
# have the same canonical form, to avoid superfluous retrievals.
# Host name is lowercased elsewhere-- this routine is only called from
# &add_url; see note there. To lowercase the host name here would be
# inefficient.
sub canonicalize {
    # Takes one URL string and returns a copy with any "#fragment" removed.
    # The caller's variable is not modified (we work on a private copy).
    my($URL)= @_ ;

    # Strip everything from the first "#" to the end of the string.
    # The /s modifier lets "." match newlines too, so a fragment that
    # contains an embedded line break (e.g. from an href attribute that was
    # wrapped across lines) is removed completely instead of only up to the
    # first newline.
    $URL=~ s/#.*//s ;

    return $URL ;
}
#----- File reading/downloading routines (includes networking) --------
# Verify that a URL exists, and set $url->{'status'} accordingly. Do
# this either by checking the local filesystem or by using the HTTP HEAD
# method for remote sites or CGI scripts.
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
# This does not support various Redirect directives in srm.conf.
sub verify_url {
    my($url)= @_ ;

    # $url is a hash reference.  This routine reads its URL, islocal, iscgi,
    # dontfollow and filename entries and writes its status entry.
    print STDERR "verifying $url->{'URL'}\n" if $debug ;

    # The check performed depends on islocal/iscgi/dontfollow and on the
    # global $full_http_check:
    #
    #   remote URL, or $full_http_check set  -> HTTP HEAD request
    #   local CGI, dontfollow not set        -> HTTP HEAD request
    #   local CGI, dontfollow set            -> existence + executable test
    #   local non-CGI file                   -> existence test
    #
    # Caveat: when a CGI script named in a <form action> (which implies
    # dontfollow) is checked over HTTP, no form data accompanies the
    # request, so the response code may not reflect a real submission.
    # A cautionary note is appended to the status in that case.

    if ($full_http_check or !$url->{'islocal'}) {
        # Remote resource, or HTTP checking forced for everything.
        &load_url_using_HTTP($url, 'HEAD') ;

        # Replace an empty status with an explanatory placeholder.
        unless (length($url->{'status'})) {
            $url->{'status'}= '[no status returned]' ;
        }

        # Flag results that came from an un-submitted form action.
        if ($url->{'dontfollow'}) {
            $url->{'status'}.= ' (NOTE: Form was not submitted normally)' ;
        }

    } elsif ($url->{'iscgi'}) {
        # Local CGI script.
        if ($url->{'dontfollow'}) {
            # Don't invoke it; just confirm an executable file is there.
            my $script= $url->{'filename'} ;
            if (! -e $script) {
                $url->{'status'}= "404 File Not Found" ;
            } elsif (! -x $script) {
                $url->{'status'}= "403 Local File Is Not Executable" ;
            } else {
                $url->{'status'}= "200 Local Executable File Exists" ;
            }
        } else {
            # OK to run it: ask the server for its headers.
            &load_url_using_HTTP($url, 'HEAD') ;
        }

    } else {
        # Plain local file: a filesystem existence test is enough.
        my $found= -e $url->{'filename'} ;
        $url->{'status'}= $found ? "200 Local File Exists"
                                 : "404 File Not Found" ;
    }

    # Historical note: an earlier version of this routine had only two
    # cases -- a local non-CGI file (with $full_http_check unset) was
    # checked with -e, and everything else was checked with
    # &load_url_using_HTTP($url, 'HEAD').
}
# Load entire file/resource and return its contents, setting $url->{'status'}
# accordingly. Do this either by reading the local filesystem or by
# using the HTTP GET method for remote sites or CGI scripts.
# Set $url->{'ishtml'} accordingly if discovered from Content-Type:.
# This does not support various Redirect directives in srm.conf.
sub load_url {
my($url)= @_ ;
my($HTML) ;
print STDERR "loading $url->{'URL'}\n" if $debug ;
# If is a local non-CGI file, read it directly from the filesystem
=7= |