#!/usr/local/bin/perl
#
# Checklinks 1.0.1
#
# Starting at one or more seed HTML files, recursively check the
# validity of all links on the site. Major features:
#
# * Local URLs are read from the filesystem when possible (much
# faster than going through HTTP server).
# * Basic server-side includes (aka SSI or SHTML) are checked.
# * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
# according to RFC 2396.
# * Links are traversed breadth-first.
#
# To list command-line options, run "cl -?" or see &usage() below.
#
# TO CONFIGURE:
#
# 1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below. If you don't, the
# program will try to guess them in set_needed_globals(), but it's more
# reliable if you enter them here.
#
# 2) If needed, set any further server configuration below-- things like
# path aliases and so forth. If you have the srm.conf file, you can
# feed it into this script with "-c srm.conf"; otherwise, the default
# settings will probably work OK.
#
# You can set a few parameters with the undocumented "-D <name=value>"
# command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
#
# Further comments, including an overview of script internals, are at
# the end of this file.
#
# Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
# see http://www.jmarshall.com/tools/cl/ for more info
#
#
# CHANGES IN 1.0.1:
#
# This is just a bug fix release. Fixes include:
# . Aliases are handled correctly now. Sorry 'bout that.
# . A redirect + relative URL no longer results in infinitely
# recursing URLs.
# . More HTML tags are searched for links.
# . Non-HTML files are no longer searched for links.
# . There were other minor bug fixes.
#
#----------------------------------------------------------------------
#use strict ;
# Script-wide lexical variables. Everything declared here starts out
# undefined or empty; values are assigned later in the script.

# Host name and document root (see "User Configuration" below).
my $LOCAL_HOST ;
my $DOCUMENT_ROOT ;

# Settings with srm.conf counterparts, plus the alias tables.
my ( $USER_DIR, @DIRECTORY_INDEX ) ;
my ( %ALIAS, %ALIAS_MATCH, %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH, %UN_ALIAS ) ;

# Extension lists and include/exclude filters.
my ( @SHTML_EXTENSIONS, @CGI_EXTENSIONS ) ;
my ( @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS ) ;
my ( @INCLUDE_STATUS, @EXCLUDE_STATUS ) ;

# Scalar options and limits.
my ( $verbose_report, $max_depth, $file_check, $full_http_check ) ;
my ( $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG ) ;

# Remaining script-wide state.
my ( @NO_PROXY, $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD ) ;
my ( %html_urls, %non_html_urls, %e_to_ch ) ;
my ( %home_dir, %dir_to_user, %inode_to_user ) ;
my ( %url, @urlstoget ) ;
my ( $debug, $CL_VERSION ) ;
#----- User Configuration ---------------------------------------------
# This should be 'localhost', or a hostname of the Web server. URLs at
# this host will be assumed to be local; URLs not at this host will not be
# traversed into. If this names a remote host, the program will not work.
# Note that 'localhost' doesn't necessarily point to your local Web server.
# $LOCAL_HOST= 'localhost' ;
# $LOCAL_HOST= 'www.example.com' ;
# This is your root Web directory, i.e. the directory the Web server
# serves from when the URL "http://$LOCAL_HOST" is requested. It's set in
# the configuration file srm.conf (and is read by the -c option).
# If you don't know the document root of your server, but you don't need
# it because you're only checking URLs whose path starts with ~, put a
# non-existent path here rather than leave it blank (a hack).
# $DOCUMENT_ROOT= '/home/www/htdocs' ;
#----- variables equivalent to srm.conf entries
# These globals are from the equivalent entries in srm.conf, etc.
# See the command-line option -c <config-file>, to read values directly
# from srm.conf instead.
# Defaults used when no config file is read with -c (presumably the
# counterparts of srm.conf's UserDir and DirectoryIndex directives).
$USER_DIR = 'public_html' ;
@DIRECTORY_INDEX = ( 'index.html', 'index.cgi', 'index.shtml' ) ;
# Used in &url_to_filename(), and possibly elsewhere
# Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps.
# If the order of multiple directives (e.g. several "Alias" lines) matters, this may not work.
=1= |