PROXY  WHOIS  RQUOTE  TEXTS  SOFT  FOREX  BBOARD
 Music  Philosophy  Code  Literature  Russian

= ROOT|Technical|Code_Examples|Perl|Proxy|cl-1.0.1.pl =

page 1 of 22



#!/usr/local/bin/perl
#
#   Checklinks 1.0.1
#
#     Starting at one or more seed HTML files, recursively check the
#     validity of all links on the site.  Major features:
#
#       * Local URLs are read from the filesystem when possible (much
#           faster than going through the HTTP server).
#       * Basic server-side includes (aka SSI or SHTML) are checked.
#       * Latest standards are supported-- HTML 4.0, HTTP 1.1, URIs
#           according to RFC 2396.
#       * Links are traversed breadth-first.
#
#   To list command-line options, run "cl -?" or see &usage() below.
#
#   TO CONFIGURE: 
#
#   1) Set $LOCAL_HOST and $DOCUMENT_ROOT, just below.  If you don't, the
#      program will try to guess them in set_needed_globals(), but it's more
#      reliable if you enter them here.
#
#   2) If needed, set any further server configuration below-- things like
#      path aliases and so forth.  If you have the srm.conf file, you can 
#      feed it into this script with "-c srm.conf"; otherwise, the default 
#      settings will probably work OK.
#
#   You can set a few parameters with the undocumented "-D <name=value>"
#   command-line option, e.g. "-D LOCAL_HOST=www.myhost.com".
#
#   Further comments, including an overview of script internals, are at
#   the end of this file.
#
#   Copyright (C) 1998, 2000 by James Marshall, james@jmarshall.com
#   see http://www.jmarshall.com/tools/cl/ for more info
#
#
#   CHANGES IN 1.0.1:
#
#     This is just a bug fix release.  Fixes include:
#       . Aliases are handled correctly now.  Sorry 'bout that.
#       . A redirect + relative URL no longer results in infinitely
#           recursing URLs.
#       . More HTML tags are searched for links.
#       . Non-HTML files are no longer searched for links.
#       . There were other minor bug fixes.
#
#----------------------------------------------------------------------

#use strict ;

# Lexicals shared by the whole script.  They are only declared here, not
#   initialized; values are assigned further down.
# NOTE(review): the grouping below simply follows the variable names; the
#   purpose of most of them is not shown in this part of the file.

# Server host and filesystem layout (see "User Configuration" below).
my( $LOCAL_HOST, $DOCUMENT_ROOT ) ;
my( $USER_DIR, @DIRECTORY_INDEX ) ;

# Alias tables.
my( %ALIAS, %ALIAS_MATCH ) ;
my( %SCRIPT_ALIAS, %SCRIPT_ALIAS_MATCH ) ;
my( %UN_ALIAS ) ;

# Extension lists and include/exclude filters.
my( @SHTML_EXTENSIONS, @CGI_EXTENSIONS ) ;
my( @INCLUDE_PATTERNS, @EXCLUDE_PATTERNS ) ;
my( @INCLUDE_STATUS, @EXCLUDE_STATUS ) ;

# Option flags and limits.
my( $verbose_report, $max_depth, $file_check, $full_http_check ) ;
my( $MAX_REDIRECTS, $MAX_ATTEMPTS, $HTML_BY_NAME, $SUPPORT_NCSA_BUG ) ;

# Remaining script-wide state.
my( @NO_PROXY ) ;
my( $DOC_ROOT_DEV, $DOC_ROOT_INODE, $DOC_ROOT_EXISTS, $CWD ) ;
my( %html_urls, %non_html_urls, %e_to_ch ) ;

my( %home_dir, %dir_to_user, %inode_to_user ) ;

my( %url, @urlstoget ) ;

my( $debug, $CL_VERSION ) ;


#----- User Configuration ---------------------------------------------

# This should be 'localhost', or a hostname of the Web server.  URLs at
#   this host will be assumed to be local; URLs not at this host will not be
#   traversed into. If this names a remote host, the program will not work.
# Note that 'localhost' doesn't necessarily point to your local Web server.

# $LOCAL_HOST= 'localhost' ;
# $LOCAL_HOST= 'www.example.com' ;

# This is your root Web directory, i.e. the directory that the Web server
#   sends the user if the URL "http://$LOCAL_HOST" is requested.  It's in
#   the configuration file srm.conf (and is read by -c option).
# If you don't know the document root of your server, but you don't need
#   it because you're only checking URLs whose path starts with ~, put a
#   non-existent path here rather than leave it blank (a hack).

# $DOCUMENT_ROOT= '/home/www/htdocs' ;


#----- variables equivalent to srm.conf entries 

# These globals are from the equivalent entries in srm.conf, etc.
# See the command-line option -c <config-file>, to read values directly 
#   from srm.conf instead.

# Built-in defaults for two srm.conf-equivalent settings; the -c option
#   can read the values from a config file instead.
@DIRECTORY_INDEX = ( 'index.html', 'index.cgi', 'index.shtml' ) ;
$USER_DIR        = 'public_html' ;

# Used in &url_to_filename(), and possibly elsewhere
# Note that ALIAS_MATCH and SCRIPT_ALIAS_MATCH use Perl (not standard) regexps.
# If the order of multiple directives (e.g. several "Alias" lines) is
#   important, this may not work.
=1=

= PAGE 1 = NEXT > |2|3|4|5|6|7|8|9|10.22

UP TO ROOT | UP TO DIR

Google
 


E-mail Facebook Google Digg del.icio.us BlinkList Fark Furl Ma.gnolia Netscape NewsVine Reddit Slashdot Spurl StumbleUpon Technorati YahooMyWeb LiveJournal Blogmarks TwitThis Live News2.ru BobrDobr.ru Memori.ru MoeMesto.ru

0.0407569 wallclock secs ( 0.01 usr + 0.00 sys = 0.01 CPU)