PROXY  WHOIS  RQUOTE  TEXTS  SOFT  FOREX  BBOARD
 Music  Philosophy  Code  Literature  Russian

= ROOT|Technical|Code_Examples|Perl|site_perl|LWP|RobotUA.pm =

page 2 of 4



    my($self, $netloc) = @_;
    return undef unless defined $netloc;
    my $last = $self->{'rules'}->last_visit($netloc);
    if ($last) {
	my $wait = int($self->{'delay'} * 60 - (time - $last));
	$wait = 0 if $wait < 0;
	return $wait;
    }
    return 0;
}


sub simple_request
{
    my($self, $request, $arg, $size) = @_;

    LWP::Debug::trace('()');

    # Do we try to access a new server?
    my $allowed = $self->{'rules'}->allowed($request->url);

    if ($allowed < 0) {
	LWP::Debug::debug("Host is not visited before, or robots.txt expired.");
	# fetch "robots.txt"
	my $robot_url = $request->url->clone;
	$robot_url->path("robots.txt");
	$robot_url->query(undef);
	LWP::Debug::debug("Requesting $robot_url");

	# make access to robot.txt legal since this will be a recursive call
	$self->{'rules'}->parse($robot_url, ""); 

	my $robot_req = new HTTP::Request 'GET', $robot_url;
	my $robot_res = $self->request($robot_req);
	my $fresh_until = $robot_res->fresh_until;
	if ($robot_res->is_success) {
	    my $c = $robot_res->content;
	    if ($robot_res->content_type =~ m,^text/, && $c =~ /^\s*Disallow\s*:/mi) {
		LWP::Debug::debug("Parsing robot rules");
		$self->{'rules'}->parse($robot_url, $c, $fresh_until);
	    }
	    else {
		LWP::Debug::debug("Ignoring robots.txt");
		$self->{'rules'}->parse($robot_url, "", $fresh_until);
	    }

	}
	else {
	    LWP::Debug::debug("No robots.txt file found");
	    $self->{'rules'}->parse($robot_url, "", $fresh_until);
	}

	# recalculate allowed...
	$allowed = $self->{'rules'}->allowed($request->url);
    }

    # Check rules
    unless ($allowed) {
	my $res = new HTTP::Response
	  &HTTP::Status::RC_FORBIDDEN, 'Forbidden by robots.txt';
	$res->request( $request ); # bind it to that request
	return $res;
    }

    my $netloc = eval { local $SIG{__DIE__}; $request->url->host_port; };
    my $wait = $self->host_wait($netloc);

    if ($wait) {
	LWP::Debug::debug("Must wait $wait seconds");
	if ($self->{'use_sleep'}) {
	    sleep($wait)
	}
	else {
	    my $res = new HTTP::Response
	      &HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down';
	    $res->header('Retry-After', time2str(time + $wait));
	    $res->request( $request ); # bind it to that request
	    return $res;
	}
    }

    # Perform the request
    my $res = $self->SUPER::simple_request($request, $arg, $size);

    $self->{'rules'}->visit($netloc);

    $res;
}


sub as_string
{
    my $self = shift;
    my @s;
    push(@s, "Robot: $self->{'agent'} operated by $self->{'from'}  [$self]");
    push(@s, "    Minimum delay: " . int($self->{'delay'}*60) . "s");
    push(@s, "    Will sleep if too early") if $self->{'use_sleep'};
    push(@s, "    Rules = $self->{'rules'}");
    join("\n", @s, '');
}
=2=

1| < PREV = PAGE 2 = NEXT > |3|4

UP TO ROOT | UP TO DIR | TO FIRST PAGE

Google
 


E-mail Facebook Google Digg del.icio.us BlinkList Fark Furl Ma.gnolia Netscape NewsVine Reddit Slashdot Spurl StumbleUpon Technorati YahooMyWeb LiveJournal Blogmarks TwitThis Live News2.ru BobrDobr.ru Memori.ru MoeMesto.ru

0.0210631 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)