File: WWW-Spyder.pm
#=====================================================================
1 package WWW::Spyder;
#=====================================================================
2 use strict;
3 use warnings;
#---------------------------------------------------------------------
4 use HTML::Parser 3;
#---------------------------------------------------------------------
5 use LWP::UserAgent;
6 use HTTP::Cookies;
7 use URI::URL;
8 use HTML::Entities;
9 use POSIX; POSIX::nice(40);
#---------------------------------------------------------------------
10 use Digest::MD5 "md5_base64"; # for making seen content key/index
#---------------------------------------------------------------------
11 use Carp;
12 our $VERSION = '0.18';
13 our $VERBOSITY ||= 0;
#=====================================================================
# METHODS
#=====================================================================
14 { # make it all a bit more private
15 my %_methods = (# these are methods & roots of our attribute names
16 UA => undef,
17 bell => undef,
18 html_parser => undef,
19 sleep_base => undef,
20 cookie_file => undef,
21 _exit_epoch => undef,
22 _term_count => undef,
23 );
# those may all get hardcoded eventually, but they're handy for now
#=====================================================================
24 sub new {
25 my ( $caller ) = shift;
26 my $class = ref($caller) || $caller;
27 my $ego = bless {}, $class;
28 my ( $seed, %arg );
29 if ( @_ == 1 ) {
30 ( $seed ) = @_;
31 }
32 %arg = ( sleep => undef,
33 exit_on => undef,
34 seed => undef,
35 sleep_base => 5,
36 );
37 %arg = ( %arg, @_ ) unless @_ % 2;
# install all our methods, either set once then get only or push/shift
# array refs
38 for my $method ( keys %_methods ) {
39 no strict "refs";
40 no warnings;
41 my $attribute = '_' . $method;
42 if ( ref $_methods{$method} eq 'ARRAY' ) {
43 *{"$class::$method"} = sub {
44 my($ego,@args) = @_;
45 return shift(@{$ego->{$attribute}}) unless @args;
46 push(@{$ego->{$attribute}}, @args);
47 };
48 } else {
49 *{"$class::$method"} = sub {
50 my($ego,$arg) = @_;
51 carp "You cannot reset $method!"
52 if $arg and exists $ego->{$attribute};
53 return $ego->{$attribute} #get if already set
54 if exists $ego->{$attribute};
55 $ego->{$attribute} = $arg; #only set one time!
56 };
57 }
58 }
59 $seed ||= $arg{seed};
60 $ego->seed($seed) if $seed;
61 $ego->sleep_base($arg{sleep_base});
62 $ego->_install_exit_check(\%arg) unless $ego->can('_exit_check');
63 $ego->_install_html_parser;
64 $ego->_install_web_agent;
65 return $ego;
66 }
#=====================================================================
67 sub terms {
68 my ($ego,@terms) = @_;
69 if ( @terms and not exists $ego->{_terms} ) {
70 $ego->_term_count(scalar @terms); # makes this set once op
71 my %terms;
72 $terms{$_} = qr/$_/ for @terms;
73 $ego->{_terms} = \%terms;
74 } else {
75 return $ego->{_terms}
76 }
77 }
#=====================================================================
78 sub show_attributes {
79 my ($ego) = @_;
80 return map {/^_(.+)$/} keys %{$ego};
81 }
#=====================================================================
82 sub slept {
83 my ($ego, $time) = @_;
84 $ego->{_Slept} += $time if $time;
85 return $ego->{_Slept} unless $time;
86 }
#=====================================================================
87 sub seed {
88 my ($ego, $url) = @_;
89 $url or croak "Must provide URL to seed().";
90 croak "You have passed something besides a plain URL to seed()!"
91 if ref $url;
92 $ego->stack_urls($url);
93 return 1; # to the top of the stacks
94 }
#=====================================================================
95 sub crawl {
96 my ($ego) = @_;
97 while ('I have pages to get...') {
98 $ego->_exit_check and return;
99 my $enQ = $ego->_choose_courteously ||
100 $ego->_just_choose ||
101 return;
102 my $url = $enQ->url;
103 $ego->url($url);
104 $ego->_current_enQ($enQ);
105 print "GET'ing: $url\n" if $VERBOSITY;
106 my $response = $ego->UA->request # no redirects &c is simple_
107 ( HTTP::Request->new( GET => "$url" ) );
108 print STDERR "\a" if $ego->bell;
109 $response or
110 carp "$url failed GET!" and next;
111 push @{$ego->{_courtesy_Queue}}, $enQ->domain;
112 shift @{$ego->{_courtesy_Queue}}
113 if $ego->{_courtesy_Queue}
114 and @{$ego->{_courtesy_Queue}} > 100;
115 my $head = $response->headers_as_string;
116 $head or
117 carp "$url has no HEAD!" and
118 next; # no headless webpages
119 length($head) > 1_024 and $head = substr($head,0,1_024);
120 print $head, "\n" if $VERBOSITY > 2;
121 my $base;
122 eval { $base = $response->base };
123 $base or
124 carp "$url has no discernible BASE!" and
125 next; # no baseless webpages
# WE SHOULD also look for <HTML> b/c some servers that we might want
# to look at don't properly report the content-type
# start over unless this is something we can read
126 lc($head) =~ /content-type:\s?(?:text|html)/ or
127 carp "$url doesn't look like TEXT or HTML!" and
128 next; # no weird media, movies, flash, etc
129 my ( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n,;
130 my ( $description ) = $head =~
131 /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i;
132 $description = $ego->_snip($description) if $description;
133 my $page = $response->content or
134 carp "Failed to fetch $url." and
135 next; # no empty pages, start over with next url
136 $ego->{_current_Bytes} = length($page);
137 $ego->spyder_data($ego->{_current_Bytes});
# we are going to use a digest to prevent parsing the identical
# content received via a different url
138 my $digest = md5_base64($page); # unique microtag of the page
# so if we've seen it before, start over with the next URL
139 $ego->{_page_Memory}{$digest}++ and
140 carp "Seen this page's content before: $url"
141 and next;
142 $ego->{_page_content} = $page;
143 print "PARSING: $url\n" if $VERBOSITY > 1;
144 $ego->{_spydered}{$url}++;
145 $ego->html_parser->parse($page);
146 $ego->html_parser->eof;
147 $ego->{_adjustment} = $ego->_parse_for_terms if $ego->terms;
# make links absolute and fix bad spacing in link names, then turn
# them into an Enqueue object
148 for my $pair ( @{$ego->{_enqueue_Objects}} ) {
149 my $url;
150 eval {
151 $url = URI::URL::url($pair->[0], $base)->abs;
152 };
153 my $name = _snip($pair->[1]);
154 my $item = WWW::Spyder::Enqueue->new("$url",$name);
155 $pair = $item;
156 }
# put links into the queue(s)
157 $ego->stack_urls() if $ego->_links;
# clean up text a bit. should this be here...?
158 if ( $ego->{_text} and ${$ego->{_text}} ) {
159 ${$ego->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
160 }
# in the future Page object should be installed like parsers as a
# reusable container
# return
161 my $Page =
162 WWW::Spyder::Page->new(
163 title => $title,
164 text => $ego->{_text},
165 raw => \$page,
166 url => $enQ->url,
167 domain => $enQ->domain,
168 link_name => undef,
169 link => undef,
170 description => $description || '',
171 pages_enQs => $ego->_enqueue,
172 );
173 $ego->_reset; #<<--clear out things that might remain
174 return $Page;
175 }
176 }
#=====================================================================
177 sub stack_urls { # should eventually be broken into stack and sift?
# dual purpose, w/ terms it filters as long as there are no urls
# passed, otherwise it's setting them to the top of the queues
178 my ($ego, @urls) = @_;
179 print "Stacking " . join(', ', @urls) . "\n"
180 if @urls and $VERBOSITY > 5;
181 if ( $ego->terms and not @urls ) {
182 no warnings;
183 my @Qs = $ego->_queues;
184 for my $enQ ( @{$ego->_enqueue} ) {
185 my ( $url, $name ) = ( $enQ->url, $enQ->name );
186 next if $ego->_seen($url);
187 my $match = 0;
188 while ( my ($term,$rx) = each %{$ego->terms} ) {
189 $match++ for $name =~ /$rx/g;
190 }
191 my $baseQ = 10;
192 my $adjustment = $ego->{_adjustment};
193 $baseQ -= $adjustment; # 4 to 0
194 push @{$ego->{$baseQ}}, $enQ
195 and next unless $match;
196 if ( $VERBOSITY > 1 ) {
197 print "NAME: $name\n";
198 printf " RATIO -->> %d\n", $match;
199 }
200 my $queue_index = sprintf "%d",
201 $ego->_term_count / $match;
202 $queue_index -= $adjustment;
203 $queue_index = 4 if $queue_index > 4;
204 $queue_index = 0 if $queue_index < 0;
205 my $queue = $Qs[$queue_index];
206 if ($VERBOSITY > 2) {
207 print "Q:$queue [$queue_index] match: $match terms:",
208 $ego->_term_count, " Adjust: $adjustment\n\n";
209 }
210 push @{$ego->{$queue}}, $enQ;
211 }
212 } elsif ( @urls > 0 ) {
213 for my $url ( @urls ) {
214 next if $ego->_seen($url);
215 my $queue = $ego->_queues;
216 carp "Placing $url in '$queue'\n" if $VERBOSITY > 2;
# unshift b/c seeding is priority
217 unshift @{$ego->{$queue}},
218 WWW::Spyder::Enqueue->new($url,undef);
219 }
220 } else {
221 for my $enQ ( @{$ego->_enqueue} ) {
222 my ( $url, $name ) = ( $enQ->url, $enQ->name );
223 next if $ego->_seen($url);
224 my $queue = $ego->_queues;
225 push @{$ego->{$queue}}, $enQ;
226 }
227 }
228 }
#=====================================================================
229 sub queue_count {
230 my ($ego) = @_;
231 my $count = 0;
232 for my $Q ( $ego->_queues ) {
233 next unless ref($ego->{$Q}) eq 'ARRAY';
234 $count += scalar @{$ego->{$Q}};
235 }
236 return $count;
237 }
#=====================================================================
238 sub spyder_time {
239 my ($ego,$raw) = @_;
240 my $time = time() - $^T;
241 return $time if $raw;
242 my $day = int( $time / 86400 );
243 my $hour = int( $time / 3600 ) % 24;
244 my $min = int( $time / 60 ) % 60;
245 my $sec = $time % 60;
# also collect slept time!
246 return sprintf "%d day%s %02d:%02d:%02d",
247 $day, $day == 1?'':'s', $hour, $min, $sec;
248 }
#=====================================================================
249 sub spyder_data {
250 my ($ego, $bytes) = @_;
251 $ego->{_bytes_GOT} += $bytes and return $bytes if $bytes;
252 return 0 unless $ego->{_bytes_GOT};
253 my $for_commas = int($ego->{_bytes_GOT} / 1_024);
254 for ( $for_commas ) {
255 1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/;
256 }
257 return $for_commas;
258 }
#=====================================================================
259 sub spydered {
260 my ($ego) = @_;
261 return wantarray ?
262 keys %{ $ego->{_spydered} } :
263 scalar keys %{ $ego->{_spydered} };
264 }
#=====================================================================
265 sub exclude { # what about FILES TYPES!?
266 return undef; # not working yet!
267 my ($ego,$thing) = @_;
268 if ( $thing =~ m<^[^:]{3,5}://> )
269 {
270 return $ego->{_Xklood}{_domain}{$thing}++;
271 }
272 elsif ( $thing )
273 {
274 return $ego->{_Xklood}{_name}{$thing}++;
275 }
276 }
#=====================================================================
277 sub excluded_domains {
278 return undef; # not working yet!
279 my ($ego) = @_;
280 return wantarray ?
281 keys %{$ego->{_Xklood}{_domain}} :
282 [ keys %{$ego->{_Xklood}{_domain}} ];
283 }
#=====================================================================
284 sub excluded_names {
285 return undef; # not working yet!
286 my ($ego) = @_;
287 return wantarray ?
288 keys %{$ego->{_Xklood}{_name}} :
289 [ keys %{$ego->{_Xklood}{_name}} ];
290 }
#=====================================================================
291 sub go_to_seed {
292 my ( $ego, $engine, $query ) = @_;
293 carp "go_to_seed() is not functional yet!\n";
294 return; # NOT FUNCTIONAL
295 my $seed = WWW::Spyder::Seed::get_seed($engine, $query);
296 $ego->seed($seed);
297 }
#=====================================================================
298 sub verbosity {
299 my ( $ego, $verbosity ) = @_;
300 carp "Not setting verbosity! Must be integer b/t 1 & 6!\n"
301 and return
302 unless $verbosity;
303 $VERBOSITY = $verbosity;
304 }
#=====================================================================
#=====================================================================
# PRIVATE Spyder Methods
#=====================================================================
305 sub _reset {
# RESET MORE THAN THIS!?! make sure all the memory space is clean that
# needs be for clean iteration???
306 my ($ego) = @_;
307 $ego->{$_} = undef for qw( _linkText _linkSwitch _href
308 _current_enQ _page_content
309 _current_Bytes _alt _enqueue_Objects
310 _text );
311 }
#=====================================================================
312 sub _current_enQ {
313 my ($ego, $enQ) = @_;
314 my $last_enQ = $ego->{_current_enQ};
315 $ego->{_current_enQ} = $enQ if $enQ;
316 return $last_enQ; #<<-so we can get last while setting a new one
317 }
#=====================================================================
318 sub _enqueue {
319 my ($ego,$enQ) = @_;
320 push @{$ego->{_enqueue_Objects}}, $enQ if $enQ;
321 return $ego->{_enqueue_Objects};
322 }
#=====================================================================
323 sub _links {
324 my ($ego) = @_;
325 return [ map { $_->url } @{$ego->_enqueue} ];
326 }
#=====================================================================
327 sub _seen {
328 my ($ego,$url) = @_;
329 return $ego->{_seenURLs}{$url}++;
330 }
#=====================================================================
331 sub _parse_for_terms {
332 my ($ego) = @_;
333 $ego->{_page_terms_matches} = 0;
334 return 0 unless $ego->{_text};
335 while ( my ($term,$rx) = each %{$ego->terms} ) {
336 $ego->{_page_terms_matches}++ for
337 $ego->{_page_content} =~ /$rx/g;
338 }
339 my $index = int( ( $ego->{_page_terms_matches} /
340 length($ego->{_text}) ) * 1_000 );
# the algorithm might look it but isn't entirely arbitrary
341 print " PARSE TERMS : $ego->{_page_terms_matches} " .
342 "/ $ego->{_current_Bytes}\n" if $VERBOSITY > 1;
343 return 7 if $index > 25;
344 return 6 if $index > 18;
345 return 5 if $index > 14;
346 return 4 if $index > 11;
347 return 3 if $index > 7;
348 return 2 if $index > 3;
349 return 1 if $index > 0;
350 return 0;
351 }
#=====================================================================
352 sub _install_html_parser {
353 my ($ego) = @_;
354 my $Parser = HTML::Parser->new
355 (
356 start_h =>
357 [sub {
358 no warnings;
359 my ( $tag, $attr ) = @_;
360 return if $tag !~ /^(?:a|img)$/;
# need to deal with AREA tags from maps /^(?:a(?:rea)?|img)$/;
361 $attr->{href} =~ s,#[^/]*$,,;
362 return if lc($attr->{href}) =~ m,^\s*mailto:,;
363 return if lc($attr->{href}) =~ m,^\s*file:,;
364 return if lc($attr->{href}) =~ m,javascript:,;
365 $ego->{_href} ||= $attr->{href};
366 $ego->{_alt} ||= $attr->{alt};
367 $ego->{_linkSwitch} = 1;
368 }, 'tagname, attr'],
369 text_h =>
370 [sub {
371 return unless(my $it = shift);
372 return if $it =~
373 m/(?:\Q<!--\E)|(?:\Q-->\E)/;
374 ${$ego->{_text}} .= $it;
375 $ego->{_linkText} .= $it
376 if $ego->{_linkSwitch};
377 }, 'dtext'],
378 end_h =>
379 [sub {
380 my ( $tag ) = @_;
381 no warnings; # only problem: <a><b>L</b>inks</a>
382 return unless $tag eq 'a' or $ego->{_linkSwitch};
383 $ego->{_linkText} ||= $ego->{_alt} || '+';
384 decode_entities($ego->{_linkText});
385 push @{$ego->{_enqueue_Objects}},
386 [ $ego->{_href}, $ego->{_linkText} ];
387 # reset all our caching variables
388 $ego->{_linkSwitch} = $ego->{_href} = $ego->{_alt} =
389 $ego->{_linkText} = undef;
390 }, 'tagname'],
391 default_h => [""],
392 );
393 $Parser->ignore_elements(qw(script style));
394 $Parser->unbroken_text(1);
395 $ego->html_parser($Parser);
396 }
#=====================================================================
397 sub _install_web_agent {
398 my $ego = shift;
399 $ego->UA( LWP::UserAgent->new );
400 $ego->UA->agent('Mozilla/5.0');
401 $ego->UA->timeout(30);
402 $ego->UA->max_size(250_000);
403 my $jar_jar = HTTP::Cookies->new
404 (file => $ego->cookie_file || "$ENV{HOME}/spyderCookies",
405 autosave => 1,
406 max_cookie_size => 4096,
407 max_cookies_per_domain => 5, );
408 $ego->UA->cookie_jar($jar_jar);
409 }
#=====================================================================
410 sub _install_exit_check {
411 my ($ego, $arg) = @_;
412 my $class = ref $ego;
413 unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) {
414 no strict "refs";
415 *{$class."::_exit_check"} =
416 sub { return 1 unless $ego->queue_count;
417 return 0;
418 };
419 return;
420 }
# checks can be: links => #, success => ratio, time => 10min...
# a piece of code we're going to build up to eval into method-hood
421 my $SUB = 'sub { my $ego = shift; ' .
422 'return 1 unless $ego->queue_count; ';
#------------------------------------------------------------
423 if ( $arg->{exit_on}{pages} ) {
424 print "Installing EXIT on links: $arg->{exit_on}{pages}\n"
425 if $VERBOSITY > 1;
426 $SUB .= ' return 1 if ' .
427 '$ego->spydered >= ' .$arg->{exit_on}{pages} .';';
428 }
#------------------------------------------------------------
429 if ( $arg->{exit_on}{success} ) {
430 #set necessary obj value and add to sub code
431 }
#------------------------------------------------------------
432 if ( $arg->{exit_on}{time} ) {
433 print "Installing EXIT on time: $arg->{exit_on}{time}\n"
434 if $VERBOSITY > 1;
435 my ($amount,$unit) =
436 $arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/;
# skip final "s" in case of hours, secs, mins
437 my %times = ( hour => 3600,
438 min => 60,
439 sec => 1 );
440 my $time_factor = 0;
441 for ( keys %times ) {
442 next unless exists $times{$unit};
443 $time_factor = $amount * $times{$unit};
444 }
445 $ego->_exit_epoch($time_factor + $^T);
446 $SUB .= q{
447 return 1 if $ego->_exit_epoch < time();
448 };
449 }
#------------------------------------------------------------
450 $SUB .= '}';
451 no strict "refs";
452 *{$class."::_exit_check"} = eval $SUB;
453 }
#=====================================================================
454 sub _choose_courteously {
455 my $ego = shift;
# w/o the switch and $i-- it acts a bit more depth first. w/ it, it's
# basically hard head down breadth first
456 print "CHOOSING courteously!\n" if $VERBOSITY > 1;
457 for my $Q ( $ego->_queues ) {
458 print "Looking for URL in $Q\n" if $VERBOSITY > 2;
459 next unless $ego->{$Q} and @{$ego->{$Q}} > 0;
460 my %seen;
461 my $total = scalar @{$ego->{$Q}};
462 my $switch;
463 for ( my $i = 0; $i < @{$ego->{$Q}}; $i++ ) {
464 my $enQ = $ego->{$Q}[$i];
465 my ($url,$name) = ( $enQ->url, $enQ->name );
# if we see one again, we've reshuffled as much as is useful
466 $seen{$url}++
467 and $switch = 1; # progress through to next Q
468 return splice(@{$ego->{$Q}},$i,1)
469 unless $ego->_courtesy_call($enQ);
470 my $fair_bump = int( log( $total - $i ) / log(1.5) );
471 my $move_me_back = splice(@{$ego->{$Q}},$i,1);
472 splice(@{$ego->{$Q}},($i+$fair_bump),0,$move_me_back);
473 $i-- unless $switch;
474 }
475 }
476 return undef; # we couldn't pick one courteously
477 } # end of _choose_courteously()
#=====================================================================
478 sub _just_choose {
479 my $ego = shift;
480 print "CHOOSING first up!\n" if $VERBOSITY > 1;
481 my $enQ;
482 for my $Q ( $ego->_queues ) {
483 next unless ref($ego->{$Q}) eq 'ARRAY';
484 $enQ = shift @{$ego->{$Q}};
485 last;
486 }
487 my $tax = $ego->_courtesy_call($enQ);
488 if ( $VERBOSITY > 4 ) {
489 print ' QUEUE: ';
490 print join("-:-", @{$ego->{_courtesy_Queue}}), "\n"
491 if $ego->{_courtesy_Queue};
492 }
493 my $sleep = int(rand($ego->sleep_base)) + $tax;
494 if ( $VERBOSITY ) {
495 printf "COURTESY NAP %d second%s ",
496 $sleep, $sleep == 1 ?'':'s';
497 printf "(Domain recently seen: %d time%s)\n",
498 $tax, $tax == 1 ?'':'s';
499 }
500 sleep $sleep; # courtesy to websites but human-ish w/ random
501 $ego->slept($sleep);
502 return $enQ;
503 }
#=====================================================================
504 sub _courtesy_call {
505 my ($ego,$enQ) = @_;
506 return 0 unless $enQ;
507 my $domain = $enQ->domain;
508 print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5;
# yes, we have seen it in the last whatever GETs
509 my $seen = 0;
510 $seen = scalar grep { $_ eq $domain }
511 @{$ego->{_courtesy_Queue}};
512 $seen = 10 if $seen > 10;
513 return $seen;
514 }
#=====================================================================
515 sub _queues { # Q9 is purely for trash so it's not returned here
516 return wantarray ?
517 ( 0 .. 9 ) :
518 '0';
519 }
#=====================================================================
520 sub _snip {
521 my $ego = shift if ref($_[0]);
522 my ( @text ) = @_;
523 s/^\s+//, s/\s+$//, s/\s+/ /g for @text;
524 return wantarray ? @text : shift @text;
525 }
#=====================================================================
# Spyder ENDS
#=====================================================================
526 }# WWW::Spyder privacy ends
#=====================================================================
527 package WWW::Spyder::Enqueue;
#=====================================================================
528 {
529 use Carp;
#---------------------------------------------------------------------
530 use overload( q{""} => '_stringify',
531 fallback => 1 );
#---------------------------------------------------------------------
# 0 -->> URL
# 1 -->> name, if any, of link URL was got from
# 2 -->> domain
#=====================================================================
532 sub new {
533 my ( $caller, $url, $name ) = @_;
534 my $class = ref($caller) || $caller;
535 croak "Here I am. " if ref $url;
536 return undef unless $url;
537 if ( length($url) > 512 ) { # that's toooo long, don't you think?
538 $url = substr($url,0,512);
539 }
540 if ( $name and length($name) > 512 ) {
541 $name = substr($url,0,509) . '...';
542 }
543 $name = '-' unless $name; # need this to find a bug later
544 my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),;
545 bless [ $url, $name, lc($domain) ], $class;
546 }
#=====================================================================
547 sub url {
548 return $_[0]->[0];
549 }
#=====================================================================
550 sub name {
551 return $_[0]->[1];
552 }
#=====================================================================
553 sub domain {
554 return $_[0]->[2];
555 }
#=====================================================================
556 sub _stringify {
557 return $_[0]->[0];
558 }
#=====================================================================
559 }#privacy for WWW::Spyder::Enqueue ends
#=====================================================================
560 package WWW::Spyder::Page;
#=====================================================================
561 use Carp;
562 {
563 sub new {
564 my ( $caller, %arg ) = @_;
565 my $class = ref($caller) || $caller;
566 my $ego = bless {}, $class;
567 while ( my ( $method, $val ) = each %arg ) {
568 no strict "refs";
569 no warnings;
570 my $attribute = '_' . $method;
571 if ( ref $val eq 'ARRAY' ) {
572 *{"$class::$method"} = sub {
573 my($ego,$arg) = @_;
574 return @{$ego->{$attribute}} unless $arg;
575 push(@{$ego->{$attribute}}, @{$arg});
576 };
577 } else {
578 *{"$class::$method"} = sub {
579 my($ego,$arg) = @_;
580 # get if already set and deref if needed
581 if ( not $arg and exists $ego->{$attribute} ) {
582 return ref($ego->{$attribute}) eq 'SCALAR' ?
583 ${$ego->{$attribute}} : $ego->{$attribute};
584 }
585 $ego->{$attribute} = $arg if $arg; #only set one time!
586 };
587 }
588 $ego->$method($val);
589 }
590 return $ego;
591 }
#=====================================================================
592 sub links {
593 my ( $ego ) = @_;
594 return map {$_->url} @{$ego->{_pages_enQs}};
595 }
#=====================================================================
596 sub next_link {
597 my ( $ego ) = @_;
598 shift @{$ego->{_pages_enQs}};
599 }
#=====================================================================
600 }#privacy for ::Page ends
#=====================================================================
601 package WWW::Spyder::Exclusions;
#=====================================================================
602 {
# THIS PACKAGE IS NOT BEING USED
#---------------------------------------------------------------------
603 my %_domains = qw(
604 ad.doubleclick.net 1
605 ads.clickagents.com 1
606 );
607 my %_names = qw(
608 );
#=====================================================================
609 sub exclude_domain {
610 $_domains{shift}++;
611 }
#=====================================================================
612 sub excluded {
613 my $what = shift;
614 exists $_domains{$what} || $_names{$what};
615 }
#=====================================================================
616 }#privacy ends
#=====================================================================
617 package WWW::Spyder::Seed;
#=====================================================================
618 {
# THIS PACKAGE IS NOT BEING USED
#---------------------------------------------------------------------
619 use URI::Escape;
620 use Carp;
#---------------------------------------------------------------------
621 my %engine_url =
622 (
623 google => 'http://www.google.com/search?q=',
624 yahoo => 1
625 );
# should we exclude the search domain at this point? i think so b/c
# otherwise we've introduced dozens of erroneous links and the engine
# is gonna get hammered over time for it
#=====================================================================
626 sub get_seed {
627 my $engine = shift || croak "Must provide search engine! " .
628 join(', ', sort keys %engine_url) . "\n";
629 my $query = shift || croak "Must provide query terms!\n";
630 $query = uri_escape($query);
631 croak "$engine is not a valid choice!\n"
632 unless exists $engine_url{lc$engine};
633 return $engine_url{lc$engine} . $query;
634 }
#=====================================================================
635 }#privacy for WWW::Spyder::Seed ends
#=====================================================================
636 1; # eval true
#=====================================================================
# Plain Old D'errrrr
#=====================================================================
637 =pod
638 =head1 NAME
639 WWW::Spyder
640 =head1 VERSION
641 0.18
642 =head1 SYNOPSIS
643 A web spider that returns plain text, HTML, and other information per
644 page crawled and can determine what pages to get and parse based on
645 supplied terms compared to the text in links as well as page content.
646 =head1 METHODS
647 =over 2
648 =item * $spyder->new()
649 Construct a new spyder object. Without at least the seed() set, or
650 go_to_seed() turned on, the spyder isn't ready to crawl.
651 $spyder = WWW::Spyder->new(shift||die"Gimme a URL!\n");
652 # ...or...
653 $spyder = WWW::Spyder->new( %options );
654 Options include: sleep_base (in seconds), exit_on (hash of methods and
655 settings). Examples below.
656 =item * $spyder->seed($url)
657 Adds a URL (or URLs) to the top of the queues for crawl'ing. If the
658 spyder is constructed with a single scalar argument, that is considered
659 the seed_url.
660 =item * $spyder->bell([bool])
661 This will print a bell ("\a") to STDERR on every successfully crawled
662 page. It might seem annoying but it is an excellent way to know your
663 spyder is behaving and working. True value turns it on. Right now it
664 can't be turned off.
665 =item * $spyder->spyder_time([bool])
666 Returns raw seconds since I<Spyder> was created if given a
667 boolean value, otherwise returns "D day(s) HH::MM:SS."
668 =item * $spyder->terms([list of terms to match])
669 The more terms, the more the spyder is going to grasp at. If you give
670 a straight list of strings, they will be turned into very open
671 regexes. E.g.: "king" would match "sulking" and "kinglet" but not
672 "King." It is case sensitive right now. If you want more specific
673 matching or different behavior, pass your own regexes instead of
674 strings.
675 $spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i );
676 terms() is only settable once right now, then it's a done deal.
677 =item * $spyder->spyder_data()
678 A comma formatted number of kilobytes retrieved so far. B<Don't> give
679 it an argument. It's a set/get routine.
680 =item * $spyder->slept()
681 Returns the total number of seconds the spyder has slept while
682 running. Useful for getting accurate page/time counts (spyder
683 performance) discounting the added courtesy naps.
684 =item * $spyder->UA->...
685 The LWP::UserAgent. You can reset them, I do believe, by calling
686 methods on the UA. Here are the initialized values you might want to
687 tweak (see LWP::UserAgent for more information):
688 $spyder->UA->timeout(30);
689 $spyder->UA->max_size(250_000);
690 $spyder->UA->agent('Mozilla/5.0');
691 Changing the agent name can hurt your spyder b/c some servers won't
692 return content unless it's requested by a "browser" they recognize.
693 You should probably add your email with from() as well.
694 $spyder->UA->from('bluefintuna@fish.net');
695 =item * $spyder->cookie_file([local_file])
696 They live in $ENV{HOME}/spyderCookie by default but you can set your
697 own file if you prefer or want to save different cookie files for
698 different spyders.
699 =back
700 =head2 Weird courteous behavior
701 Courtesy didn't used to be weird, but that's another story. You will
702 probably notice that the courtesy routines force a sleep when a
703 recently seen domain is the only choice for a new link. The sleep is
704 partially randomized. This is to prevent the spyder from being
705 recognized in weblogs as a robot.
706 =head2 The web and courtesy
707 B<Please>, I beg of thee, exercise the most courtesy you can. Don't
708 let impatience get in the way. Bandwidth and server traffic are
709 C<$MONEY> for real. The web is an extremely disorganized and corrupted
710 database at the root but companies and individuals pay to keep it
711 available. The less pain you cause by banging away on a webserver with
712 a web agent, the more welcome the next web agent will be.
713 B<Update>: Google seems to be excluding generic LWP agents now. See, I
714 told you so. A single parallel robot can really hammer a major server,
715 even someone with as big a farm and as much bandwidth as Google.
716 =head2 VERBOSITY
717 =over 2
718 =item * $spyder->verbosity([1-6]) -OR-
719 =item * $WWW::Spyder::VERBOSITY = ...
720 Set it from 1 to 6 right now to get varying amounts of extra info to
721 STDOUT. It's an uneven scale and will be straightened out pretty soon.
722 If kids have a preference for sending the info to STDERR, I'll do
723 that. I might anyway.
724 =back
725 =head1 SAMPLE USAGE
726 =head2 See "spyder-mini-bio" in this distribution
727 It's an extremely simple, but fairly cool pseudo bio-researcher.
728 =head2 Simple continually crawling spyder:
729 In the following code snippet:
730 use WWW::Spyder;
731 my $spyder = WWW::Spyder->new( shift || die"Give me a URL!\n" );
732 while ( my $page = $spyder->crawl ) {
733 print '-'x70,"\n";
734 print "Spydering: ", $page->title, "\n";
735 print " URL: ", $page->url, "\n";
736 print " Desc: ", $page->description || 'n/a', "\n";
737 print '-'x70,"\n";
738 while ( my $link = $page->next_link ) {
739 printf "%22s ->> %s\n",
740 length($link->name) > 22 ?
741 substr($link->name,0,19).'...' : $link->name,
742 length($link) > 43 ?
743 substr($link,0,40).'...' : $link;
744 }
745 }
746 as long as unique URLs are being found in the pages crawl'd, the
747 spyder will never stop.
748 Each "crawl" returns a page object which gives the following methods
749 to get information about the page.
750 =over 2
751 =item * $page->links
752 URLs found on the page.
753 =item * $page->title
754 Page's <TITLE> Title </TITLE> if there is one.
755 =item * $page->text
756 The parsed plain text out of the page. Uses HTML::Parser and tries to
757 ignore non-readable stuff like comments and scripts.
758 =item * $page->url
759 =item * $page->domain
760 =item * $page->raw
761 The content returned by the server. Should be HTML.
762 =item * $page->description
763 The META description of the page if there is one.
764 =item * $page->links
765 Returns a list of the URLs in the page. Note: next_link() will shift
766 the available list of links() each time it's called.
767 =item * $link = $page->next_link
768 next_link() destructively returns the next URI-ish object in the page.
769 They are objects with three accessors.
770 =over 5
771 =item * $link->url
772 This is also overloaded so that interpolating "$link" will get the
773 URL just as the method does.
774 =item * $link->name
775 =item * $link->domain
776 =back
777 =back
778 =head2 Spyder that will give up the ghost...
779 The following spyder is initialized to stop crawling when I<either> of
780 its conditions are met: 10mins pass or 300 pages are crawled.
781 use WWW::Spyder;
782 my $url = shift || die "Please give me a URL to start!\n";
783 my $spyder = WWW::Spyder->new
784 (seed => $url,
785 sleep_base => 10,
786 exit_on => { pages => 300,
787 time => '10min', },);
788 while ( my $page = $spyder->crawl ) {
789 print '-'x70,"\n";
790 print "Spydering: ", $page->title, "\n";
791 print " URL: ", $page->url, "\n";
792 print " Desc: ", $page->description || '', "\n";
793 print '-'x70,"\n";
794 while ( my $link = $page->next_link ) {
795 printf "%22s ->> %s\n",
796 length($link->name) > 22 ?
797 substr($link->name,0,19).'...' : $link->name,
798 length($link) > 43 ?
799 substr($link,0,40).'...' : $link;
800 }
801 }
802 =head2 Primitive page reader
803 use WWW::Spyder;
804 use Text::Wrap;
805 my $url = shift || die "Please give me a URL to start!\n";
806 @ARGV or die "Please also give me a search term.\n";
807 my $spyder = WWW::Spyder->new;
808 $spyder->seed_url($url);
809 $spyder->terms(@ARGV);
810 while ( my $page = $spyder->crawl ) {
811 print '-'x70,"\n * ";
812 print $page->title, "\n";
813 print '-'x70,"\n";
814 print wrap('','', $page->text);
815 sleep 60;
816 }
817 =head1 TIPS
818 If you are going to do anything important with it, implement some
819 signal blocking to prevent accidental problems and tie your gathered
820 information to a DB_File or some such.
821 Right now the module loads C<POSIX::nice(40)>. It won't do that in
822 future versions but you might consider doing it yourself. It should
823 top the nice off at your system's max and prevent your spyder from
824 interfering with your system.
825 You might want to to set $| = 1.
826 =head1 PRIVATE METHODS
827 =head2 are private but hack away if you're inclined
828 =head1 TO DO
829 I<Spyder> is conceived to live in a future namespace as a servant class
830 for a complex web research agent with simple interfaces to
831 pre-designed grammars for research reports; or self-designed
832 grammars/reports (might be implemented via Parse::FastDescent if that
833 lazy-bones Conway would just find another 5 hours in the paltry 32
834 hour day he's presently working).
835 I'd like the thing to be able to parse RTF, PDF, and perhaps even
836 resource sections of image files but that isn't on the radar right
837 now.
838 =head1 TO DOABLE BY 1.0
839 Add 2-4 sample scripts that are a bit more useful.
840 There are many functions that should be under the programmer's control
841 and not buried in the spyder. They will emerge soon. I'd like to put
842 in hooks to allow the user to keep(), toss(), or exclude(), urls, link
843 names, and domains, while crawl'ing.
844 Clean up some redundant, sloppy, and weird code. Probably change or
845 remove the AUTOLOAD.
846 Put in a go_to_seed() method and a subclass, ::Seed, with rules to
847 construct query URLs by search engine. It would be the autostart or the
848 fallback for perpetual spyders that run out of links. It would hit a
849 given or default search engine with the I<Spyder>'s terms as the query.
850 Obviously this would only work with terms() defined.
851 Implement auto-exclusion for failure vs. success rates on names as well
852 as domains (maybe URI suffixes too).
853 Turn length of courtesy queue into the breadth/depth setting? make it
854 automatically adjusting...?
855 Consistently found link names are excluded from term strength sorting?
856 Eg: "privacy policy," "read more," "copyright..."
857 Fix some image tag parsing problems and add area tag parsing.
858 Configuration for user:password by domain.
859 ::Page objects become reusable so that a spyder only needs one.
860 ::Enqueue objects become indexed so they are nixable from anywhere.
861 Expand exit_on routines to size, slept time, dwindling success ratio,
862 and maybe more.
863 Make methods to set "skepticism" and "effort" which will influence the
864 way the terms are used to keep, order, and toss URLs.
865 =head1 BE WARNED
866 This module already does some extremely useful things but it's in its
867 infancy and it is conceived to live in a different namespace and
868 perhaps become more private as a subservient part of a parent class.
869 This may never happen but it's the idea. So don't put this into
870 production code yet. I am endeavoring to keep its interface constant
871 either way. That said, it could change completely.
872 =head2 Also!
873 This module saves cookies to the user's home. There will be more
874 control over cookies in the future, but that's how it is right now.
875 They live in $ENV{HOME}/spyderCookie.
876 =head2 Anche!
877 Robot Rules aren't respected. I<Spyder> endeavors to be polite as far
878 as server hits are concerned, but doesn't take "no" for answer right
879 now. I want to add this, and not just by domain, but by page settings.
880 =head1 UNDOCUMENTED FEATURES
881 A.k.a. Bugs. Don't be ridiculous! Bugs in B<my code>?!
882 There is a bug that is causing retrieval of image src tags, I think
883 but haven't tracked it down yet, as links. I also think the plain text
884 parsing has some problems which will be remedied shortly.
885 If you are building more than one spyder in the same script they are
886 going to share the same exit_on parameters because it's a
887 self-installing method. This will not always be so.
888 See B<Bugs> file for more open and past issues.
889 Let me know if you find any others. If you find one that is platform
890 specific, please send patch code/suggestion b/c I might not have any
891 idea how to fix it.
892 =head1 WHY I<Spyder?>
893 I didn't want to use the more appropriate I<Spider> because I think
894 there is a better one out there somewhere in the zeitgeist and the
895 namespace future of I<Spyder> is uncertain. It may end up a
896 semi-private part of a bigger family. And I may be King of Kenya
897 someday. One's got to dream.
898 If you like I<Spyder>, have feedback, wishlist usage, better
899 algorithms/implementations for any part of it, please let me know!
900 =head1 AUTHOR, AUTHOR
901 Ashley5, ashley@cpan.org. Bob's your monkey's uncle.
902 =head1 COPYRIGHT
903 (c)2001-2002 Ashley Pond V. All rights reserved. This program is free
904 software; you may redistribute or modify it under the same terms as
905 Perl.
906 =head1 THANKS TO
907 Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway,
908 Raphael Manfredi, Gurusamy Sarathy, and plenty of others.
909 =head1 COMPARE WITH
910 WWW::Robot, LWP::UserAgent, WWW::SimpleRobot, WWW::RobotRules,
911 LWP::RobotUA, and other kith and kin.
912 =cut