File: WWW-Spyder.pm
# All Lines | No Line #s | View the Viewer
    #=====================================================================
  1 package WWW::Spyder;
    #=====================================================================
  2 use strict;
  3 use warnings;
    #---------------------------------------------------------------------
  4 use HTML::Parser 3;
    #---------------------------------------------------------------------
  5 use LWP::UserAgent;
  6 use HTTP::Cookies;
  7 use URI::URL;
  8 use HTML::Entities;
  9 use POSIX; POSIX::nice(40);
    #---------------------------------------------------------------------
 10 use Digest::MD5 "md5_base64";      # for making seen content key/index
    #---------------------------------------------------------------------
 11 use Carp;
 12 our $VERSION = '0.18';
 13 our $VERBOSITY ||= 0;
    #=====================================================================
    #  METHODS
    #=====================================================================
 14 { # make it all a bit more private
 15     my %_methods = (# these are methods & roots of our attribute names
 16                     UA           => undef,
 17                     bell         => undef,
 18                     html_parser  => undef,
 19                     sleep_base   => undef,
 20                     cookie_file  => undef,
 21                     _exit_epoch  => undef,
 22                     _term_count  => undef,
 23                     );
    #  those may all get hardcoded eventually, but they're handy for now
    #=====================================================================
 24 sub new {
 25     my ( $caller ) = shift;
 26     my $class = ref($caller) || $caller;
 27     my $ego = bless {}, $class;
    
 28     my ( $seed, %arg );
 29     if ( @_ == 1 ) {
 30         ( $seed ) = @_;
 31     }
 32     %arg = ( sleep      => undef,
 33              exit_on    => undef,
 34              seed       => undef,
 35              sleep_base => 5,
 36              );
 37     %arg = ( %arg, @_ ) unless @_ % 2;
    
    # install all our methods, either set once then get only or push/shift
    # array refs
 38     for my $method ( keys %_methods ) {
 39         no strict "refs";
 40         no warnings;
 41         my $attribute = '_' . $method;
    
 42         if ( ref $_methods{$method} eq 'ARRAY' ) {
 43             *{"$class::$method"} = sub { 
 44                 my($ego,@args) = @_;
 45                 return shift(@{$ego->{$attribute}}) unless @args;
 46                 push(@{$ego->{$attribute}}, @args);
 47             };
 48         } else {
 49             *{"$class::$method"} = sub { 
 50                 my($ego,$arg) = @_;
 51                 carp "You cannot reset $method!"
 52                     if $arg and exists $ego->{$attribute};
 53                 return $ego->{$attribute}   #get if already set
 54                 if exists $ego->{$attribute};
 55                 $ego->{$attribute} = $arg;  #only set one time!
 56             };
 57         }
 58     }
 59     $seed ||= $arg{seed};
 60     $ego->seed($seed) if $seed;
 61     $ego->sleep_base($arg{sleep_base});
 62     $ego->_install_exit_check(\%arg) unless $ego->can('_exit_check');
 63     $ego->_install_html_parser;
 64     $ego->_install_web_agent;
 65     return $ego;
 66 }
    #=====================================================================
 67 sub terms {
 68     my ($ego,@terms) = @_;
 69     if ( @terms and not exists $ego->{_terms} ) {
 70         $ego->_term_count(scalar @terms);  # makes this set once op
 71         my %terms;
 72         $terms{$_} = qr/$_/ for @terms;
 73         $ego->{_terms} = \%terms;
 74     } else {
 75         return $ego->{_terms}
 76     }
 77 }
    #=====================================================================
 78 sub show_attributes {
 79     my ($ego) = @_;
 80     return map {/^_(.+)$/} keys %{$ego};
 81 }
    #=====================================================================
 82 sub slept {
 83     my ($ego, $time) = @_;
 84     $ego->{_Slept} += $time if $time;
 85     return $ego->{_Slept} unless $time;
 86 }
    #=====================================================================
 87 sub seed {
 88     my ($ego, $url) = @_;
 89     $url or croak "Must provide URL to seed().";
 90     croak "You have passed something besides a plain URL to seed()!"
 91         if ref $url;
 92     $ego->stack_urls($url);
 93     return 1; # to the top of the stacks
 94 }
    #=====================================================================
 95 sub crawl {
 96     my ($ego) = @_;
    
 97     while ('I have pages to get...') {
    
 98         $ego->_exit_check and return;
    
 99         my $enQ = $ego->_choose_courteously || 
100             $ego->_just_choose || 
101                 return;
    
102         my $url = $enQ->url;
103         $ego->url($url);
104         $ego->_current_enQ($enQ);
    
105         print "GET'ing: $url\n" if $VERBOSITY;
    
106         my $response = $ego->UA->request   # no redirects &c is simple_
107             ( HTTP::Request->new( GET => "$url" ) );
108         print STDERR "\a" if $ego->bell;
    
109         $response or
110             carp "$url failed GET!" and next;
    
111         push @{$ego->{_courtesy_Queue}}, $enQ->domain;
112         shift @{$ego->{_courtesy_Queue}} 
113         if $ego->{_courtesy_Queue} 
114         and @{$ego->{_courtesy_Queue}} > 100;
    
115         my $head = $response->headers_as_string;
116         $head or
117             carp "$url has no HEAD!" and
118                 next; # no headless webpages
    
119         length($head) > 1_024 and $head = substr($head,0,1_024);
    
120         print $head, "\n" if $VERBOSITY > 2;
    
121         my $base;
122         eval { $base = $response->base };
123         $base or
124             carp "$url has no discernible BASE!" and
125                 next; # no baseless webpages
    
    # WE SHOULD also look for <HTML> b/c some servers that we might want
    # to look at don't properly report the content-type
    
    # start over unless this is something we can read
126         lc($head) =~ /content-type:\s?(?:text|html)/ or
127             carp "$url doesn't look like TEXT or HTML!" and
128                 next; # no weird media, movies, flash, etc
    
129         my ( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n,;
    
130         my ( $description ) = $head =~ 
131             /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i;
    
132         $description = $ego->_snip($description) if $description;
    
133         my $page = $response->content or
134             carp "Failed to fetch $url." and
135                 next; # no empty pages, start over with next url
    
136         $ego->{_current_Bytes} = length($page);
137         $ego->spyder_data($ego->{_current_Bytes});
    
    # we are going to use a digest to prevent parsing the identical
    # content received via a different url
138         my $digest = md5_base64($page); # unique microtag of the page
    # so if we've seen it before, start over with the next URL
139         $ego->{_page_Memory}{$digest}++ and 
140             carp "Seen this page's content before: $url" 
141                 and next;
    
142         $ego->{_page_content} = $page;
143         print "PARSING: $url\n" if $VERBOSITY > 1;
144         $ego->{_spydered}{$url}++;
145         $ego->html_parser->parse($page);
146         $ego->html_parser->eof;
    
147         $ego->{_adjustment} = $ego->_parse_for_terms if $ego->terms;
    
    # make links absolute and fix bad spacing in link names, then turn
    # them into an Enqueue object
148         for my $pair ( @{$ego->{_enqueue_Objects}} ) {
149             my $url;
150             eval {
151                 $url = URI::URL::url($pair->[0], $base)->abs;
152             };
153             my $name =  _snip($pair->[1]);
154             my $item = WWW::Spyder::Enqueue->new("$url",$name);
155             $pair = $item;
156         }
    # put links into the queue(s)
157        $ego->stack_urls() if $ego->_links;
    
    # clean up text a bit. should this be here...?
158         if ( $ego->{_text} and ${$ego->{_text}} ) {
159             ${$ego->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
160         }
    
    # in the future Page object should be installed like parsers as a
    # reusable container
    #    return
161   my $Page =
162       WWW::Spyder::Page->new(
163                              title  => $title,
164                              text   => $ego->{_text},
165                              raw    => \$page,
166                              url    => $enQ->url,
167                              domain => $enQ->domain,
168                              link_name   => undef,
169                              link        => undef,
170                              description => $description || '',
171                              pages_enQs  => $ego->_enqueue,
172                              );
173         $ego->_reset;       #<<--clear out things that might remain
174         return $Page;
175     }
176 }
    #=====================================================================
177 sub stack_urls {  # should eventually be broken into stack and sift?
    
    # dual purpose, w/ terms it filters as long as there are no urls
    # passed, otherwise it's setting them to the top of the queues
178     my ($ego, @urls) = @_;
    
179     print "Stacking " . join(', ', @urls) . "\n"
180         if @urls and $VERBOSITY > 5;
    
181     if ( $ego->terms and not @urls ) {
182         no warnings;
183         my @Qs = $ego->_queues;
184         for my $enQ ( @{$ego->_enqueue} ) {
185             my ( $url, $name ) = ( $enQ->url, $enQ->name );
    
186             next if $ego->_seen($url);
    
187             my $match = 0;
188             while ( my ($term,$rx) = each %{$ego->terms} ) {
189                 $match++ for $name =~ /$rx/g;
190             }
191             my $baseQ = 10;
192             my $adjustment = $ego->{_adjustment};
193             $baseQ -= $adjustment; # 4 to 0
    
194             push @{$ego->{$baseQ}}, $enQ
195                 and next unless $match;
    
196             if ( $VERBOSITY > 1 ) {
197                 print "NAME: $name\n";
198                 printf "   RATIO -->> %d\n", $match;
199             }
200             my $queue_index = sprintf "%d", 
201                 $ego->_term_count / $match;
    
202             $queue_index -= $adjustment;
203             $queue_index = 4 if $queue_index > 4;
204             $queue_index = 0 if $queue_index < 0;
205             my $queue = $Qs[$queue_index];
206             if ($VERBOSITY > 2) {
207                 print "Q:$queue [$queue_index] match: $match terms:",
208                 $ego->_term_count, "  Adjust: $adjustment\n\n";
209             }
210             push @{$ego->{$queue}}, $enQ;
211         }
212     } elsif ( @urls > 0 ) {
213         for my $url ( @urls ) {
214             next if $ego->_seen($url);
215             my $queue = $ego->_queues;
216             carp "Placing $url in '$queue'\n" if $VERBOSITY > 2;
    
    # unshift b/c seeding is priority
217             unshift @{$ego->{$queue}},
218                WWW::Spyder::Enqueue->new($url,undef);
219         }
220     } else {
221        for my $enQ ( @{$ego->_enqueue} ) {
222             my ( $url, $name ) = ( $enQ->url, $enQ->name );
223             next if $ego->_seen($url);
224             my $queue = $ego->_queues;
225             push @{$ego->{$queue}}, $enQ;
226         }
227    }
228 }
    #=====================================================================
229 sub queue_count {
230     my ($ego) = @_;
231     my $count = 0;
232     for my $Q ( $ego->_queues ) {
233         next unless ref($ego->{$Q}) eq 'ARRAY';
234         $count += scalar @{$ego->{$Q}};
235     }
236     return $count;
237 }
    #=====================================================================
238 sub spyder_time {
239     my ($ego,$raw) = @_;
    
240     my $time = time() - $^T;
241     return $time if $raw;
    
242     my $day  = int( $time / 86400 );
243     my $hour = int( $time / 3600 ) % 24;
244     my $min  = int( $time / 60 ) % 60;
245     my $sec  = $time % 60;
    
    # also collect slept time!
246     return sprintf "%d day%s %02d:%02d:%02d",
247     $day, $day == 1?'':'s', $hour, $min, $sec;
248 }
    #=====================================================================
249 sub spyder_data {
250     my ($ego, $bytes) = @_;
251     $ego->{_bytes_GOT} += $bytes and return $bytes if $bytes;
    
252     return 0 unless $ego->{_bytes_GOT};
    
253     my $for_commas = int($ego->{_bytes_GOT} / 1_024);
    
254     for ( $for_commas ) {
255         1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/;
256     }
257     return $for_commas;
258 }
    #=====================================================================
259 sub spydered {
260     my ($ego) = @_;
261     return wantarray ? 
262         keys %{ $ego->{_spydered} } : 
263             scalar keys %{ $ego->{_spydered} };
264 }
    #=====================================================================
265 sub exclude {  # what about FILES TYPES!?
266     return undef; # not working yet!
267     my ($ego,$thing) = @_;
268     if ( $thing =~ m<^[^:]{3,5}://> ) 
269     {
270         return $ego->{_Xklood}{_domain}{$thing}++;
271     } 
272     elsif ( $thing ) 
273     {
274         return $ego->{_Xklood}{_name}{$thing}++;
275     }
276 }
    #=====================================================================
277 sub excluded_domains {
278     return undef; # not working yet!
279     my ($ego) = @_;
280     return wantarray ?
281         keys %{$ego->{_Xklood}{_domain}} : 
282             [ keys %{$ego->{_Xklood}{_domain}} ];    
283 }
    #=====================================================================
284 sub excluded_names {
285     return undef; # not working yet!
286     my ($ego) = @_;
287     return wantarray ?
288         keys %{$ego->{_Xklood}{_name}} : 
289             [ keys %{$ego->{_Xklood}{_name}} ];    
290 }
    #=====================================================================
291 sub go_to_seed {
292     my ( $ego, $engine, $query ) = @_;
293     carp "go_to_seed() is not functional yet!\n";
294     return;  # NOT FUNCTIONAL
295     my $seed = WWW::Spyder::Seed::get_seed($engine, $query);
296     $ego->seed($seed);
297 }
    #=====================================================================
298 sub verbosity {
299     my ( $ego, $verbosity ) = @_;
300     carp "Not setting verbosity! Must be integer b/t 1 & 6!\n"
301         and return
302             unless $verbosity;
303     $VERBOSITY = $verbosity;
304 }
    #=====================================================================
    
    #=====================================================================
    #  PRIVATE Spyder Methods
    #=====================================================================
305 sub _reset {
    # RESET MORE THAN THIS!?! make sure all the memory space is clean that
    # needs be for clean iteration???
306     my ($ego) = @_;
307     $ego->{$_} = undef for qw( _linkText _linkSwitch _href
308                                _current_enQ _page_content
309                                _current_Bytes _alt _enqueue_Objects
310                                _text );
311 }
    #=====================================================================
312 sub _current_enQ {
313     my ($ego, $enQ) = @_;
314     my $last_enQ = $ego->{_current_enQ};
315     $ego->{_current_enQ} = $enQ if $enQ;
316     return $last_enQ; #<<-so we can get last while setting a new one
317 }
    #=====================================================================
318 sub _enqueue {
319     my ($ego,$enQ) = @_;
320     push @{$ego->{_enqueue_Objects}}, $enQ if $enQ;
321     return $ego->{_enqueue_Objects};
322 }
    #=====================================================================
323 sub _links {
324     my ($ego) = @_;
325     return [ map { $_->url } @{$ego->_enqueue} ];
326 }
    #=====================================================================
327 sub _seen {
328     my ($ego,$url) = @_;
329     return $ego->{_seenURLs}{$url}++;
330 }
    #=====================================================================
331 sub _parse_for_terms {
332     my ($ego) = @_;
333     $ego->{_page_terms_matches} = 0;
    
334     return 0 unless $ego->{_text};
    
335     while ( my ($term,$rx) = each %{$ego->terms} ) {
336         $ego->{_page_terms_matches}++ for 
337             $ego->{_page_content} =~ /$rx/g;
338     }
    
339     my $index = int( ( $ego->{_page_terms_matches} /
340                        length($ego->{_text}) ) * 1_000 );
    # the algorithm might look it but isn't entirely arbitrary
    
341     print " PARSE TERMS : $ego->{_page_terms_matches} " .
342         "/ $ego->{_current_Bytes}\n" if $VERBOSITY > 1;
    
343     return 7 if $index > 25;
344     return 6 if $index > 18;
345     return 5 if $index > 14;
346     return 4 if $index > 11;
347     return 3 if $index > 7;
348     return 2 if $index > 3;
349     return 1 if $index > 0;
350     return 0;
351 }
    #=====================================================================
352 sub _install_html_parser {
353     my ($ego) = @_;
    
354     my $Parser = HTML::Parser->new
355         (
356          start_h =>
357          [sub {
358              no warnings;
359              my ( $tag, $attr ) = @_;
360              return if $tag !~ /^(?:a|img)$/;
    # need to deal with AREA tags from maps /^(?:a(?:rea)?|img)$/;
361              $attr->{href} =~ s,#[^/]*$,,;
362              return if lc($attr->{href}) =~ m,^\s*mailto:,;
363              return if lc($attr->{href}) =~ m,^\s*file:,;
364              return if lc($attr->{href}) =~ m,javascript:,;
365              $ego->{_href} ||= $attr->{href};
366              $ego->{_alt}  ||= $attr->{alt};
367              $ego->{_linkSwitch} = 1;
368          }, 'tagname, attr'],
369          text_h =>
370          [sub {
371              return unless(my $it = shift);
372              return if $it =~
373                  m/(?:\Q<!--\E)|(?:\Q-->\E)/;
374              ${$ego->{_text}} .= $it;
375              $ego->{_linkText} .= $it 
376                  if $ego->{_linkSwitch};
377           }, 'dtext'],
378          end_h =>
379          [sub {
380              my ( $tag ) = @_;
381              no warnings; # only problem: <a><b>L</b>inks</a>
382              return unless $tag eq 'a' or $ego->{_linkSwitch};
383              $ego->{_linkText} ||= $ego->{_alt} || '+';
384              decode_entities($ego->{_linkText});
    
385              push @{$ego->{_enqueue_Objects}}, 
386                  [ $ego->{_href}, $ego->{_linkText} ];
    
387           # reset all our caching variables
388              $ego->{_linkSwitch} = $ego->{_href} = $ego->{_alt} = 
389                  $ego->{_linkText} = undef;
390          }, 'tagname'],
391          default_h => [""],
392         );
393     $Parser->ignore_elements(qw(script style));
394     $Parser->unbroken_text(1);
395     $ego->html_parser($Parser);
396 }
    #=====================================================================
397 sub _install_web_agent {
398     my $ego = shift;
399     $ego->UA( LWP::UserAgent->new );
400     $ego->UA->agent('Mozilla/5.0');
401     $ego->UA->timeout(30);
402     $ego->UA->max_size(250_000);
    
403     my $jar_jar = HTTP::Cookies->new
404         (file => $ego->cookie_file || "$ENV{HOME}/spyderCookies",
405          autosave => 1,
406          max_cookie_size => 4096,
407          max_cookies_per_domain => 5, );
408     $ego->UA->cookie_jar($jar_jar);
409 }
    #=====================================================================
410 sub _install_exit_check {
411     my ($ego, $arg) = @_;
412     my $class = ref $ego;
    
413     unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) {
414         no strict "refs";
415         *{$class."::_exit_check"} = 
416             sub { return 1 unless $ego->queue_count;
417                   return 0;
418               };
419         return;
420     }
    
    # checks can be: links => #, success => ratio, time => 10min...
    # a piece of code we're going to build up to eval into method-hood
421     my $SUB = 'sub {  my $ego = shift; ' .
422     'return 1 unless $ego->queue_count; ';
    #------------------------------------------------------------
423     if ( $arg->{exit_on}{pages} ) {
424         print "Installing EXIT on links: $arg->{exit_on}{pages}\n"
425             if $VERBOSITY > 1;
426         $SUB .= ' return 1 if ' .
427             '$ego->spydered >= ' .$arg->{exit_on}{pages} .';';
428     }
    #------------------------------------------------------------
429     if ( $arg->{exit_on}{success} ) {
430         #set necessary obj value and add to sub code
431     }
    #------------------------------------------------------------
432     if ( $arg->{exit_on}{time} ) {
433         print "Installing EXIT on time: $arg->{exit_on}{time}\n"
434             if $VERBOSITY > 1;
    
435         my ($amount,$unit) = 
436             $arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/;
    # skip final "s" in case of hours, secs, mins
    
437         my %times = ( hour => 3600,
438                       min  => 60,
439                       sec  => 1 );
    
440         my $time_factor = 0;
441         for ( keys %times ) {
442             next unless exists $times{$unit};
443             $time_factor = $amount * $times{$unit};
444         }
445         $ego->_exit_epoch($time_factor + $^T);
    
446         $SUB .= q{
447             return 1 if $ego->_exit_epoch < time();
448         };
449     }
    #------------------------------------------------------------
450     $SUB .= '}';
    
451     no strict "refs";
452     *{$class."::_exit_check"} = eval $SUB;
453 }
    #=====================================================================
454 sub _choose_courteously {
455     my $ego = shift;
    
    # w/o the switch and $i-- it acts a bit more depth first. w/ it, it's
    # basically hard head down breadth first
456     print "CHOOSING courteously!\n" if $VERBOSITY > 1;
457     for my $Q ( $ego->_queues ) {
458         print "Looking for URL in $Q\n" if $VERBOSITY > 2;
459         next unless $ego->{$Q} and @{$ego->{$Q}} > 0;
460         my %seen;
461         my $total = scalar @{$ego->{$Q}};
462         my $switch;
463         for ( my $i = 0; $i < @{$ego->{$Q}}; $i++ ) {
464             my $enQ = $ego->{$Q}[$i];
465             my ($url,$name) = ( $enQ->url, $enQ->name );
    
    # if we see one again, we've reshuffled as much as is useful
466             $seen{$url}++ 
467                 and $switch = 1; # progress through to next Q
    
468             return splice(@{$ego->{$Q}},$i,1)
469                 unless $ego->_courtesy_call($enQ);
    
470             my $fair_bump = int( log( $total - $i ) / log(1.5) );
    
471             my $move_me_back = splice(@{$ego->{$Q}},$i,1);
472             splice(@{$ego->{$Q}},($i+$fair_bump),0,$move_me_back);
473             $i-- unless $switch;
474         }
475     }
476     return undef;  # we couldn't pick one courteously
477 } # end of _choose_courteously()
    #=====================================================================
478 sub _just_choose {
479     my $ego = shift;
480     print "CHOOSING first up!\n" if $VERBOSITY > 1;
    
481     my $enQ;
482     for my $Q ( $ego->_queues ) {
483         next unless ref($ego->{$Q}) eq 'ARRAY';
484         $enQ = shift @{$ego->{$Q}};
485         last;
486     }
487     my $tax = $ego->_courtesy_call($enQ);
488     if ( $VERBOSITY > 4 ) {
489         print ' QUEUE: ';
490         print join("-:-", @{$ego->{_courtesy_Queue}}), "\n" 
491             if $ego->{_courtesy_Queue};
492     }
493     my $sleep = int(rand($ego->sleep_base)) + $tax;
    
494     if ( $VERBOSITY ) {
495         printf "COURTESY NAP %d second%s ",
496         $sleep, $sleep == 1 ?'':'s';
497         printf "(Domain recently seen: %d time%s)\n",
498         $tax, $tax == 1 ?'':'s';
499     }
500     sleep $sleep; # courtesy to websites but human-ish w/ random
501     $ego->slept($sleep);
502     return $enQ;
503 }
    #=====================================================================
504 sub _courtesy_call {
505     my ($ego,$enQ) = @_;
506     return 0 unless $enQ;
507     my $domain = $enQ->domain;
    
508     print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5;
    
    # yes, we have seen it in the last whatever GETs
509     my $seen = 0;
510     $seen = scalar grep { $_ eq $domain } 
511         @{$ego->{_courtesy_Queue}};
512     $seen = 10 if $seen > 10;
513     return $seen;
514 }
    #=====================================================================
515 sub _queues {  # Q9 is purely for trash so it's not returned here
516     return wantarray ?
517         ( 0 .. 9 ) :
518             '0';
519 }
    #=====================================================================
520 sub _snip {
521     my $ego = shift if ref($_[0]);
522     my ( @text ) = @_;
523     s/^\s+//, s/\s+$//, s/\s+/ /g for @text;
524     return wantarray ? @text : shift @text;
525 }
    #=====================================================================
    # Spyder ENDS
    #=====================================================================
526 }# WWW::Spyder privacy ends
    
    
    #=====================================================================
527 package WWW::Spyder::Enqueue;
    #=====================================================================
528 {
529     use Carp;
    #---------------------------------------------------------------------
530 use overload( q{""} => '_stringify',
531               fallback => 1 );
    #---------------------------------------------------------------------
    #  0 -->> URL
    #  1 -->> name, if any, of link URL was got from
    #  2 -->> domain
    #=====================================================================
532 sub new {
533     my ( $caller, $url, $name ) = @_;
534     my $class = ref($caller) || $caller;
535     croak "Here I am. " if ref $url;
536     return undef unless $url;
537     if ( length($url) > 512 ) { # that's toooo long, don't you think?
538         $url = substr($url,0,512);
539     }
540     if ( $name and length($name) > 512 ) {
541         $name = substr($url,0,509) . '...';
542     }
543     $name = '-' unless $name; # need this to find a bug later
544     my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),;
545     bless [ $url, $name, lc($domain) ], $class;
546 }
    #=====================================================================
547 sub url {
548     return $_[0]->[0];
549 }
    #=====================================================================
550 sub name {
551     return $_[0]->[1];
552 }
    #=====================================================================
553 sub domain {
554     return $_[0]->[2];
555 }
    #=====================================================================
556 sub _stringify {
557     return $_[0]->[0];
558 }
    #=====================================================================
559 }#privacy for WWW::Spyder::Enqueue ends
    
    
    #=====================================================================
560 package WWW::Spyder::Page;
    #=====================================================================
561 use Carp;
562 {
563 sub new {
564     my ( $caller, %arg ) = @_;
565     my $class = ref($caller) || $caller;
566     my $ego = bless {}, $class;
    
567     while ( my ( $method, $val ) = each %arg ) {
    
568         no strict "refs";
569         no warnings;
570         my $attribute = '_' . $method;
    
571         if ( ref $val eq 'ARRAY' ) {
572             *{"$class::$method"} = sub {
573                 my($ego,$arg) = @_;
574                 return @{$ego->{$attribute}} unless $arg;
575                 push(@{$ego->{$attribute}}, @{$arg});
576             };
577         } else {
578             *{"$class::$method"} = sub {
579                 my($ego,$arg) = @_;
580                 # get if already set and deref if needed
581                 if ( not $arg and exists $ego->{$attribute} ) {
582                     return ref($ego->{$attribute}) eq 'SCALAR' ?
583                         ${$ego->{$attribute}} : $ego->{$attribute};
584                  }
585                 $ego->{$attribute} = $arg if $arg;  #only set one time!
586             };
587         }
588         $ego->$method($val);
589     }
590     return $ego;
591 }
    #=====================================================================
592 sub links {
593     my ( $ego ) = @_;
594     return map {$_->url} @{$ego->{_pages_enQs}};
595 }
    #=====================================================================
596 sub next_link {
597     my ( $ego ) = @_;
598     shift @{$ego->{_pages_enQs}};
599 }
    #=====================================================================
600 }#privacy for ::Page ends
    
    
    #=====================================================================
601 package WWW::Spyder::Exclusions;
    #=====================================================================
602 {
    # THIS PACKAGE IS NOT BEING USED
    #---------------------------------------------------------------------
603 my %_domains = qw(
604                   ad.doubleclick.net        1
605                   ads.clickagents.com       1
606                   );
607 my %_names = qw(
    
608                 );
    #=====================================================================
609 sub exclude_domain {
610     $_domains{shift}++;
611 }
    #=====================================================================
612 sub excluded {
613     my $what = shift;
614     exists $_domains{$what} || $_names{$what};
615 }
    #=====================================================================
616 }#privacy ends
    
    
    #=====================================================================
617 package WWW::Spyder::Seed;
    #=====================================================================
618 {
    # THIS PACKAGE IS NOT BEING USED
    #---------------------------------------------------------------------
619 use URI::Escape;
620 use Carp;
    #---------------------------------------------------------------------
621 my %engine_url = 
622     (
623      google => 'http://www.google.com/search?q=',
624      yahoo =>  1
625      );
    
    # should we exclude the search domain at this point? i think so b/c
    # otherwise we've introduced dozens of erroneous links and the engine
    # is gonna get hammered over time for it
    #=====================================================================
626 sub get_seed {
    
627     my $engine = shift || croak "Must provide search engine! " . 
628         join(', ', sort keys %engine_url) . "\n";
    
629     my $query  = shift || croak "Must provide query terms!\n";
630     $query = uri_escape($query);
    
631     croak "$engine is not a valid choice!\n" 
632         unless exists $engine_url{lc$engine};
    
633     return $engine_url{lc$engine} . $query;
634 }
    #=====================================================================
635 }#privacy for WWW::Spyder::Seed ends
    #=====================================================================
636 1;  # eval true
    #=====================================================================
    #  Plain Old D'errrrr
    #=====================================================================
637 =pod
    
638 =head1 NAME
    
639 WWW::Spyder
    
640 =head1 VERSION 
    
641 0.18
    
642 =head1 SYNOPSIS
    
643 A web spider that returns plain text, HTML, and other information per
644 page crawled and can determine what pages to get and parse based on
645 supplied terms compared to the text in links as well as page content.
    
646 =head1 METHODS
    
647 =over 2
    
648 =item * $spyder->new()
    
649 Construct a new spyder object. Without at least the seed() set, or
650 go_to_seed() turned on, the spyder isn't ready to crawl.
    
651  $spyder = WWW::Spyder->new(shift||die"Gimme a URL!\n");
652     # ...or...
653  $spyder = WWW::Spyder->new( %options );
    
654 Options include: sleep_base (in seconds), exit_on (hash of methods and
655 settings). Examples below.
    
656 =item * $spyder->seed($url)
    
657 Adds a URL (or URLs) to the top of the queues for crawl'ing. If the
658 spyder is constructed with a single scalar argument, that is considered
659 the seed_url.
    
660 =item * $spyder->bell([bool])
    
661 This will print a bell ("\a") to STDERR on every successfully crawled
662 page. It might seem annoying but it is an excellent way to know your
663 spyder is behaving and working. True value turns it on. Right now it
664 can't be turned off.
    
665 =item * $spyder->spyder_time([bool])
    
666 Returns raw seconds since I<Spyder> was created if given a
667 boolean value, otherwise returns "D day(s) HH::MM:SS."
    
668 =item * $spyder->terms([list of terms to match])
    
669 The more terms, the more the spyder is going to grasp at. If you give
670 a straight list of strings, they will be turned into very open
671 regexes. E.g.: "king" would match "sulking" and "kinglet" but not
672 "King." It is case sensitive right now. If you want more specific
673 matching or different behavior, pass your own regexes instead of
674 strings.
    
675     $spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i );
    
676 terms() is only settable once right now, then it's a done deal.
    
677 =item * $spyder->spyder_data()
    
678 A comma formatted number of kilobytes retrieved so far. B<Don't> give
679 it an argument. It's a set/get routine.
    
680 =item * $spyder->slept()
    
681 Returns the total number of seconds the spyder has slept while
682 running. Useful for getting accurate page/time counts (spyder
683 performance) discounting the added courtesy naps.
    
684 =item * $spyder->UA->...
    
685 The LWP::UserAgent. You can reset them, I do believe, by calling
686 methods on the UA. Here are the initialized values you might want to
687 tweak (see LWP::UserAgent for more information):
    
688     $spyder->UA->timeout(30);
689     $spyder->UA->max_size(250_000);
690     $spyder->UA->agent('Mozilla/5.0');
    
691 Changing the agent name can hurt your spyder b/c some servers won't
692 return content unless it's requested by a "browser" they recognize.
    
693 You should probably add your email with from() as well.
    
694     $spyder->UA->from('bluefintuna@fish.net');
    
695 =item * $spyder->cookie_file([local_file])
    
696 They live in $ENV{HOME}/spyderCookie by default but you can set your
697 own file if you prefer or want to save different cookie files for
698 different spyders.
    
699 =back
    
700 =head2 Weird courteous behavior
    
701 Courtesy didn't used to be weird, but that's another story. You will
702 probably notice that the courtesy routines force a sleep when a
703 recently seen domain is the only choice for a new link. The sleep is
704 partially randomized. This is to prevent the spyder from being
705 recognized in weblogs as a robot.
    
706 =head2 The web and courtesy
    
707 B<Please>, I beg of thee, exercise the most courtesy you can. Don't
708 let impatience get in the way. Bandwidth and server traffic are
709 C<$MONEY> for real. The web is an extremely disorganized and corrupted
710 database at the root but companies and individuals pay to keep it
711 available. The less pain you cause by banging away on a webserver with
712 a web agent, the more welcome the next web agent will be.
    
713 B<Update>: Google seems to be excluding generic LWP agents now. See, I
714 told you so. A single parallel robot can really hammer a major server,
715 even someone with as big a farm and as much bandwidth as Google.
    
716 =head2 VERBOSITY
    
717 =over 2
    
718 =item * $spyder->verbosity([1-6])  -OR-
    
719 =item * $WWW::Spyder::VERBOSITY = ...
    
720 Set it from 1 to 6 right now to get varying amounts of extra info to
721 STDOUT. It's an uneven scale and will be straightened out pretty soon.
722 If kids have a preference for sending the info to STDERR, I'll do
723 that. I might anyway.
    
724 =back
    
725 =head1 SAMPLE USAGE
    
726 =head2 See "spyder-mini-bio" in this distribution
    
727 It's an extremely simple, but fairly cool pseudo bio-researcher.
    
728 =head2 Simple continually crawling spyder:
    
729 In the following code snippet:
    
730  use WWW::Spyder;
    
731  my $spyder = WWW::Spyder->new( shift || die"Give me a URL!\n" );
    
732  while ( my $page = $spyder->crawl ) {
    
733     print '-'x70,"\n";
734     print "Spydering: ", $page->title, "\n";
735     print "      URL: ", $page->url, "\n";
736     print "     Desc: ", $page->description || 'n/a', "\n";
737     print '-'x70,"\n";
738     while ( my $link = $page->next_link ) {
739         printf "%22s ->> %s\n",
740         length($link->name) > 22 ?
741             substr($link->name,0,19).'...' : $link->name,
742             length($link) > 43 ?
743                 substr($link,0,40).'...' : $link;
744     }
745  }
    
746 as long as unique URLs are being found in the pages crawl'd, the
747 spyder will never stop.
    
748 Each "crawl" returns a page object which gives the following methods
749 to get information about the page.
    
750 =over 2
    
751 =item * $page->links
    
752 URLs found on the page.
    
753 =item * $page->title
    
754 Page's <TITLE> Title </TITLE> if there is one.
    
755 =item * $page->text
    
756 The parsed plain text out of the page. Uses HTML::Parser and tries to
757 ignore non-readable stuff like comments and scripts.
    
758 =item * $page->url
    
759 =item * $page->domain
    
760 =item * $page->raw
    
761 The content returned by the server. Should be HTML.
    
762 =item * $page->description
    
763 The META description of the page if there is one.
    
764 =item * $page->links
    
765 Returns a list of the URLs in the page. Note: next_link() will shift
766 the available list of links() each time it's called.
    
767 =item * $link = $page->next_link
    
768 next_link() destructively returns the next URI-ish object in the page.
769 They are objects with three accessors.
    
770 =over 5
    
771 =item * $link->url
    
772 This is also overloaded so that interpolating "$link" will get the
773 URL just as the method does.
    
774 =item * $link->name
    
775 =item * $link->domain
    
776 =back
    
777 =back
    
778 =head2 Spyder that will give up the ghost...
    
779 The following spyder is initialized to stop crawling when I<either> of
780 its conditions are met: 10mins pass or 300 pages are crawled.
    
781  use WWW::Spyder;
    
782  my $url = shift || die "Please give me a URL to start!\n";
    
783  my $spyder = WWW::Spyder->new
784       (seed        => $url,
785        sleep_base  => 10,
786        exit_on     => { pages => 300,
787                         time  => '10min', },);
    
788  while ( my $page = $spyder->crawl ) {
    
789     print '-'x70,"\n";
790     print "Spydering: ", $page->title, "\n";
791     print "      URL: ", $page->url, "\n";
792     print "     Desc: ", $page->description || '', "\n";
793     print '-'x70,"\n";
794     while ( my $link = $page->next_link ) {
795         printf "%22s ->> %s\n",
796         length($link->name) > 22 ?
797             substr($link->name,0,19).'...' : $link->name,
798             length($link) > 43 ?
799                 substr($link,0,40).'...' : $link;
800     }
801  }
    
802 =head2 Primitive page reader
    
803  use WWW::Spyder;
804  use Text::Wrap;
    
805  my $url = shift || die "Please give me a URL to start!\n";
806  @ARGV or die "Please also give me a search term.\n";
807  my $spyder = WWW::Spyder->new;
808  $spyder->seed_url($url);
809  $spyder->terms(@ARGV);
    
810  while ( my $page = $spyder->crawl ) {
811      print '-'x70,"\n * ";
812      print $page->title, "\n";
813      print '-'x70,"\n";
814      print wrap('','', $page->text);
815      sleep 60;
816  }
    
817 =head1 TIPS
    
818 If you are going to do anything important with it, implement some
819 signal blocking to prevent accidental problems and tie your gathered
820 information to a DB_File or some such.
    
821 Right now the module loads C<POSIX::nice(40)>. It won't do that in
822 future versions but you might consider doing it yourself. It should
823 top the nice off at your system's max and prevent your spyder from
824 interfering with your system.
    
825 You might want to to set $| = 1.
    
826 =head1 PRIVATE METHODS 
    
827 =head2 are private but hack away if you're inclined
    
828 =head1 TO DO
    
829 I<Spyder> is conceived to live in a future namespace as a servant class
830 for a complex web research agent with simple interfaces to
831 pre-designed grammars for research reports; or self-designed
832 grammars/reports (might be implemented via Parse::FastDescent if that
833 lazy-bones Conway would just find another 5 hours in the paltry 32
834 hour day he's presently working).
    
835 I'd like the thing to be able to parse RTF, PDF, and perhaps even
836 resource sections of image files but that isn't on the radar right
837 now.
    
838 =head1 TO DOABLE BY 1.0
    
839 Add 2-4 sample scripts that are a bit more useful.
    
840 There are many functions that should be under the programmer's control
841 and not buried in the spyder. They will emerge soon. I'd like to put
842 in hooks to allow the user to keep(), toss(), or exclude(), urls, link
843 names, and domains, while crawl'ing.
    
844 Clean up some redundant, sloppy, and weird code. Probably change or
845 remove the AUTOLOAD.
    
846 Put in a go_to_seed() method and a subclass, ::Seed, with rules to
847 construct query URLs by search engine. It would be the autostart or the
848 fallback for perpetual spyders that run out of links. It would hit a
849 given or default search engine with the I<Spyder>'s terms as the query.
850 Obviously this would only work with terms() defined.
    
851 Implement auto-exclusion for failure vs. success rates on names as well
852 as domains (maybe URI suffixes too).
    
853 Turn length of courtesy queue into the breadth/depth setting? make it
854 automatically adjusting...?
    
855 Consistently found link names are excluded from term strength sorting?
856 Eg: "privacy policy," "read more," "copyright..."
    
857 Fix some image tag parsing problems and add area tag parsing.
    
858 Configuration for user:password by domain.
    
859 ::Page objects become reusable so that a spyder only needs one.
    
860 ::Enqueue objects become indexed so they are nixable from anywhere.
    
861 Expand exit_on routines to size, slept time, dwindling success ratio,
862 and maybe more.
    
863 Make methods to set "skepticism" and "effort" which will influence the
864 way the terms are used to keep, order, and toss URLs.
    
865 =head1 BE WARNED
    
866 This module already does some extremely useful things but it's in its
867 infancy and it is conceived to live in a different namespace and
868 perhaps become more private as a subservient part of a parent class.
869 This may never happen but it's the idea. So don't put this into
870 production code yet. I am endeavoring to keep its interface constant
871 either way. That said, it could change completely.
    
872 =head2 Also!
    
873 This module saves cookies to the user's home. There will be more
874 control over cookies in the future, but that's how it is right now.
875 They live in $ENV{HOME}/spyderCookie.
    
876 =head2 Anche!
    
877 Robot Rules aren't respected. I<Spyder> endeavors to be polite as far
878 as server hits are concerned, but doesn't take "no" for answer right
879 now. I want to add this, and not just by domain, but by page settings.
    
880 =head1 UNDOCUMENTED FEATURES
    
881 A.k.a. Bugs. Don't be ridiculous! Bugs in B<my code>?! 
    
882 There is a bug that is causing retrieval of image src tags, I think
883 but haven't tracked it down yet, as links. I also think the plain text
884 parsing has some problems which will be remedied shortly.
    
885 If you are building more than one spyder in the same script they are
886 going to share the same exit_on parameters because it's a
887 self-installing method. This will not always be so.
    
888 See B<Bugs> file for more open and past issues.
    
889 Let me know if you find any others. If you find one that is platform
890 specific, please send patch code/suggestion b/c I might not have any
891 idea how to fix it.
    
892 =head1 WHY I<Spyder?>
    
893 I didn't want to use the more appropriate I<Spider> because I think
894 there is a better one out there somewhere in the zeitgeist and the
895 namespace future of I<Spyder> is uncertain. It may end up a
896 semi-private part of a bigger family. And I may be King of Kenya
897 someday. One's got to dream.
    
898 If you like I<Spyder>, have feedback, wishlist usage, better
899 algorithms/implementations for any part of it, please let me know!
    
900 =head1 AUTHOR, AUTHOR
    
901 Ashley5, ashley@cpan.org. Bob's your monkey's uncle.
    
902 =head1 COPYRIGHT
    
903 (c)2001-2002 Ashley Pond V. All rights reserved. This program is free
904 software; you may redistribute or modify it under the same terms as
905 Perl.
    
906 =head1 THANKS TO
    
907 Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway,
908 Raphael Manfredi, Gurusamy Sarathy, and plenty of others.
    
909 =head1 COMPARE WITH
    
910 WWW::Robot, LWP::UserAgent, WWW::SimpleRobot, WWW::RobotRules,
911 LWP::RobotUA, and other kith and kin.
    
912 =cut