Perl @ 10 March 2011, “5,359 Comments”

Here’s a little contribution: a Perl script I used a long time ago to pull web content. The example itself pulls content from noticias24.com, a Venezuelan news blog. Use the script wisely :) for educational purposes. Later on I will post some useful scripts to install Perl modules fast without CPAN.


#!/usr/bin/perl
use HTML::Tree;
use LWP::Simple;
use encoding "UTF-8";

# Scrape the front page of the news site: for every <div class="post x">,
# print the headline held in its first direct <h2> child, then print the
# linked article through get_noticia().
my $main_site = "http://www.noticias24.com";
my $content   = get($main_site);
die "Could not fetch $main_site\n" unless defined $content;

my $tree = HTML::Tree->new;
$tree->parse($content);
$tree->eof;    # signal end of document so the parser flushes pending text

# look_down compares attribute values by exact string equality, so this
# selects the same divs as testing attr('class') eq 'post x' by hand,
# without comparing an undefined class attribute.
my @posts = $tree->look_down('_tag', 'div', 'class', 'post x');

foreach my $post (@posts) {
    foreach my $child ($post->content_list) {
        # content_list mixes element objects with plain text strings;
        # only objects can answer ->tag.
        next unless ref $child;
        next unless $child->tag eq 'h2';

        print "Title: ", $child->as_text, "\n";

        # The article link is expected to be the first child of the <h2>.
        my ($link) = $child->content_list;
        my $url = ref $link ? $link->attr('href') : undef;
        if (defined $url) {
            get_noticia($url);
        }
        else {
            warn "No link found for this headline; skipping article\n";
        }

        last;    # only the first <h2> of each post is used
    }
}

$tree->delete;    # release the parse tree now that scraping is done

# Fetch one article page and print the text of every <div class="theContent">.
#
# Parameters:
#   $noticia - URL of the article to download.
#
# Output:
#   The text of each matching div followed by a newline, then one blank
#   line. When the page cannot be downloaded a warning is emitted and only
#   the blank line is printed.
sub get_noticia {
    my ($noticia) = @_;

    my $content = defined $noticia ? get($noticia) : undef;

    if (defined $content) {
        my $noti_tree = HTML::Tree->new;
        $noti_tree->parse($content);
        $noti_tree->eof;    # signal end of document to the parser

        # look_down compares the class attribute by exact string equality,
        # so divs without a class attribute are skipped without comparing
        # an undefined value.
        my @bodies = $noti_tree->look_down('_tag', 'div', 'class', 'theContent');
        foreach my $body (@bodies) {
            print $body->as_text, "\n";
        }

        # This sub runs once per headline; free each tree when done with it.
        $noti_tree->delete;
    }
    else {
        warn "Could not fetch article",
            (defined $noticia ? " $noticia" : ""), "\n";
    }

    print "\n";
}