Perl @ 10 March 2011, “5,359 Comments”
Here’s a little contribution: a Perl script I used a long time ago to pull web content. The example itself pulls content from noticias24.com, a Venezuelan news blog. Use the script wisely,
for educational purposes. I will post some useful scripts later on to install Perl modules fast without CPAN.
#!/usr/bin/perl
use HTML::Tree;
use LWP::Simple;
use encoding "UTF-8";
# Front-page scraper: fetch the noticias24.com home page, print the title of
# every post found on it and pass each post's link to get_noticia().
use strict;
use warnings;

my $main_site = "http://www.noticias24.com";

# LWP::Simple's get() returns undef when the download fails; without a page
# there is nothing to scrape, so stop with a clear message.
my $content = get($main_site);
defined $content
    or die "Could not fetch $main_site\n";

my $tree = HTML::Tree->new;
$tree->parse($content);
$tree->eof;    # signal end of document so the parser finalizes the tree

# look_down compares attribute values for string equality, so this selects
# only <div class="post x"> and never compares an undefined class attribute.
my @posts = $tree->look_down('_tag', 'div', 'class', 'post x');

foreach my $post (@posts) {

    # content_list mixes element objects with plain text strings; only the
    # objects can answer ->tag, hence the ref() guard.
    my ($heading) = grep { ref $_ && $_->tag eq 'h2' } $post->content_list;
    next unless $heading;

    print "Title: ", $heading->as_text, "\n";

    # The article URL comes from the first <a> inside the <h2> that actually
    # carries an href attribute.
    my $link = $heading->look_down(
        '_tag', 'a',
        sub { defined $_[0]->attr('href') },
    );
    unless ($link) {
        warn "No link found for this post, skipping\n";
        next;
    }

    get_noticia($link->attr('href'));
}

# Release the tree explicitly; HTML::Tree versions without weak references
# do not free it on their own.
$tree->delete;
# get_noticia($url)
#
# Downloads the article page at $url and prints the text of every
# <div class="theContent"> found on it, one per line, followed by a blank
# separator line.
#
# If $url is undefined or the download fails, a message goes to STDERR and
# only the separator line is printed. Returns nothing.
sub get_noticia {
    my ($noticia) = @_;

    # LWP::Simple's get() returns undef on failure; do not try to parse that.
    my $content = defined $noticia ? get($noticia) : undef;
    unless (defined $content) {
        my $shown = defined $noticia ? $noticia : '(no URL given)';
        warn "Could not fetch $shown\n";
        print "\n";
        return;
    }

    my $noti_tree = HTML::Tree->new;
    $noti_tree->parse($content);
    $noti_tree->eof;    # signal end of document so the parser finalizes the tree

    # look_down compares attribute values for string equality, so divs that
    # have no class attribute are skipped without an undef comparison.
    my @bodies = $noti_tree->look_down('_tag', 'div', 'class', 'theContent');
    foreach my $body (@bodies) {
        print $body->as_text, "\n";
    }
    print "\n";

    # One tree is built per article; release it explicitly because HTML::Tree
    # versions without weak references do not free it on their own.
    $noti_tree->delete;
    return;
}