功能分析:
给定一个 天涯论坛 的帖子URL链接地址,获取从当前地址开始的所有整个 主题帖子的内容。
1, 先看看天涯论坛帖子中 我们所关心的文档结构:
<body> <div class="pagewrap"> <h1><span>xxx</span> <span>标题</span></h1> <div class="function"> <div class="info">点击数 回复数</div> <!-- 或者没有下面的tag--> <div class="pages" id="pageDivTop"> <form> <input type="hidden" name="idArticleslist" value="所有的帖子地址ID"> </form> </div> <!--楼主信息--> <table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table> <div class="allpost" id="pContentDiv"> <!--楼主贴--> <div class="post" _app="eleshare"> <!--回复人信息--> <table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table> <!--回复人帖子--> <div class="post" _app="eleshare"></div> </div> </body>
2,用HTML::TreeBuilder,LWP::Simple,HTML::Element,Encode库 来实现Reference :CPAN
Perl crawl.pl http://www.tianya.cn/publicforum/content/funinfo/1/2854941.shtml >ylbg.txt
Perl crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml > yslt.txt
4,crawl.pl文件源代码,呵呵,写的比较挫 还挺见谅啊 下载
【注意 引用处的 $$ 被CSDN替换为$了,CSDN编辑器太烂,都不支持Perl】
use strict;
use warnings; use HTML::TreeBuilder; use LWP::Simple; use Encode; no utf8; #pragma tells Perl to switch back to treating the source text as literal bytes in the current lexical scope. #check to print the content of the others # if 1,print the others,else 0,not to print my $other=1; $other=0 if (@ARGV > 1); my ($url)=@ARGV; if (@ARGV <1) { print "Input an URL\n"; exit; } # http://www.baidu.com if($url !~/^http:/) { $url="http://$url"; } my ($pagewrap,$title,$auther,#auther $clicks,#click number $totalPage,@idArticleslist,#all Articile ID list $root #HTML document root element ); &PrintPage($url); sub InitRootfromURl() { my ($url)=@_; my $content=get($url); #print "Get HTML document\n"; die "Couldn't get the URL $url!" unless defined $content; $root = HTML::TreeBuilder->new_from_content($content); $root->eof( ); # done parsing for this tree } sub PrintPage() { #http://www.tianya.cn/publicforum/content/feeling/1/1323274.shtml #urlPattern is : #http://www.tianya.cn/publicforum/content/feeling/1 #posfitx is: #shtml my ($link)=@_; my ( $urlPattern,#link pattern $posfitx,#the link postfix $CurrID #Input URL'ID ); my @tmp=split(/\//,$link); $urlPattern=join('/',@tmp[0..@tmp-2]); ($CurrID,$posfitx)=split(/\./,$tmp[@tmp-1]); @tmp=(); &InitRootfromURl($link); &InitFromfirstLink(\$root); return unless ($idArticleslist[0]); if ($CurrID ne $idArticleslist[0]) { $link="$urlPattern/$idArticleslist[0].$posfitx"; &InitRootfromURl($link); &InitFromfirstLink(\$root); $root->delete(); #the first page will be deleted. } {#This infor below got from the first page. print "title:";&toGBK(\$title) if ($title); print "\nauther:";&toGBK(\$auther) if ($auther); print "\nclicks:";&toGBK(\$clicks) if ($clicks); if ($totalPage){ print "\n";&toGBK(\$totalPage); print "\n";} } #Print the content infor of the all posts print "正文:\n"; my ($i,$start)=(0,0); foreach my $id (@idArticleslist) { $i++; $start=1 if ( !$start && ($id eq $CurrID)); if($start) { print "\nPage $i\n"; $link="$urlPattern/$id.$posfitx"; &InitRootfromURl($link); &ParseAllPost(\$root); $root->delete();;# destroy the document } } } sub toGBK() { my ($content)=@_; $$content=~ s/牋牋/ /g; #print $$content;return 0; #If your OS is English,please comment it my @chars=split(//,$$content); foreach my $char (@chars) { print encode("gbk",$char); } } sub printContent() { my ($content)=@_; $$content =~ s/\s+$//g; $$content =~ s/TML::Element=.+$//; &toGBK($content); } sub printReplier() { my ($writer)=@_; return if ($$writer =~ /^\s*$/); $$writer =~ s/\s+//; my $dil="-" x length(Encode::encode_utf8($$writer)); print "\n$dil\n|"; &toGBK($writer); print "|\n$dil\n"; } #===== #print content info sub ParsePost() { my ($node)=@_; foreach my $n ($$node->content_list()) { if ( $n =~ /^HTML::Element=/) { next if (! (my $tag=$n->tag())); #print "$tag\n"; if ( $tag eq 'br'){print "\n"; next;} next if ( ($tag eq 'div') && ($n->attr('class')) && ($n->attr('class') eq 'post-jb')); next if ( $tag eq 'img'); next if ( $tag eq 'span'); next if ( $tag eq 'u'); &ParsePost(\$n); } { &printContent(\$n); } } } #Parse allpost #===== #print content info sub ParseAllPost() { my ($node)=@_; #allpost,HTML::Element my ($tag,$attr,@poster,$writer,$isShow); return unless ($$node); my @a=$$node->look_down(_tag=>'div',class=>'allpost',id=>'pContentDiv'); return unless ($a[0]); $isShow=1; foreach my $n ($a[0]->content_list()) { next if ( $n !~ /^HTML::Element=/); next if (! ($tag=$n->tag())); #parse the content of writer if ( $isShow && ($tag eq 'div')) { next if (!($attr=$n->attr('class'))); next if ($attr ne 'post'); &ParsePost(\$n); } #parse the writer if ( $tag eq 'table') { @poster = $n->look_down(_tag=>'a',target=>'_blank'); $writer=$poster[0]->as_text(); if (!$other) { if ($writer eq $auther){$isShow =1;} else {$isShow = 0;} } if ($isShow) { &printReplier(\$writer); } } } #$$node->delete();#Delete all elements } #Initialize the auther info and link info #Parse pagewrap and,find title,auther,and all links sub InitFromfirstLink() { my ($node)=@_; my ($tag,@tmp,@a,$b); #find the 'pagewrap' tag return unless ($$node); @tmp = $$node->look_down(_tag=>'div',class=>'pagewrap'); return unless ($tmp[0]); foreach my $n ($tmp[0]->content_list()) { next if ( $n !~ /^HTML::Element=/); next if (! ($tag=$n->tag())); if ( $tag eq 'h1') { @tmp = $n->look_down(_tag=>'span'); next unless ($tmp[1]); $title=$tmp[1]->as_text(); @tmp=(); next; } if ( $tag eq 'table') { @tmp = $n->look_down(_tag=>'a'); next unless ($tmp[0]); $auther=$tmp[0]->as_text(); @tmp=(); next; } next if (!($attr=$n->attr('class'))); if ( ($tag eq 'div') && ( $attr eq 'function')) { @tmp = $n->content_list(); { @a=(); @a=$n->look_down(_tag=>'span'); if ( @a > 0) {$totalPage=$a[0]->as_text();} @a=(); } if ($tmp[0]->attr('class') eq 'info') { @a=split(/ /,$tmp[0]->as_text()); $clicks=$a[1]; @a=(); }else{ #print "error:",$tmp[0]->dump(); next; } {#get idArticleslist @a=(); @a=$n->look_down(_tag=>'input',name=>'idArticleslist'); if ( @a > 0){ $b=$a[0]->attr('value'); @idArticleslist=split(/,/,$b); } @a=(); } #print $tmp[1]->dump(); @tmp=(); next; } if ( ($tag eq 'div') && ( $attr eq 'allpost')) { #$$node->detach; last; } } } =cut 天涯帖子结构 <body> <div class="pagewrap"> <h1><span>xxx</span> <span>标题</span></h1> <div class="function"> <div class="info">点击数 回复数</div> <!-- 或者没有下面的tag--> <div class="pages" id="pageDivTop"> <form> <input type="hidden" name="idArticleslist" value="所有的帖子地址ID"> </form> </div> <!--楼主信息--> <table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>作者</a>发表日期</td></table> <div class="allpost" id="pContentDiv"> <!--楼主贴--> <div class="post" _app="eleshare"> <!--回复人信息--> <table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>作者</a>发表日期</font></table> <!--回复人帖子--> <div class="post" _app="eleshare"></div> </div> </body> =cut