TopCoder 是一个非常好的学习网站,但由于服务器在国外,访问常常很慢;而且在线查看也常常不太方便。因此我用 Perl 写了个小爬虫,批量抓取想要的题目和解答,非常好用。
注意:抓取需要 cookie。先自己登录,再用 Fiddler 抓个包,就可以看到 cookie 了。脚本会读取 ../data/cookie.txt 的第一行,并把它作为 curl 的命令行参数(例如 -b "name=value")。
#!/usr/bin/perl
#
# Batch-download TopCoder SRM problem statements and solutions.
#
# The script walks the paginated match list, and for every SRM found it
# stores, per division, each problem statement plus up to five solution pages
# under ../archive/SRM_<number>/Div_<division>/.
#
# Pages are fetched with the external `curl` program.  The first line of
# ../data/cookie.txt is passed to curl as extra command-line options (for
# example: -b "name=value"), which is how the login cookie is supplied.
use strict;
use warnings;

use File::Basename qw(dirname);
use File::Path qw(make_path);
use Text::ParseWords qw(shellwords);

# Prefixes substituted for the word "problem" in a statement file name to
# build the solution file names.  The order mirrors the order in which the
# solution links are picked from the problem-detail page (slot 0 .. 4).
my @SOLVE_KINDS = qw(Java_solve Cpp_solve Csharp_solve VB_solve Overall_solve);

# Entry point: collect every SRM, then download statements and solutions for
# both divisions of each one.
sub main {
    my @SRM_url_list;
    my @SRM_idx;
    get_SRM_url_list(\@SRM_url_list, \@SRM_idx);

    # Iterate up to the LAST index only; going one past the end would
    # process an undefined SRM number and URL.
    foreach my $index (0 .. $#SRM_idx) {
        my $srm_number = $SRM_idx[$index];
        my $srm_url    = $SRM_url_list[$index];

        foreach my $div (1 .. 2) {
            my $div_dir = "../archive/SRM_$srm_number/Div_$div";
            _ensure_dir($div_dir);

            my @plm_url;
            my @plm_file_name;
            get_div_problem($srm_number, $srm_url, $div,
                \@plm_url, \@plm_file_name);

            foreach my $j (0 .. $#plm_url) {
                get_plm_stat($div_dir, $plm_file_name[$j], $plm_url[$j]);
                get_plm_solve($div_dir, $plm_url[$j], $div,
                    $plm_file_name[$j]);
            }
        }
    }
    return;
}

# Download one problem statement page.
#
#   $div_dir        directory the file is written into
#   $plm_file_name  file name of the statement inside $div_dir
#   $plm_url        URL of the statement page
#
# Returns whatever get_html_to_file() returns.
sub get_plm_stat {
    my ($div_dir, $plm_file_name, $plm_url) = @_;
    return get_html_to_file($plm_url, $div_dir . "/" . $plm_file_name);
}

# Download the solution pages linked from a problem's detail page.
#
#   $div_dir        directory the solution files are written into
#   $plm_url        URL of the problem statement page
#   $div            division number (1 or 2)
#   $plm_file_name  optional; statement file name from which the solution
#                   file names are derived by replacing "problem" with
#                   Java_solve, Cpp_solve, ...  Defaults to "problem.html".
#
# Returns nothing.  Silently does nothing when the statement page carries no
# problem-detail link.
sub get_plm_solve {
    my ($div_dir, $plm_url, $div, $plm_file_name) = @_;
    $plm_file_name = 'problem.html' unless defined $plm_file_name;

    my $plm_str = get_html_to_string($plm_url);
    return unless $plm_str =~ /ProblemDetail(.+)\"\>Single/;
    my $detail_suffix = $1;

    my $plm_detail = get_html_to_string(
        "http://community.topcoder.com/tc?module=ProblemDetail"
            . $detail_suffix);

    # Strip "amp;" so "&amp;" becomes "&", then break the text on the
    # remaining semicolons so the greedy ".+" below cannot run past a link.
    my $solve_list = $plm_detail;
    1 while $solve_list =~ s/amp;//g;
    $solve_list =~ s/;/\n/g;

    my @temp_list = $solve_list =~ /problem_solution(.+)/g;

    # With ten or more links, the second group of five is used for
    # division 2; otherwise only the first five are considered.
    my $base  = @temp_list >= 10 ? $div : 1;
    my $first = ($base - 1) * 5;

    foreach my $slot (0 .. $#SOLVE_KINDS) {
        my $link = $temp_list[ $first + $slot ];
        next unless defined $link;    # fewer links than expected

        my $temp_url = "http://community.topcoder.com/"
            . "stat?c=problem_solution" . $link;
        next unless $temp_url =~ /(.+)\" class\=/;
        my $solve_url = $1;

        # Only links whose cr= value is a positive number are fetched.
        next unless $solve_url =~ /cr=(\d+)/ && $1 > 0;

        my $kind = $SOLVE_KINDS[$slot];
        (my $solve_name = $plm_file_name) =~ s/problem/$kind/g;
        get_html_to_file($solve_url, $div_dir . "/" . $solve_name);
    }
    return;
}

# Collect the problem URLs and statement file names of one division.
#
#   $SRM_idx        SRM number (not used for the lookup itself)
#   $SRM_url        URL of the SRM's overview page
#   $div            division number (1 or 2)
#   $plm_url        array ref; problem statement URLs are appended to it
#   $plm_file_name  array ref; matching file names are appended to it
#
# Returns nothing.  Leaves both arrays untouched when the division's section
# cannot be located in the page.
sub get_div_problem {
    my ($SRM_idx, $SRM_url, $div, $plm_url, $plm_file_name) = @_;

    # Join the page into a single line so ".+" can span what used to be
    # several lines.
    my $srm_div_list = get_html_to_string($SRM_url);
    $srm_div_list =~ s/\n/;/g;

    my $plm_list_str;
    if ($div == 1) {
        ($plm_list_str) = $srm_div_list
            =~ /Division I Problem Stats(.+)Division II Problem Stats/;
    }
    elsif ($div == 2) {
        ($plm_list_str) = $srm_div_list
            =~ /Division II Problem Stats(.+)submitForm/;
    }
    return unless defined $plm_list_str;

    # Restore the line structure.  NOTE: any ";" that was already present in
    # the page is turned into a newline as well.
    $plm_list_str =~ s/;/\n/g;

    my @level_prefix = (
        "Level_One_problem_",
        "Level_Two_problem_",
        "Level_Three_problem_",
    );

    my $cnt = 0;
    while ($plm_list_str
        =~ /HREF\=\"(.+)\" class\=\"statText\"\>(.+)\<\/A\>\<\/td\>/g)
    {
        my ($href, $plm_name) = ($1, $2);
        push @$plm_url, "http://community.topcoder.com" . $href;

        my $plm_level
            = $cnt < @level_prefix ? $level_prefix[$cnt] : "Level_None_problem_";
        $cnt++;

        # The name comes from scraped HTML and ends up in a file path, so
        # keep it from introducing extra path components.
        $plm_name =~ s{[/\\]}{_}g;
        push @$plm_file_name, $plm_level . $plm_name . ".html";
    }
    return;
}

# Walk the paginated match list and collect every SRM.
#
#   $SRM_url_list  array ref; SRM overview URLs are appended to it
#   $SRM_idx       array ref; the matching SRM numbers are appended to it
#
# Paging starts at 1 and advances by 50.  It stops when the site answers
# with its error page, or once the start index reaches 5000.
sub get_SRM_url_list {
    my ($SRM_url_list, $SRM_idx) = @_;

    my $pre_url
        = "http://community.topcoder.com/tc?module=MatchList&sc=&sd=&nr=50&sr=";
    my $MAX_SRM_CNT = 5000;

    for (my $index = 1; $index < $MAX_SRM_CNT; $index += 50) {
        my $SRM_list_page = get_html_to_string($pre_url . "$index");

        if ($SRM_list_page
            =~ /An error has occurred when attempting to process your request/)
        {
            print "index=$index,ALL list has been got\n";
            print @$SRM_url_list;
            last;
        }

        while ($SRM_list_page
            =~ /\<td class\=\"value\" Nowrap\=\"Nowrap\"\>\<a href\=\"(.+)">SRM (\d+)\<\/a\>\<\/td\>/g)
        {
            my ($suf_url, $srm_idx) = ($1, $2);
            push @$SRM_url_list, "http://community.topcoder.com" . $suf_url;
            push @$SRM_idx,      $srm_idx;
        }
    }
    return;
}

# Fetch a URL and return the response body as a string.
#
# The body is downloaded to a scratch file, read back and the scratch file
# is removed.  Always returns a defined value; it is the empty string when
# nothing could be downloaded.
sub get_html_to_string {
    my ($url) = @_;
    my $temp_file = "../archive/file.tmp.txt";

    get_html_to_file($url, $temp_file);

    my $str = '';
    if (open my $fh, '<', $temp_file) {
        my $content = do { local $/; <$fh> };
        close $fh;
        $str = $content if defined $content;
    }
    unlink $temp_file;
    return $str;
}

# Download a URL into a file using curl.
#
#   $url          address to fetch
#   $output_file  path the response body is written to; its directory is
#                 created when missing
#
# Returns the status reported by system() (0 on success), or -1 when no URL
# was given.  Failures are reported with warn() and are not fatal.
sub get_html_to_file {
    my ($url, $output_file) = @_;

    unless (defined $url && length $url) {
        warn "get_html_to_file: no URL given, nothing fetched\n";
        return -1;
    }

    _ensure_dir(dirname($output_file));

    # List form: curl is run directly, so nothing in the scraped URL or in
    # the file name is ever interpreted by a shell.
    my $status = system('curl', $url, _cookie_args(), '-o', $output_file);
    warn "curl failed for $url (status $status)\n" if $status != 0;
    return $status;
}

{
    my $cookie_args;    # array ref, filled in on first use

    # Return the extra curl options stored on the first line of
    # ../data/cookie.txt, split into words the way a shell would split them.
    # The file is read once; a missing file only produces a warning and the
    # downloads then run without a cookie.
    sub _cookie_args {
        return @{$cookie_args} if $cookie_args;
        $cookie_args = [];

        my $cookie_file = "../data/cookie.txt";
        if (open my $fh, '<', $cookie_file) {
            my $line = <$fh>;
            close $fh;
            if (defined $line) {
                chomp $line;
                $cookie_args = [ shellwords($line) ];
                warn "cannot parse the first line of $cookie_file\n"
                    if $line =~ /\S/ && !@{$cookie_args};
            }
        }
        else {
            warn "cannot open $cookie_file: $!; fetching without a cookie\n";
        }
        return @{$cookie_args};
    }
}

# Create a directory, including missing parents.  Returns 1 when the
# directory exists afterwards, 0 otherwise (after warning about the cause).
sub _ensure_dir {
    my ($dir) = @_;
    return 1 if -d $dir;

    make_path($dir, { error => \my $errors });
    if ($errors && @{$errors}) {
        for my $diag (@{$errors}) {
            my ($path, $message) = %{$diag};
            warn "cannot create directory $dir: $message\n";
        }
        return 0;
    }
    return 1;
}

main();