TopCoder 是一個非常好的學習網站,但由於服務器在國外,訪問常常很慢;而且在線查看題目也常常不太方便。因此我用 Perl 寫了個小爬蟲,批量抓取想要的題目和解答,非常好用。
注意:抓取時需要 cookie。自己先登錄,再用 Fiddler 抓個包,就可以看到 cookie 了。
#!/usr/bin/perl
use strict;
# Entry point of the crawler.
#
# Collects the list of SRM overview pages, then for every SRM and for both
# divisions downloads each problem statement and the solutions linked from
# the problem's detail page.
#
# Files are written below ../archive/SRM_<idx>/Div_<1|2>/ (relative to the
# current working directory).
sub main {
    my @SRM_url_list = ();
    my @SRM_idx      = ();
    get_SRM_url_list(\@SRM_url_list, \@SRM_idx);

    # Walk valid indices only. Iterating up to scalar(@SRM_idx) would run one
    # extra pass with an undefined SRM id and URL.
    foreach my $index (0 .. $#SRM_idx) {
        my $SRM_idx = $SRM_idx[$index];
        my $SRM_url = $SRM_url_list[$index];

        foreach my $div (1 .. 2) {
            my $div_dir = "../archive/SRM_$SRM_idx/Div_$div";

            # List-form system() bypasses the shell, so the path is passed to
            # mkdir verbatim.
            if (system('mkdir', '-p', $div_dir) != 0) {
                warn "cannot create directory $div_dir, skipping division $div of SRM $SRM_idx\n";
                next;
            }

            # get_div_problem fills both arrays in parallel: entry $j of one
            # belongs to entry $j of the other.
            my @plm_url       = ();
            my @plm_file_name = ();
            get_div_problem($SRM_idx, $SRM_url, $div, \@plm_url, \@plm_file_name);

            foreach my $j (0 .. $#plm_url) {
                get_plm_stat($div_dir, $plm_file_name[$j], $plm_url[$j]);
                get_plm_solve($div_dir, $plm_file_name[$j], $plm_url[$j], $div);
            }
        }
    }
    return;
}
# Download a single problem statement page into the division directory.
#
#   $div_dir       - directory the file is written to
#   $plm_file_name - file name to use inside $div_dir
#   $plm_url       - URL of the problem statement page
sub get_plm_stat {
    my ($div_dir, $plm_file_name, $plm_url) = @_;
    get_html_to_file($plm_url, join("/", $div_dir, $plm_file_name));
}
# Download the solutions linked from a problem's detail page.
#
#   $div_dir       - directory the solution files are written to
#   $plm_file_name - file name of the problem statement; the solution file
#                    names are derived from it by replacing "problem" with a
#                    per-solution tag (Java_solve, Cpp_solve, ...)
#   $plm_url       - URL of the problem statement page
#   $div           - division number (1 or 2)
#
# The statement page is fetched to find the "ProblemDetail" link; the detail
# page is then scanned for "problem_solution" links. Links are taken in
# groups of five. When at least ten links are present the group is chosen by
# division; otherwise the first group is used. A link is only downloaded if
# its "cr" parameter is a positive number.
#
# Returns nothing. Missing pages or links are skipped silently.
sub get_plm_solve {
    my ($div_dir, $plm_file_name, $plm_url, $div) = @_;

    my $plm_str = get_html_to_string($plm_url);
    return unless defined $plm_str;

    # Without a detail link there is nothing to download.
    my ($detail_suffix) = $plm_str =~ /ProblemDetail(.+)\"\>Single/;
    return unless defined $detail_suffix;

    my $solve_list = get_html_to_string(
        "http://community.topcoder.com/tc?module=ProblemDetail" . $detail_suffix);
    return unless defined $solve_list;

    # Normalise the page so that every link fragment ends at a line break:
    # join all lines with ';', strip the "amp;" left-overs of HTML-escaped
    # '&' (repeatedly, as the original did), then break on every ';'.
    $solve_list =~ s/\n/;/g;
    1 while $solve_list =~ s/amp;//g;
    $solve_list =~ s/;/\n/g;

    # '.' does not match a newline, so each capture stops at the line break.
    my @temp_list = $solve_list =~ /problem_solution(.+)/g;

    # Position within a group of five decides the tag used in the file name.
    my @solve_tag = qw(Java_solve Cpp_solve Csharp_solve VB_solve Overall_solve);

    my $base   = @temp_list >= 10 ? $div : 1;
    my $offset = ($base - 1) * scalar(@solve_tag);

    foreach my $slot (0 .. $#solve_tag) {
        my $link = $temp_list[$offset + $slot];
        next unless defined $link;    # fewer links than expected

        # Keep only the part of the link that precedes the `" class=` attribute.
        my $temp_url = "http://community.topcoder.com/" . "stat?c=problem_solution" . $link;
        my ($solve_url) = $temp_url =~ /(.+)\" class\=/;
        next unless defined $solve_url;

        # Only links with a positive "cr" value are downloaded.
        next unless $solve_url =~ /cr=(\d+)/ && $1 > 0;

        (my $solve_name = $plm_file_name) =~ s/problem/$solve_tag[$slot]/g;
        get_html_to_file($solve_url, $div_dir . "/" . $solve_name);
    }
    return;
}
# Collect the problem links of one division from an SRM overview page.
#
#   $SRM_idx       - SRM number (not used by this routine)
#   $SRM_url       - URL of the SRM overview page
#   $div           - division number; only 1 and 2 are handled
#   $plm_url       - array ref; receives one absolute problem URL per problem
#   $plm_file_name - array ref; receives the matching file name,
#                    "Level_<One|Two|Three|None>_problem_<link text>.html"
#
# Both output arrays are appended to in parallel.
sub get_div_problem {
    my ($SRM_idx, $SRM_url, $div, $plm_url, $plm_file_name) = @_;

    # Join all lines with ';' so that '.+' below can span the whole section.
    my $overview = get_html_to_string($SRM_url);
    $overview =~ s/\n/;/g;

    # Cut out the part of the page that belongs to the requested division.
    my $section;
    if ($div == 1) {
        $section = $1
            if $overview =~ /Division I Problem Stats(.+)Division II Problem Stats/;
    }
    elsif ($div == 2) {
        $section = $1
            if $overview =~ /Division II Problem Stats(.+)submitForm/;
    }

    # Break the section at every ';' so each link is matched on its own line.
    $section =~ s/;/\n/g;

    # The first three links found are labelled One/Two/Three, any further
    # link falls back to "None".
    my @level_prefix = (
        "Level_One_problem_",
        "Level_Two_problem_",
        "Level_Three_problem_",
    );
    my $seen = 0;
    while ($section =~ /HREF\=\"(.+)\" class\=\"statText\"\>(.+)\<\/A\>\<\/td\>/g) {
        my ($href, $title) = ($1, $2);
        my $prefix = $seen < scalar(@level_prefix)
            ? $level_prefix[$seen]
            : "Level_None_problem_";
        $seen++;
        push @$plm_url,       "http://community.topcoder.com" . $href;
        push @$plm_file_name, $prefix . $title . ".html";
    }
}
# Build the list of SRM overview pages by paging through the match list.
#
#   $SRM_url_list - array ref; receives the absolute URL of every SRM found
#   $SRM_idx      - array ref; receives the matching SRM number
#
# The match list is requested 50 entries at a time, starting at entry 1.
# Paging stops when the site answers with its error page or when the start
# index reaches the hard upper bound.
sub get_SRM_url_list {
    my ($SRM_url_list, $SRM_idx) = @_;

    my $MAX_SRM_CNT = 5000;
    my $pre_url     = "http://community.topcoder.com/tc?module=MatchList&sc=&sd=&nr=50&sr=";
    my $index       = 1;

    PAGE:
    while (1) {
        my $SRM_list_page = get_html_to_string($pre_url . "$index");

        # The error page marks the end of the list: report and stop.
        if ($SRM_list_page =~/An error has occurred when attempting to process your request/) {
            print "index=$index, ALL list has been got\n";
            print @$SRM_url_list;
            last PAGE;
        }

        # Every table cell that links to "SRM <number>" yields one entry.
        while ($SRM_list_page =~ /\<td class\=\"value\" nowrap\=\"nowrap\"\>\<a href\=\"(.+)">SRM (\d+)\<\/a\>\<\/td\>/g) {
            my ($path, $number) = ($1, $2);
            push @$SRM_url_list, "http://community.topcoder.com" . $path;
            push @$SRM_idx,      $number;
        }

        $index += 50;
        last PAGE if $index >= $MAX_SRM_CNT;
    }
}
# Fetch a URL and return the response body as a string.
#
#   $url - address to download
#
# The page is downloaded through get_html_to_file() into a fixed temporary
# file, read back and the file is removed again. Returns the raw content, or
# an empty string when nothing could be read (e.g. the download failed).
sub get_html_to_string {
    my ($url) = @_;
    my $temp_file = "../archive/file.tmp.txt";

    # Drop any left-over from an interrupted run so that a failed download
    # can never hand back the content of an earlier page.
    unlink $temp_file;

    get_html_to_file($url, $temp_file);

    my $str = '';
    if (open my $fh, '<:raw', $temp_file) {
        local $/;                      # slurp the whole file at once
        my $content = <$fh>;
        close $fh;
        $str = $content if defined $content;
    }
    else {
        warn "cannot read $temp_file for $url: $!\n";
    }

    unlink $temp_file;
    return $str;
}
# Download a URL into a file by running curl.
#
#   $url         - address to download
#   $output_file - path the response body is written to (curl -o)
#
# The first line of ../data/cookie.txt is inserted verbatim into the curl
# command line. NOTE(review): presumably that line holds ready-made curl
# options carrying the login cookie (e.g. -b / -H) - confirm against the
# actual cookie file. If the file cannot be read, the download is attempted
# without it.
#
# Returns the value of system(), i.e. 0 when curl exited successfully.
sub get_html_to_file {
    my ($url, $output_file) = @_;
    my $cookie_file = "../data/cookie.txt";

    my $cookie = '';
    if (open my $fh, '<', $cookie_file) {
        my $line = <$fh>;
        close $fh;
        if (defined $line) {
            chomp $line;
            $cookie = $line;
        }
    }
    else {
        warn "cannot read cookie file $cookie_file: $!; fetching without cookie\n";
    }

    # Single-quote a value for the shell so that characters such as spaces,
    # '&', '$' or backticks in a URL or file name are passed through literally.
    my $shell_quote = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };

    # The cookie part stays unquoted on purpose: it has to be split into
    # separate curl arguments by the shell.
    my $cmd = "curl " . $shell_quote->($url) . " $cookie -o " . $shell_quote->($output_file);
    return system($cmd);
}
# Start the crawl when the script is executed.
main();