Perl程序抓取TopCoder的題目和解答

TopCoder是一個非常好的學習網站,但由於服務器在國外,訪問常常很慢;而且在線查看也常常不太方便。因此用Perl寫了個小爬蟲,批量抓取想要的題目和解答,非常好用。

注意:抓取是需要cookie的。自己登錄後,用Fiddler抓個包就可以看到cookie了;程序會讀取 ../data/cookie.txt 的第一行,並把它原樣拼接到curl命令行中,所以該行應寫成curl可以識別的參數形式。

#!/usr/bin/perl
use strict;

# Entry point: collect every SRM listed on TopCoder, then for each SRM and
# each division (1 and 2) create ../archive/SRM_<n>/Div_<d> and store every
# problem statement together with its available solutions there.
sub main {
	my @SRM_url_list = ();
	my @SRM_idx = ();
	get_SRM_url_list(\@SRM_url_list, \@SRM_idx);
	# Iterate up to the last valid index ($#SRM_idx). The former upper bound
	# of scalar(@SRM_idx) ran one extra pass with an undefined SRM number and
	# URL, creating "../archive/SRM_/Div_*" and issuing bogus requests.
	foreach my $index (0 .. $#SRM_idx) {
		my $SRM_idx = $SRM_idx[$index];
		my $SRM_url = $SRM_url_list[$index];
		foreach my $div (1 .. 2) {
			my $div_dir = "../archive/SRM_$SRM_idx/Div_$div";
			# List-form system() avoids the shell and the pointless output
			# capture of backticks.
			system('mkdir', '-p', $div_dir);
			my @plm_url = ();
			my @plm_file_name = ();
			get_div_problem($SRM_idx, $SRM_url, $div, \@plm_url, \@plm_file_name);
			foreach my $j (0 .. $#plm_url) {
				get_plm_stat($div_dir, $plm_file_name[$j], $plm_url[$j]);
				get_plm_solve($div_dir, $plm_file_name[$j], $plm_url[$j], $div);
			}
		}
	}
}

# Download the problem statement page at $plm_url and store it as
# "<div_dir>/<plm_file_name>".
sub get_plm_stat {
	my ($div_dir, $plm_file_name, $plm_url) = @_;
	get_html_to_file($plm_url, join("/", $div_dir, $plm_file_name));
}

# Download the solutions linked from a problem's detail page.
#
#   $div_dir        directory the solution files are written into
#   $plm_file_name  file name of the problem statement; every "problem" in it
#                   is replaced by a per-column label to name a solution file
#   $plm_url        URL of the problem statement page
#   $div            division number, used to pick the block of five links
#
# The detail page is located through the "ProblemDetail...">Single" link on
# the statement page. Every "problem_solution" link found on it is collected;
# five consecutive links are used, labelled in order Java, Cpp, Csharp, VB,
# Overall. A link is fetched only when its "cr=" value is greater than zero.
sub get_plm_solve {
	my ($div_dir, $plm_file_name, $plm_url, $div) = @_;
	my @label_of = ("Java_solve", "Cpp_solve", "Csharp_solve", "VB_solve", "Overall_solve");

	my $statement_page = get_html_to_string($plm_url);
	my $detail_page;
	if ($statement_page =~ /ProblemDetail(.+)\"\>Single/) {
		$detail_page = get_html_to_string("http://community.topcoder.com/tc?module=ProblemDetail" . $1);
	}

	# Join the lines with ';', strip every "amp;" (repeating until none is
	# left), then turn each ';' into a line break so that every link ends up
	# on a line of its own.
	$detail_page =~ s/\n/;/g;
	1 while $detail_page =~ s/amp;//g;
	$detail_page =~ s/;/\n/g;

	# Rest of each line following "problem_solution".
	my @link_tail;
	while ($detail_page =~ /problem_solution(.+)/g) {
		push @link_tail, $1;
	}

	# With ten or more links the block of five is chosen by division;
	# otherwise the first five are used.
	my $base = (scalar(@link_tail) >= 10) ? $div : 1;
	my $first = ($base - 1) * 5;

	foreach my $i ($first .. $first + 4) {
		my $label = $label_of[$i % 5];
		my $solve_name = $plm_file_name;
		$solve_name =~ s/problem/$label/g;

		my $raw_url = "http://community.topcoder.com/" . "stat?c=problem_solution" . $link_tail[$i];

		# Keep only the part of the link that precedes `" class=`.
		my $solve_url;
		if ($raw_url =~ /(.+)\" class\=/) {
			$solve_url = $1;
		}

		if ($solve_url =~ /cr=(\d+)/) {
			if ($1 > 0) {
				get_html_to_file($solve_url, $div_dir . "/" . $solve_name);
			}
		}
	}
}

# Collect the problems of one division of an SRM.
#
#   $SRM_idx        SRM number (not used by the body)
#   $SRM_url        URL of the SRM page
#   $div            1 or 2; selects which "Problem Stats" section is scanned
#   $plm_url        array ref; receives the absolute URL of each problem
#   $plm_file_name  array ref; receives "Level_<One|Two|Three|None>_problem_<name>.html"
#
# The first three problems found are labelled One, Two and Three; any further
# problem is labelled None.
sub get_div_problem {
	my ($SRM_idx, $SRM_url, $div, $plm_url, $plm_file_name) = @_;
	my @level_prefix = ("Level_One_problem_", "Level_Two_problem_", "Level_Three_problem_");

	# Flatten the page onto one line so the section patterns can span what
	# were originally several lines.
	my $page = get_html_to_string($SRM_url);
	$page =~ s/\n/;/g;

	my $section;
	if ($div == 1) {
		if ($page =~ /Division I Problem Stats(.+)Division II Problem Stats/) {
			$section = $1;
		}
	} elsif ($div == 2) {
		if ($page =~ /Division II Problem Stats(.+)submitForm/) {
			$section = $1;
		}
	}

	# Break the section back into lines; every ';' becomes a line break.
	$section =~ s/;/\n/g;

	my $seen = 0;
	while ($section =~ /HREF\=\"(.+)\" class\=\"statText\"\>(.+)\<\/A\>\<\/td\>/g) {
		my ($href, $title) = ($1, $2);
		$seen++;
		my $prefix = ($seen <= 3) ? $level_prefix[$seen - 1] : "Level_None_problem_";
		push @$plm_url, "http://community.topcoder.com" . $href;
		push @$plm_file_name, $prefix . $title . ".html";
	}
}

# Walk the TopCoder match list 50 entries at a time and record every SRM.
#
#   $SRM_url_list  array ref; receives the absolute URL of each SRM page
#   $SRM_idx       array ref; receives the matching SRM number
#
# Paging stops when the site answers with its error page or when the start
# offset reaches $MAX_SRM_CNT.
sub get_SRM_url_list {
	my ($SRM_url_list, $SRM_idx) = @_;
	my $MAX_SRM_CNT = 5000;
	my $pre_url = "http://community.topcoder.com/tc?module=MatchList&sc=&sd=&nr=50&sr=";

	for (my $index = 1; $index < $MAX_SRM_CNT; $index += 50) {
		my $SRM_list_page = get_html_to_string($pre_url . "$index");

		# The error page marks the end of the list.
		if ($SRM_list_page =~/An error has occurred when attempting to process your request/) {
			print "index=$index, ALL list has been got\n";
			print @$SRM_url_list;
			last;
		}

		while ($SRM_list_page =~ /\<td class\=\"value\" nowrap\=\"nowrap\"\>\<a href\=\"(.+)">SRM (\d+)\<\/a\>\<\/td\>/g) {
			my ($suf_url, $srm_idx) = ($1, $2);
			push @$SRM_url_list, "http://community.topcoder.com" . $suf_url;
			push @$SRM_idx, $srm_idx;
		}
	}
}

# Fetch $url and return the response body as a string.
#
# The page is downloaded into a fixed temporary file by get_html_to_file,
# read back in one go and the temporary file is removed. Returns the empty
# string when the download produced no readable file.
sub get_html_to_string {
	my ($url) = @_;
	my $temp_file = "../archive/file.tmp.txt";
	get_html_to_file($url, $temp_file);

	# Read and delete the file from Perl itself rather than spawning `cat`
	# and `rm` through the shell for every single page.
	my $str = "";
	if (open(my $fh, '<', $temp_file)) {
		local $/;    # slurp mode: read the whole file at once
		my $content = <$fh>;
		close($fh);
		$str = $content if defined $content;
	} else {
		warn "cannot read $temp_file: $!\n";
	}
	unlink $temp_file;
	return $str;
}

# Download $url into $output_file by running curl.
#
# The first line of ../data/cookie.txt is inserted verbatim into the curl
# command line, so it must already be written as curl arguments. When the
# cookie file cannot be opened a warning is printed and the request is made
# without it.
#
# Returns the value of system(): 0 on success, non-zero otherwise.
sub get_html_to_file {
	my ($url, $output_file) = @_;
	my $cookie_file = "../data/cookie.txt";

	# Lexical handle, 3-arg open, checked and closed: the former bareword
	# handle was opened unchecked and never closed.
	my $cookie = "";
	if (open(my $cookie_fh, '<', $cookie_file)) {
		my $line = <$cookie_fh>;
		close($cookie_fh);
		if (defined $line) {
			chomp $line;
			$cookie = $line;
		}
	} else {
		warn "cannot open $cookie_file: $!; fetching without cookie\n";
	}

	# Single-quote the URL and the output path so the shell passes them to
	# curl untouched. $cookie is left unquoted on purpose: it holds curl
	# arguments that the shell has to split.
	my ($quoted_url, $quoted_output) = map {
		(my $arg = $_) =~ s/'/'\\''/g;
		"'$arg'";
	} ($url, $output_file);

	my $cmd = "curl $quoted_url $cookie -o $quoted_output";
	my $status = system($cmd);
	warn "curl failed (status $status) for $url\n" if $status != 0;
	return $status;
}

# Start the crawl when the script is executed.
main();


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章