#!/usr/bin/perl
use
strict;
use
warnings;
use
threads;
use
threads::shared;
use
Thread::Queue;
use
Thread::Semaphore;
use
Bloom::Filter;
use
URI;
use
URI::URL;
use
Web::Scraper;
use
LWP::Simple;
use
LWP::UserAgent;
use
HTTP::Cookies;
#use HTTP::Cookies::Guess;
use
String::Diff;
use
String::Diff
qw(diff_fully diff diff_merge diff_regexp)
;
use
URI::Split
qw(uri_split uri_join)
;
my
$fid
: shared;
#下載的頁面以遞增的數字命名
share(
$fid
);
#多線程共享該變量
$fid
=0;
#crawling with signed cookie
my
$cookie_jar
=
'.mozilla/firefox/bg146ia6.default/cookies.sqlite'
;
my
$tmp_ua
= LWP::UserAgent->new;
#UserAgent用來發送網頁訪問請求
$tmp_ua
->timeout(15);
##連接超時時間設爲15秒
$tmp_ua
->protocols_allowed( [
'http'
,
'https'
] );
##只允許http和https協議
$tmp_ua
->agent(
"Mozilla/4.0
(compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR
3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
)
;
##用來在header中告訴服務器你用的是什麼"瀏覽器",設置文件頭的User-Agent
$tmp_ua
->cookie_jar(HTTP::Cookies->new(
'file'
=>
"$ENV{'HOME'}/$cookie_jar"
,
'autosave'
=>1));
# 設置cookie,在運行過程中必須執行兩個方法,extract_cookies($request) 和 add_cookie_header($response)。在運行的過程中實際用到了HTTP::Cookies模塊。如:
# $ua->cookie_jar({ file => "$ENV{HOME}/.cookies.txt" });
# 等價於
# require HTTP::Cookies;
# $ua->cookie_jar(HTTP::Cookies->new(file => "$ENV{HOME}/.cookies.txt"));
push
@{
$tmp_ua
->requests_redirectable},
'POST'
;
#告訴LWP在POST請求發送後如果發生重新定向就自動跟隨
my
$max_threads
= 5;
my
$host
= URI::URL->new(
$base_url
)->host;
print
"Host Name: $host.\n"
;
my
$queue
= Thread::Queue->new( );
#線程隊列,每個線程負責去處理一個url
my
$semaphore
= Thread::Semaphore->new(
$max_threads
);
my
$mutex
= Thread::Semaphore->new( 1 );
#my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
#my $logfile = "crawler".($year+1900).($mon+1).$mday.$hour.$min.$sec.".log";
#open(BANLOG,">>$logfile") or die("can't open logfile:$!\n");
# Bloom::Filter使用更少的內存採用一種基於概率的算法來進行存在性測試。
my
$filter
= shared_clone( Bloom::Filter->new(
capacity
=> 1000000,
error_rate
=> 0.001) );
$queue
->enqueue(
$base_url
);
#放入線程隊列的URL就要被線程所處理
$filter
->add(
$base_url
);
#放入filter中好判斷該URL是否已經存在
my
@tmp_url
= ();
#@tmp_url存在處理過的url
push
(
@tmp_url
,
$base_url
);
while
( 1 )
{
# join所有可以被join的線程
#my $joined = 0;
foreach
( threads->list(threads::joinable) )
{
#$joined ++;
$_
->
join
( );
}
#print $joined, " joinedn";
# if there are no url need process.
my
$item
=
$queue
->pending();
#返回隊列中url的個數
# 線程隊列爲空
if
(
$item
== 0 )
{
my
$active
= threads->list(threads::running);
# 已經沒有active線程了,結束所有的工作
if
(
$active
== 0 )
{
print
"All done!\n"
;
last
;
}
# 如果還有活動線程,那麼主線程sleep,等待處理URL的子線程結束
else
{
#print "[MAIN] 0 URL, but $active active threadn";
sleep
1;
next
;
}
}
# 線程隊列不爲空,信號量減1,佔用一個線程來處理url
#print "[MAIN] $item URLn";
$semaphore
->down;
#print "[MAIN]Create thread.n";
threads->create( \
&ProcessUrl
);
}
# join all threads which can be joined
foreach
( threads->list() )
{
$_
->
join
( );
}
sub
ProcessUrl
{
my
$scraper
= scraper
{
process
'//a'
,
'links[]'
=>
'@href'
;
#根據XPath表達式尋找所有的標籤a,把href屬性存到散列的value中
};
my
$res
;
my
$link
;
while
(
my
$url
=
$queue
->dequeue_nb() )
{
eval
#eval BLOCK,BLOCK只會被解析一次,並且在編譯時進行代碼語法檢查。
{
print
"開始下載"
,URI->new(
$url
)->as_string,
"\t\$fid=$fid\n"
;
LWP::Simple::getstore(URI->new(
$url
)->as_string,
"$ENV{'HOME'}/master/cnblog/cn$fid"
) or
print
"Can't download the web page."
;
$fid
+=1;
$scraper
->user_agent(
$tmp_ua
);
#設置$scraper的user_agent
$res
=
$scraper
->scrape( URI->new(
$url
) )->{
'links'
};
#把URI傳給scrape函數。scrape函數返回一個數組引用,因爲links是數組
};
if
( $@ )
# 當BLOCK中有語法錯誤、運行時錯誤遇到 die 語句, eval 將返回 undef 。錯誤碼被保存在 $@ 中。
{
warn
"$@\n"
;
next
;
}
next
if
(!
defined
$res
);
#如果HTML文檔中沒有發現a標籤
#print "there are ".scalar(threads->list(threads::running))." threads, ", $queue->pending(), " urls need process.n";
foreach
( @{
$res
} )
{
# $_ => URI->new("http://example.com/") 所以要調用sa_string來獲取"http://example.com/"
$link
=
$_
->as_string;
$link
= URI::URL->new(
$link
,
$url
);
#$u1 = URI::URL->new($str, $base);
#$u2 = $u1->abs;
# not http and not https?
next
if
(
$link
->scheme ne
'http'
&&
$link
->scheme ne
'https'
);
#The three forms of URI reference syntax are summarized as follows:
#<scheme>:<scheme-specific-part>#<fragment>
#<scheme>://<authority><path>?<query>#<fragment>
#<path>?<query>#<fragment>
#可以通過URL::Split把名個部分分離出來
# another domain?
# next if( $link->host ne $host );
#search for the sub domain
next
if
(!(
$link
->host =~ /
$host
/));
$link
=
$link
->
abs
->as_string;
#獲得絕對路徑
if
(
$link
=~ /(.*?)
#(.*)/ )#去除書籤錨點,即#以後的內容
{
$link
= $1;
}
next
if
(
$link
=~ /rss|.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)$/i );
#這些文件格式我們不抓取
#print "test:$link\n";
#EscapeUrl,skip query form values
my
$tmp_link
=
&EscapeUrl
(
$link
);
#$tmp_link中已經把查詢參數的值去掉了
#print "Escape:".$tmp_link."\n";
$mutex
->down();
#互質體減1,進入線程臨界資源區
my
$tmp_mark
= 0;
#print "test start:$link\n";
if
( !
$filter
->check(
$tmp_link
) )
#如果$tmp_link不在$filter中
{
#print "Test filter ok:$tmp_link\n";
#DiffUrl,diff $link from queue with number
foreach
(
@tmp_url
)
{
#print "Test Queue:".$tmpurl."\n";
#print "test-1:$_\ntest-2:$tmp_link\n";
if
(
&DiffUrl
(
$_
,
$link
))
#如果發現@tmp_url中的url和當前頁面中的一個鏈接url僅是在某些數字上不同(很可能是查詢參數值不同),則跳過該鏈接,即跳到else裏面去。
{
$tmp_mark
= 2;
last
;
}
}
if
(
$tmp_mark
!= 2 )
{
$queue
->enqueue(
$link
);
#把頁面上的鏈接$link交給線程進行處理
#print "add queue:$link\n";
$filter
->add(
$tmp_link
);
#$tmp_link放入$filter
#print "add filter:$tmp_link\n";
#print BANLOG $filter->key_count(), " ", $link, "\n";
#print $filter->key_count(), " ", $link, "\n";
push
(
@tmp_url
,
$link
);
#把$link放入已處理的url數組@tmp_url
}
else
{
#print "pass:$link\n";#$link被忽略
}
}
#print "pass:$link\n";
$mutex
->up();
#互斥信號量加1
undef
$link
;
}
undef
$res
;
#清除創建的一些object,否則在while循環中這些object越積越多
}
undef
$scraper
;
$semaphore
->up( );
##普通信號量加1
}
#close(BANLOG);
print
"ALL DONE.\n"
;
#把URL尾部的request參數置爲空
#比如http://category.dangdang.com/?ref=www-0-C&name=orisun-zhang#ref=www-0-C被處理爲http://category.dangdang.com/?ref=&name=
sub
EscapeUrl
{
my
$urlold
=
shift
;
my
(
$scheme
,
$auth
,
$path
,
$query
,
$frag
) = uri_split(
$urlold
);
#把一個url的各部分分離出來
my
$urlnew
= uri_join(
$scheme
,
$auth
,
$path
);
my
$u
= URI->new(
$urlold
);
my
@tmp_array
=
$u
->query_form();
my
$tmp
=
''
;
my
$i
= 0;
for
(
$i
=0;
$i
<
@tmp_array
;
$i
+=2)
#把request參數的值去掉
{
$tmp
.=
$tmp_array
[
$i
].
"=&"
;
}
if
(
@tmp_array
!= 0)
{
$tmp
=~ s/&$//;
$urlnew
.=
"?"
.
$tmp
;
}
undef
$u
;
#清除子例程中創建的object
#print $urlnew."\n";
return
$urlnew
;
}
sub
DiffUrl
{
my
$urlold
=
shift
;
my
$urlnew
=
shift
;
my
$urloldx
=
&EscapeUrl
(
$urlold
);
my
$urlnewx
=
&EscapeUrl
(
$urlnew
);
my
(
$old
,
$new
) = String::Diff::diff(
$urloldx
,
$urlnewx
);
#my($old,$new) = String::Diff::diff($urlold,$urlnew);
if
((
$old
=~ m/(\[\d+\])/i) && (
$new
=~ m/{\d+}/i))
#如果兩個url僅是在某些數字上不同
#if ($new =~ m/{\d+}/i)
{
#print "test num success.\n";
return
1;
}
else
{
#print "test num failed.\n";
return
0;
}
}