採集數據(也就是爬蟲)用PHP同樣能夠做到:只要把curl封裝成一個採集函數,就可以實現對數據的採集和利用。
<?php
/**
 * Fetch the body of a URL with cURL, optionally spoofing the client IP and referer.
 *
 * A browser-like User-Agent is picked at random on every call, redirects are
 * followed, and the response body is returned instead of being printed.
 *
 * Declared without a visibility keyword so that it parses as a top-level
 * function; if it is pasted into a class it is still implicitly public.
 *
 * @param string $url   URL to fetch.
 * @param string $reurl Optional Referer value; no Referer is set when empty.
 * @param string $ip    Optional address sent in the CLIENT-IP and
 *                      X-FORWARDED-FOR request headers; skipped when empty.
 *
 * @return string|false Response body, or false when the URL is empty, the
 *                      cURL handle cannot be created, or the transfer fails.
 */
function g_url_contents($url, $reurl = '', $ip = '')
{
    // An empty URL can never succeed; fail fast with the same value a failed
    // transfer returns.
    if ((string) $url === '') {
        return false;
    }

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $timeout = 30;
    $lineBreaks = array("\r", "\n");

    // URL to fetch.
    curl_setopt($ch, CURLOPT_URL, $url);

    // Spoof the originating IP. CR/LF are stripped so the value cannot
    // smuggle extra header lines into the request.
    if ($ip) {
        $ip = str_replace($lineBreaks, '', $ip);
        $header = array(
            'CLIENT-IP:' . $ip,
            'X-FORWARDED-FOR:' . $ip,
        );
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    // Pool of User-Agent strings to rotate through; sending one fixed value
    // on every request is easier for the remote site to detect.
    // Each entry must stay on a single line: a line break inside the literal
    // would end up inside the User-Agent header.
    // The previous list also carried $_SERVER['HTTP_USER_AGENT'] as a fifth
    // entry, but the random index never reached it and reading it raised a
    // warning when the key was unset, so it is no longer listed.
    $binfo = array(
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.2; AskTbPTV/5.17.0.25589; Alexa Toolbar)',
        'Mozilla/5.0 (Windows NT 5.1; rv:22.0) Gecko/20100101 Firefox/22.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET4.0C; Alexa Toolbar)',
        'Mozilla/4.0(compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    );
    // array_rand() keeps the random index in step with the pool size.
    $user_agent = $binfo[array_rand($binfo)];

    // Return the response from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Maximum number of seconds to wait while connecting (0 would wait indefinitely).
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Value of the User-Agent request header.
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    // Follow any "Location:" header the server sends, i.e. follow redirects.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    // Spoof the Referer header.
    if ($reurl) {
        curl_setopt($ch, CURLOPT_REFERER, str_replace($lineBreaks, '', $reurl));
    }

    // Run the request; yields the body as a string, or false on failure.
    $c = curl_exec($ch);
    // Release the handle.
    curl_close($ch);

    return $c;
}