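大家都知道,採集別人的網站時,對方可能會封掉你的IP地址,所以最好不要直接暴露自己的真實IP。今天我們就淺談如何用PHP僞裝IP地址,進行數據採集!(注意:下文是通過 CLIENT-IP 和 X-FORWARDED-FOR 請求頭來僞裝IP,只對信任這些請求頭的網站有效;服務器從TCP連接上看到的真實IP並不會改變,要真正更換IP需要使用代理。)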
首先,我用的是TP5框架,把IP僞裝及數據採集封裝到common.php中了,代碼如下:
<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2016 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: 流年 <[email protected]>
// +----------------------------------------------------------------------
// 應用公共文件
use think\Db;
/************************************* 封裝公共方法 *****************************************/
/**
 * Fetch a URL with a GET request while sending spoofed client-IP headers,
 * a fixed Baidu referer and a randomly chosen desktop User-Agent.
 *
 * The CLIENT-IP / X-FORWARDED-FOR headers only affect servers that choose to
 * trust them; the source address of the connection itself is not changed.
 *
 * @param string $url Address to fetch.
 * @return mixed Response body as returned by curl_exec(), or false when the transfer fails.
 */
function pretendIpData($url)
{
    // First two octets of the spoofed address. These must be strings: a float
    // literal such as 119.120 is rendered as "119.12" when concatenated, which
    // silently produced the wrong address range.
    $prefixes = array(
        '119.120',
        '124.174',
        '116.249',
        '118.125',
        '42.175',
        '124.162',
        '211.167',
        '58.206',
        '117.24',
        '203.93',
    );
    // Pick a random prefix and append two random octets.
    $ip = $prefixes[array_rand($prefixes)] . '.' . rand(1, 255) . '.' . rand(1, 255);

    // Referer sent with the request.
    $referUrl = "http://www.baidu.com";

    // Desktop User-Agent strings; the keys are labels only and are never sent.
    $agentArray = [
        "safari 5.1 – MAC"=>"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "safari 5.1 – Windows"=>"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Firefox 38esr"=>"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "IE 11"=>"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        // Closing parenthesis added; the original string was truncated.
        "IE 9.0"=>"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "IE 8.0"=>"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "IE 7.0"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "IE 6.0"=>"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Firefox 4.0.1 – MAC"=>"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Firefox 4.0.1 – Windows"=>"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera 11.11 – MAC"=>"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera 11.11 – Windows"=>"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Chrome 17.0 – MAC"=>"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "傲遊(Maxthon)"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "騰訊TT"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "世界之窗(The World) 2.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "世界之窗(The World) 3.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "360瀏覽器"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "搜狗瀏覽器 1.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Avant"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Green Browser"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ];
    // array_rand() returns a random key; look up the matching User-Agent string.
    $userAgent = $agentArray[array_rand($agentArray, 1)];

    // Headers carrying the spoofed client address.
    $header = array(
        'CLIENT-IP:'.$ip,
        'X-FORWARDED-FOR:'.$ip,
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
    curl_setopt($curl, CURLOPT_REFERER, $referUrl);
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    // NOTE(review): peer certificate verification is disabled, as in the original.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // Same 30-second limit as getPostIpData(), so an unresponsive host cannot hang the request.
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);

    $info = curl_exec($curl);
    curl_close($curl);

    return $info;
}
/**
 * Send a POST request while sending spoofed client-IP headers, a fixed Baidu
 * referer and a randomly chosen desktop User-Agent.
 *
 * The CLIENT-IP / X-FORWARDED-FOR headers only affect servers that choose to
 * trust them; the source address of the connection itself is not changed.
 * On a cURL error the message is echoed (prefixed with "Errno") and the
 * curl_exec() result is still returned.
 *
 * @param string $url  Address to post to.
 * @param array  $data Fields passed to CURLOPT_POSTFIELDS.
 * @return mixed Raw response body as returned by curl_exec(), or false when the transfer fails.
 */
function getPostIpData($url, $data = array())
{
    // First two octets of the spoofed address. These must be strings: a float
    // literal such as 119.120 is rendered as "119.12" when concatenated, which
    // silently produced the wrong address range.
    $prefixes = array(
        '119.120',
        '124.174',
        '116.249',
        '118.125',
        '42.175',
        '124.162',
        '211.167',
        '58.206',
        '117.24',
        '203.93',
    );
    // Pick a random prefix and append two random octets.
    $ip = $prefixes[array_rand($prefixes)] . '.' . rand(1, 255) . '.' . rand(1, 255);

    // Referer sent with the request.
    $referUrl = "http://www.baidu.com";

    // Desktop User-Agent strings; the keys are labels only and are never sent.
    $agentArray = [
        "safari 5.1 – MAC"=>"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "safari 5.1 – Windows"=>"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Firefox 38esr"=>"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "IE 11"=>"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        // Closing parenthesis added; the original string was truncated.
        "IE 9.0"=>"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "IE 8.0"=>"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "IE 7.0"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "IE 6.0"=>"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Firefox 4.0.1 – MAC"=>"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Firefox 4.0.1 – Windows"=>"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera 11.11 – MAC"=>"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera 11.11 – Windows"=>"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Chrome 17.0 – MAC"=>"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "傲遊(Maxthon)"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "騰訊TT"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "世界之窗(The World) 2.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "世界之窗(The World) 3.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "360瀏覽器"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "搜狗瀏覽器 1.x"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Avant"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Green Browser"=>"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ];
    // array_rand() returns a random key; look up the matching User-Agent string.
    $userAgent = $agentArray[array_rand($agentArray, 1)];

    // Headers carrying the spoofed client address.
    $header = array(
        'CLIENT-IP:'.$ip,
        'X-FORWARDED-FOR:'.$ip,
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
    curl_setopt($curl, CURLOPT_REFERER, $referUrl);
    // NOTE(review): certificate and host-name verification are both disabled, as in the original.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);    // set the Referer automatically on redirects
    curl_setopt($curl, CURLOPT_POST, 1);           // regular POST request
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // POST payload
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);       // give up after 30 seconds

    $info = curl_exec($curl);
    if (curl_errno($curl)) {
        // Report the transfer error on the output stream.
        echo 'Errno'.curl_error($curl);
    }
    curl_close($curl);

    // The response is returned undecoded; callers decode it themselves.
    return $info;
}
/**
 * Format a timestamp as "Y-m-d H:i:s".
 *
 * Only the first 10 characters of the input are used, so a 13-digit
 * millisecond timestamp is truncated to its seconds part.
 *
 * @param mixed $time Timestamp in seconds or milliseconds.
 * @return false|string Formatted date string.
 */
function timeDate($time)
{
    // Keep the leading 10 characters (the seconds part) and format them directly.
    return date('Y-m-d H:i:s', substr($time, 0, 10));
}
/**
 * Sum the numbers in a comma-separated string.
 *
 * @param string $numberString Numbers separated by commas, e.g. "1,2,3".
 * @return float|int Sum of the listed values.
 */
function strSum($numberString)
{
    // Split on commas and add the pieces in one expression.
    return array_sum(explode(',', $numberString));
}
/**
 * Pick distinct random integers from 1 to 10 and return them in ascending order.
 *
 * @param int $num How many numbers to take from the shuffled pool.
 * @return array Ascending list of the picked numbers.
 */
function randStr($num)
{
    // Shuffle the candidates 1..10.
    $pool = range(1, 10);
    shuffle($pool);

    // Take the first $num shuffled entries, then order them numerically.
    $picked = array_slice($pool, 0, $num);
    sort($picked, SORT_NUMERIC);

    return $picked;
}
/**
 * Classify an integer as odd or even.
 *
 * @param mixed $num Value to classify.
 * @return bool|int false when $num is not numeric; 0 when it is numeric but
 *                  not of integer type (e.g. "4" or 4.0); 1 for an odd
 *                  integer; 2 for an even integer.
 */
function isDouble($num)
{
    // Non-numeric input is rejected outright.
    if (!is_numeric($num)) {
        return false;
    }

    // Numeric strings and floats are left unclassified.
    if (!is_int($num)) {
        return 0;
    }

    return ($num % 2 == 0) ? 2 : 1;
}
GET請求,我們要在控制器中,調用此方法,控制器代碼如下:
<?php
namespace app\index\controller\choose;
use think\Controller;
use think\Db;
/**
 * Example controller for the GET helper.
 */
class ChooseGd extends Controller
{
    /**
     * Fetch the target URL with pretendIpData(), decode the JSON response
     * into an associative array, dump it and stop.
     */
    public function collection()
    {
        $url = '你要採集的網址';
        // The helper is named pretendIpData(); the original call to
        // pretendData() referenced a function that is not defined.
        $data = json_decode(pretendIpData($url), true);
        dump($data);
        die;
    }
}
最後,看看打印結果:
POST請求,我們要在控制器中,調用此方法,控制器代碼如下:
這個是我們要抓取的POST請求。接下來調用上面封裝的POST請求方法,控制器代碼如下:
<?php
namespace app\index\controller\quick;
use think\Controller;
use think\Db;
/**
 * Example controller for the POST helper.
 */
class QuickXx extends Controller
{
    /**
     * Post the sample parameters with getPostIpData(), decode the JSON
     * response into an associative array, dump it and stop.
     */
    public function collection()
    {
        // Address to fetch.
        $url = '你要抓取的網址';
        // Parameters expected by the target endpoint.
        $postData = ['lotterytype' => 'GP_K3_JiLin'];

        $response = getPostIpData($url, $postData);
        $data = json_decode($response, true);
        dump($data);
        die;
    }
}
結果打印如下:(順便打印一下header頭)
希望能幫到你!