PHP模擬登陸抓取,不使用cookieJar文件保存cookie,第一次嘗試寫面向對象Mark,自己留着看。其中的幾個難點,password的加密方法,cookie的接連獲取,巧用substr()和strpos取值。
<?php
// Raise the execution time limit to 120 seconds; the loop at the bottom of
// this file issues several HTTP requests per keyword and sleeps between days.
set_time_limit(120);
// All date()/strtotime() calls in this file are evaluated in this timezone.
date_default_timezone_set('Asia/Shanghai');
/**
 * Logs in to the monitoring site with cURL and fetches the micro-blog count
 * for one keyword over one time window.
 *
 * Cookies are kept in memory (no cookie-jar file) and are rebuilt by
 * dologin() whenever the instance has none. All response parsing is done
 * with fixed offsets that were tuned against the target site's responses.
 *
 * Failure convention: geturl() returns the string "0" when the count could
 * not be obtained. A genuine count of zero is indistinguishable from a
 * failure and is therefore retried as well.
 */
class yingji{
    /** Cookie header value sent with every request; empty until dologin() succeeded. */
    private $cookie="";
    /** Login name posted by dologin(). */
    private $username="email";
    /**
     * Password value posted by dologin(), sent as-is.
     * NOTE(review): presumably this must already be in the hashed form the
     * login endpoint expects - confirm against the site's login script.
     */
    private $password="password";
    private $url ="https://host/login";
    private $loginaction="https://host/loginAction";
    private $getcloneEidurl="https://host/monitor/pad/addAttention";
    private $targeturl="https://host/monitor/query-micro-blogs-count";
    /** Extra request headers as name => value; converted to header lines in docurl(). */
    private $request_headers=array (
        'Host' => 'host',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept' => '*/*',
        'Accept-Language' => 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding' => 'gzip, deflate',
        'X-Requested-With'=>'XMLHttpRequest',
        'Connection' => 'keep-alive'
    );
    /** Value posted as "cloneEid" by again(); filled in by getcloneEid(). */
    private $cloneEid;
    public $keyword;
    public $fromdate;
    public $todate;

    /**
     * @param string     $keyword  Search keyword (URL-encoded when posted).
     * @param int|string $fromdate Window start, posted verbatim as "startTime".
     * @param int|string $todate   Window end, posted verbatim as "endTime".
     */
    public function __construct($keyword,$fromdate,$todate){
        $this->keyword=$keyword;
        $this->fromdate=$fromdate;
        $this->todate=$todate;
    }

    /**
     * Logs in if necessary, obtains the clone id and queries the count.
     *
     * The count request is attempted at most three times while it yields "0".
     *
     * @return string The extracted count, or "0" when login, the clone-id
     *                lookup or every count attempt failed.
     */
    public function geturl(){
        if($this->cookie===NULL || $this->cookie===""){
            if(!$this->dologin()){
                return "0";
            }
        }
        if(!$this->getcloneEid()){
            return "0";
        }
        $result=$this->again();
        // One initial attempt plus up to two retries.
        for($times=1; $result==="0" && $times<3; $times++){
            $result=$this->again();
        }
        return $result;
    }

    /**
     * Performs one count request.
     *
     * @return string The text between the "count" marker and the last two
     *                characters of the response, or "0" when the response is
     *                missing, is not HTTP 200 or contains no "count" marker.
     */
    private function again(){
        $post_data="viewType=day&startTime=".$this->fromdate."&endTime=".$this->todate."&dt=&dtt=day&st=MICRO_BLOG_ALL&fq=%7B%22blogType%22%3A0%2C%22blogStatus%22%3A0%2C%22content%22%3A%22%22%2C%22bloggerVipType%22%3A-1%2C%22minFans%22%3A%220%22%2C%22maxFans%22%3A%22-1%22%2C%22bloggerType%22%3A0%2C%22platformType%22%3A%22MICRO_BLOG_ALL%22%7D&q=".urlencode($this->keyword)."&cloneEid=".$this->cloneEid;
        $result=$this->docurl($this->targeturl,TRUE,$post_data,$this->cookie);
        if($this->statusOf($result)!=="200"){
            return "0";
        }
        $pos=strpos($result, "count");
        if($pos===false){
            // Without the marker the fixed offsets below would slice header text.
            return "0";
        }
        // Skip "count" plus the three characters after it and drop the two
        // trailing characters of the response.
        // NOTE(review): presumably the body ends in ..."count":"<n>"} - confirm.
        $count=substr($result, $pos+8, -2);
        if($count===false || $count===""){
            return "0";
        }
        return $count;
    }

    /**
     * Registers the keyword and stores the 36-character id from the reply
     * in $cloneEid (one of the values the count request has to post).
     *
     * @return bool False when the reply is missing or holds no "id" marker.
     */
    private function getcloneEid(){
        $post="at=EVENT&st=MICRO_BLOG_ALL&name=".urlencode($this->keyword)."&keywords=".urlencode($this->keyword);
        $output=$this->docurl($this->getcloneEidurl,TRUE,$post,$this->cookie);
        $pos=is_string($output) ? strpos($output, "id") : false;
        if($pos===false){
            $this->cloneEid="";
            return false;
        }
        // Skip "id" plus the three characters after it, then take 36 characters.
        $this->cloneEid=(string)substr($output, $pos+5, 36);
        return $this->cloneEid!=="";
    }

    /**
     * Builds the session cookie: pre-login cookies first, then the cookie
     * issued by the login action.
     *
     * @return bool False when either step did not deliver the expected
     *              cookie; $cookie is left empty in that case.
     */
    private function dologin(){
        if(!$this->prelogin()){
            return false;
        }
        // Credentials come from the properties instead of being embedded here.
        $post="client_screen=1440+x+900&langCode=&username=".urlencode($this->username)."&password=".urlencode($this->password);
        $result=$this->docurl($this->loginaction,TRUE,$post,$this->cookie);
        $header=$this->headerOf($result);
        // Case-insensitive so that lowercase "set-cookie:" lines are found too.
        $pos=strripos($header, "Cookie:");
        if($pos===false){
            $this->cookie="";
            return false;
        }
        // 58 characters following the last "Cookie: ".
        // NOTE(review): length is tied to the site's cookie format - confirm.
        $cookie=substr($header, $pos+8, 58);
        // Keep the first 40 characters (the 38-character route cookie plus
        // "; ") and replace what followed with the cookie from the login reply.
        $this->cookie=substr($this->cookie, 0, 40);
        $this->cookie.=$cookie;
        return true;
    }

    /**
     * Requests the login page and seeds $cookie with "<route>; <JSESSIONID>".
     *
     * @return bool False when the response lacks either cookie marker.
     */
    private function prelogin(){
        $result=$this->docurl($this->url,FALSE,"",$this->cookie);
        $header=$this->headerOf($result);
        $routePos=strrpos($header, "route");
        $sessionPos=strrpos($header, "JSESSIONID");
        if($routePos===false || $sessionPos===false){
            $this->cookie="";
            return false;
        }
        // Fixed widths: 38 characters for the route cookie, 51 for JSESSIONID.
        $cookie_route=substr($header, $routePos, 38);
        $cookie_JSESS=substr($header, $sessionPos, 51);
        $this->cookie=$cookie_route."; ".$cookie_JSESS;
        return true;
    }

    /**
     * Returns the part of a raw response before the first blank line.
     *
     * @param string|false $raw Output of docurl().
     * @return string Header text, or "" when $raw is not a string.
     */
    private function headerOf($raw){
        if(!is_string($raw)){
            return "";
        }
        $parts=explode("\r\n\r\n", $raw, 2);
        return $parts[0];
    }

    /**
     * Extracts the three-digit status code from the first line of a raw
     * response, for any protocol version ("HTTP/1.1 200 OK", "HTTP/2 200").
     *
     * @param string|false $raw Output of docurl().
     * @return string The status code, or "" when none could be read.
     */
    private function statusOf($raw){
        if(is_string($raw) && preg_match('#^HTTP/\S+\s+(\d{3})#', $raw, $m)){
            return $m[1];
        }
        return "";
    }

    /**
     * Executes one HTTP request and returns headers and body as one string.
     *
     * @param string $url      Target URL.
     * @param bool   $is_post  Send $postdata as a POST body when true.
     * @param string $postdata URL-encoded request body.
     * @param string $cookie   Cookie header value; omitted when empty.
     * @return string|false Raw response including headers, false on failure.
     */
    private function docurl($url,$is_post=FALSE,$postdata="",$cookie=""){
        $ch=curl_init();
        if($ch===false){
            return false;
        }
        // CURLOPT_HTTPHEADER takes a list of "Name: value" lines; the keys of
        // an associative array are ignored, so the lines are built explicitly.
        $header_lines=array();
        foreach($this->request_headers as $name=>$value){
            if(strcasecmp($name, 'Accept-Encoding')===0){
                // Let cURL send this header itself so it also decompresses the body.
                curl_setopt($ch, CURLOPT_ENCODING, $value);
                continue;
            }
            $header_lines[]=$name.": ".$value;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        // NOTE(review): peer verification is off while credentials are sent;
        // enable it once the target host presents a verifiable certificate.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        // Response headers are part of the returned string; callers parse them.
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header_lines);
        if($is_post){
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata);
        }
        if($cookie!=""){
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        }
        $output=curl_exec($ch);
        curl_close($ch);
        return $output;
    }
}
// Driver: crawl tasks live in the `keywords` table so they can be added or
// removed from outside this script. Each row is either backfilled from its
// start date (first run) or extended by yesterday's count (later runs).
$conn=mysqli_connect('localhost','','');
if(!$conn){
    // No link exists at this point, so the connect-specific error call is needed.
    die("連接數據庫失敗".mysqli_connect_error());
}
mysqli_select_db($conn,"");
// Sets the connection charset through the API rather than a raw SET NAMES.
mysqli_set_charset($conn,"utf8");

$result=mysqli_query($conn,"SELECT * FROM keywords");
if(!$result){
    die("查詢數據庫失敗".mysqli_error($conn));
}

// One prepared statement serves both branches; keyword text and scraped
// counts are bound as parameters instead of being spliced into the SQL.
$updateSql="UPDATE keywords set tempdate='1' ,daycount=? where words=?";
$stmt=mysqli_prepare($conn,$updateSql);
if(!$stmt){
    die("準備更新語句失敗".mysqli_error($conn));
}
$boundCount="";
$boundWords="";
mysqli_stmt_bind_param($stmt,"ss",$boundCount,$boundWords);

while ($row=mysqli_fetch_object($result)) {
    echo "正在抓取".$row->words."...<br/>";
    // Midnight of the current day, evaluated once per row.
    $today=strtotime(date("Y-m-d",time()));
    if($row->tempdate=='1'){
        // Already backfilled: append the count for yesterday
        // (00:00:00 through 23:59:59).
        $todate=$today-1;
        $fromdate=$row->Fromdate;
        // NOTE(review): the original test was "today != start || today > start",
        // which reduces to "!=". Confirm whether "today > start" was intended.
        if($today!=strtotime($fromdate)){
            $daycount=$row->daycount;
            echo "fromdate:".date("Y-m-d H:i:s",($todate+1-86400))." todate ".date("Y-m-d H:i:s",($todate));
            // The crawler is given millisecond timestamps.
            $a=new yingji($row->words,(1000*($todate+1-86400)),(1000*$todate));
            $count=$a->geturl();
            $daycount.=",".$count;
            unset($a);
            $boundCount=$daycount;
            $boundWords=$row->words;
            var_dump($updateSql,$boundCount,$boundWords);
            if(!mysqli_stmt_execute($stmt)){
                echo "更新失敗".mysqli_stmt_error($stmt)."<br/>";
            }
        }
    }else{
        // First run for this keyword: one request per full day since Fromdate.
        $fromdate=strtotime($row->Fromdate);
        if($fromdate===false){
            // An unparsable date would otherwise start the loop in 1970.
            echo "起始日期無效,跳過<br/>";
            continue;
        }
        // Start fresh for every row so that a failed update cannot leak one
        // keyword's counts into the next keyword.
        $daycount1="begin";
        $times=floor((time()-$fromdate)/86400);
        for ($i=1; $i<=$times; $i++) {
            $a=new yingji($row->words,(1000*$fromdate),(1000*($fromdate+86400-1)));
            $count=$a->geturl();
            echo date("Y-m-d H:i:s",$fromdate)." to ".date("Y-m-d H:i:s",($fromdate+86400-1))."<br/>";
            $fromdate+=86400;
            $daycount1.=",".$count;
            unset($a);
            // Push progress to the client; ob_flush() complains without a buffer.
            if(ob_get_level()>0){
                ob_flush();
            }
            flush();
            sleep(1);
        }
        $boundCount=$daycount1;
        $boundWords=$row->words;
        var_dump($updateSql,$boundCount,$boundWords);
        if(!mysqli_stmt_execute($stmt)){
            echo "更新失敗".mysqli_stmt_error($stmt)."<br/>";
        }
    }
}
mysqli_stmt_close($stmt);
mysqli_close($conn);
// Mark給自己看