1. curl 爬蟲(抓取 HTML 頁面並用 XPath 解析)
// Fetch the weather forecast page over HTTP with curl, then pull the
// temperature / condition / wind paragraphs out of the HTML with XPath.
$url = "http://www.weather.com.cn/weather/101120501.shtml";

// HTTP_USER_AGENT is missing under the CLI or when the client sends no
// User-Agent header, so fall back to a fixed browser string in that case.
$userAgent = !empty($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36';

$ch = curl_init($url); // start the curl session
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the response as a string instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$rs = curl_exec($ch);
$curlError = curl_error($ch); // must be read before the handle is closed
curl_close($ch);

if ($rs === false || $rs === '') {
    throw new \RuntimeException('Failed to fetch ' . $url . ': ' . $curlError);
}

// Replace the whole <head> with a single UTF-8 charset declaration so that
// DOMDocument decodes the page as UTF-8, and put a <br> in front of every <p>.
$rs = preg_replace(array('/<head>([\s\S]+?)<\/head>/i','/<p>/i'),array('<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head>','<br><p>'),$rs);
if (!is_string($rs) || $rs === '') {
    throw new \RuntimeException('Failed to normalise the HTML fetched from ' . $url);
}

// Collect libxml parse warnings internally instead of silencing every error with "@".
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom = new \DOMDocument();
$dom->loadHTML($rs);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

$xpath = new \DOMXPath($dom);
// Temperature: <p class="tem">
$temperature = $xpath->query("//p[@class='tem']");
// Weather condition: <p title="..." class="wea">
$weather_condition = $xpath->query("//p[@class='wea']");
// Wind level: <p class="win">
$wind_grade = $xpath->query("//p[@class='win']");
// Build the weather info for today, tomorrow and the day after tomorrow.
$weatherThreeDays = self::getTodayTomorrowAfterTomorrow($temperature, $weather_condition, $wind_grade);
dump($weatherThreeDays);
2. curl 模擬 ajax 爬蟲(帶 Referer 請求數據接口並解析 JSON)
// Request the data endpoint the way the page's own ajax call would (browser
// User-Agent plus a Referer pointing at the weather page) and decode the JSON
// carried in the response.
$URls = "http://d1.weather.com.cn/sk_2d/101120501.html";
$referer = "http://www.weather.com.cn/weather1d/101120501.shtml";
// Assign a fresh array so header lines left in a pre-existing $header cannot leak into this request.
$header = array(
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
);

$curl = curl_init();
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_REFERER, $referer);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, $URls);
$rs = curl_exec($curl);
$curlError = curl_error($curl); // must be read before the handle is closed
curl_close($curl);

if ($rs === false) {
    throw new \RuntimeException('Failed to fetch ' . $URls . ': ' . $curlError);
}

// The body is treated as "<something> = <json>". Split on the FIRST "=" only,
// so an "=" inside the JSON itself does not truncate the payload.
$wheathInfo = explode("=", $rs, 2);
if (!isset($wheathInfo[1])) {
    throw new \RuntimeException('Unexpected response format from ' . $URls);
}

// Strip surrounding whitespace and a trailing ";" (if the assignment is terminated) before decoding.
$wheathInfo = json_decode(trim($wheathInfo[1], " \t\n\r\0\x0B;"), true);
if ($wheathInfo === null) {
    throw new \RuntimeException('Invalid JSON from ' . $URls . ': ' . json_last_error_msg());
}
dump($wheathInfo);
3. 原生 socket(fsockopen)模擬 ajax 爬蟲
入口方法(開始爬蟲):
/**
 * Demo entry point: issues a GET request to the weather data endpoint through
 * ajax_http_request(), dumps whatever comes back and terminates the script.
 */
public function ajaxRequest()
{
    $endpoint = 'http://d1.weather.com.cn/sk_2d/101120501.html';
    $refererUrl = 'http://www.weather.com.cn/weather1d/101120501.shtml';
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36';

    // NOTE(review): "_" looks like a millisecond-timestamp cache buster — confirm.
    $query = array('_' => 1554343042108);

    // Positional arguments: url, data, type, params, accept, ua, referer, file.
    $response = $this->ajax_http_request(
        $endpoint,
        array(),
        'get',
        $query,
        '',
        $userAgent,
        $refererUrl,
        false
    );

    dump($response);
    die();
}
/**
 * Crawler helper: sends an AJAX-style HTTP/1.1 request over a raw socket and
 * returns the response body.
 *
 * The request carries "X-Requested-With: XMLHttpRequest" plus Origin, Referer,
 * User-Agent, Accept and Accept-Language headers. On failure a short message
 * is echoed and false is returned.
 *
 * @param string $url     Absolute URL to request; "https" URLs are sent over ssl:// (default port 443).
 * @param array  $data    Request fields: appended to the query string for GET, sent as the
 *                        url-encoded body for POST.
 * @param string $type    "get" or "post" (case-insensitive); anything other than "post" is sent as GET.
 * @param array  $params  Extra query-string parameters, always appended to the URL.
 * @param string $accept  Accept header value; empty means "application/json, text/javascript, */*".
 * @param string $ua      User-Agent header value; empty selects a built-in browser string.
 * @param string $referer Referer: an absolute http(s) URL is sent as-is, any other non-empty
 *                        value is treated as a path on the target host, empty sends the host root.
 * @param bool   $file    When true a POST is labelled multipart/form-data (file upload itself is
 *                        not implemented).
 *
 * @return string|false Response body (chunked transfer encoding decoded), "" when the response
 *                      has no header/body separator, or false on an invalid URL, DNS failure or
 *                      connection failure.
 */
function ajax_http_request($url, $data = array(), $type = "get", $params = array(), $accept = "", $ua = "", $referer = "", $file = false)
{
    $type = strtolower($type);
    $url_params = parse_url($url);
    // parse_url() can succeed without a host (e.g. for a bare path), which is unusable here.
    if (!$url_params || empty($url_params['host'])) {
        echo 'url錯誤';
        return false;
    }

    $host = $url_params['host'];
    $secure = isset($url_params['scheme']) && strtolower($url_params['scheme']) === 'https';
    $scheme = $secure ? 'https' : 'http';
    $default_port = $secure ? 443 : 80;
    $port = isset($url_params['port']) ? (int) $url_params['port'] : $default_port;

    // gethostbyname() returns its argument unchanged when resolution fails, so
    // compare against the input instead of testing for a falsy value. Literal
    // IP addresses (including bracketed IPv6) need no lookup.
    $bare_host = trim($host, '[]');
    if (filter_var($bare_host, FILTER_VALIDATE_IP) === false && gethostbyname($host) === $host) {
        echo '無法訪問服務器';
        return false;
    }

    $fp = fsockopen(($secure ? 'ssl://' : '') . $host, $port, $errno, $errstr, 30);
    if (!$fp) {
        echo "$errstr ($errno)<br />";
        return false;
    }

    $query_string = http_build_query($data);
    $params_string = is_array($params) ? http_build_query($params) : '';

    // Request target in origin-form: path plus the URL's own query, then
    // $params, then (GET only) $data. No "?" is emitted when all are empty.
    $target = (isset($url_params['path']) && $url_params['path'] !== '') ? $url_params['path'] : '/';
    $query_parts = array();
    if (isset($url_params['query']) && $url_params['query'] !== '') {
        $query_parts[] = $url_params['query'];
    }
    if ($params_string !== '') {
        $query_parts[] = $params_string;
    }
    if ($type !== 'post' && $query_string !== '') {
        $query_parts[] = $query_string;
    }
    if ($query_parts) {
        $target .= '?' . implode('&', $query_parts);
    }

    $host_header = ($port === $default_port) ? $host : $host . ':' . $port;
    $origin = $scheme . '://' . $host_header;

    $out = ($type === 'post' ? 'POST ' : 'GET ') . $target . " HTTP/1.1\r\n";
    $out .= 'Host: ' . $host_header . "\r\n";
    $out .= "Connection: Close\r\n";
    if ($type === 'post') {
        if ($file) {
            $out .= "Content-Type: multipart/form-data\r\n"; // ajax file upload is not implemented yet
        } else {
            $out .= "Content-Type: application/x-www-form-urlencoded\r\n";
        }
        $out .= 'Content-Length: ' . strlen($query_string) . "\r\n";
    }

    // The defaults are empty strings, so test for emptiness rather than isset().
    if ((string) $ua !== '') {
        $out .= 'User-Agent: ' . $ua . "\r\n";
    } else {
        $out .= "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36\r\n";
    }

    if ((string) $referer === '') {
        $out .= 'Referer: ' . $origin . "\r\n";
    } elseif (preg_match('#^https?://#i', $referer)) {
        // Already absolute: prefixing the host would produce an invalid URL.
        $out .= 'Referer: ' . $referer . "\r\n";
    } else {
        $out .= 'Referer: ' . $origin . '/' . ltrim($referer, '/') . "\r\n";
    }

    $out .= 'Origin: ' . $origin . "\r\n";
    $out .= "X-Requested-With: XMLHttpRequest\r\n";
    $out .= 'Accept: ' . ((string) $accept !== '' ? $accept : 'application/json, text/javascript, */*') . "\r\n";
    $out .= "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6\r\n\r\n";
    if ($type === 'post') {
        // The message body follows the blank line.
        $out .= $query_string;
    }

    fwrite($fp, $out);
    // "Connection: Close" makes the server end the stream after the response.
    $output = (string) stream_get_contents($fp);
    fclose($fp);

    // Split at the FIRST blank line only, so blank lines inside the body survive.
    $separator = strpos($output, "\r\n\r\n");
    if ($separator === false) {
        return '';
    }
    $head = substr($output, 0, $separator);
    $body = (string) substr($output, $separator + 4);

    if (!preg_match('/^Transfer-Encoding:.*\bchunked\b/mi', $head)) {
        // Content-Length or close-delimited response: the body is already complete.
        return $body;
    }

    // Decode chunked transfer encoding: "<hex size>[;ext]\r\n<data>\r\n" repeated,
    // ended by a zero-size chunk.
    $decoded = '';
    $offset = 0;
    $length = strlen($body);
    while ($offset < $length) {
        $line_end = strpos($body, "\r\n", $offset);
        if ($line_end === false) {
            break;
        }
        $size_line = substr($body, $offset, $line_end - $offset);
        $extension = strpos($size_line, ';');
        if ($extension !== false) {
            $size_line = substr($size_line, 0, $extension);
        }
        $size_line = trim($size_line);
        if (!preg_match('/^[0-9a-f]+$/i', $size_line)) {
            break; // malformed chunk header: keep what was decoded so far
        }
        $size = (int) hexdec($size_line);
        if ($size === 0) {
            break; // terminating chunk
        }
        $decoded .= substr($body, $line_end + 2, $size);
        $offset = $line_end + 2 + $size + 2; // skip the data and its trailing CRLF
    }

    return $decoded;
}