爬蟲
從互聯網上按照一定的邏輯和算法抓取和下載互聯網的網頁,是搜索引擎的一個重要組成部分。一般的爬蟲從一部分start url 開始,按照一定的策略開始爬取,爬取到的新url再放到爬取隊列中,然後進行新一輪的爬取,直到抓取完畢爲止
PHP snoopy
- 頁面解析(字符串處理)
phpcrawler庫
參考以下網址:
http://blog.csdn.net/flitrue/article/details/51004491
<?php
// It may take a while to crawl a site ...
set_time_limit(10000);
// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");
// Extend the class and override the handleDocumentInfo()-method
class CuteCrawler extends PHPCrawler
{
    /**
     * Called by PHPCrawler for every requested document.
     * Logs the URL and status, and hands song-list detail pages
     * (.../yy/special/single/<id>.html) to parseSonlistDetail().
     *
     * @param object $DocInfo PHPCrawlerDocumentInfo for the received document
     */
    function handleDocumentInfo($DocInfo) {
        // Line break depends on SAPI ("\n" in CLI mode, otherwise "<br>").
        if (PHP_SAPI == "cli") $lb = "\n";
        else $lb = "<br />";

        // Print the URL and the HTTP status code.
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;

        // Print whether the document body was received.
        if ($DocInfo->received == true)
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        else
            echo "Content not received" . $lb;

        // Only song-list detail pages are parsed further.
        $url = $DocInfo->url;
        $pat = "/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/";
        if (preg_match($pat, $url) > 0) {
            $this->parseSonlistDetail($DocInfo);
        }
        flush();
    }

    /**
     * Extract song-list metadata (title, creator, update date, intro and the
     * list of songs) from a detail page and print the resulting array.
     *
     * BUG FIX: the original patterns used forms like "[^(<br)]+", which is a
     * character CLASS excluding the characters (, <, b, r and ) — not "anything
     * up to the string <br" — so any value containing one of those characters
     * was truncated. Replaced with a lazy "(.+?)"; the /u modifier is added
     * because the patterns contain multi-byte (UTF-8) labels.
     *
     * @param object $DocInfo PHPCrawlerDocumentInfo; ->content holds the page source
     */
    public function parseSonlistDetail($DocInfo) {
        $songlistArr = array();
        $songlistArr['raw_url'] = $DocInfo->url;
        $page = $DocInfo->content;

        // Title
        $pat = "/<span>名稱:<\/span>(.+?)<br \/>/u";
        $matches = array();
        $res = preg_match($pat, $page, $matches);
        if ($res > 0) {
            $songlistArr['title'] = $matches[1];
        } else {
            $songlistArr['title'] = '';
            print "error:get title fail\n";
        }

        // Creator
        $pat = "/<span>創建人:<\/span>(.+?)<br \/>/u";
        $matches = array();
        $res = preg_match($pat, $page, $matches);
        if ($res > 0) {
            $songlistArr['creator'] = $matches[1];
        } else {
            $songlistArr['creator'] = '';
            print "error:get creator fail\n";
        }

        // Update date
        $pat = "/<span>更新時間:<\/span>(.+?)<br \/>/u";
        $matches = array();
        $res = preg_match($pat, $page, $matches);
        if ($res > 0) {
            $songlistArr['create_date'] = $matches[1];
        } else {
            $songlistArr['create_date'] = '';
            print "error:get create_time fail\n";
        }

        // Intro / description
        $pat = "/<p><span>簡介:<\/span>(.+?)<\/p>/u";
        $matches = array();
        $res = preg_match($pat, $page, $matches);
        if ($res > 0) {
            $songlistArr['info'] = $matches[1];
        } else {
            $songlistArr['info'] = '';
            print "error:get info fail\n";
        }

        // Songs
        $pat = "/<span class=\"text\"><i>(.+?)<\/i><\/span>/u";
        $matches = array();
        $res = preg_match_all($pat, $page, $matches);
        $songlistArr['songs'] = array();
        if ($res > 0) {
            for ($i = 0; $i < count($matches[1]); $i++) {
                $song_title = $matches[1][$i];
                array_push($songlistArr['songs'], array('title' => $song_title));
            }
        } else {
            // BUG FIX: was overwritten with '' (wrong type) and printed the
            // copy-pasted message "get info fail"; keep it an empty array and
            // report the songs failure instead.
            print "error:get songs fail\n";
        }
        print_r($songlistArr);
    }
}
// Configure and start the crawling process (see the PHPCrawler
// class reference for further options and details).
$spider = new CuteCrawler();

// Entry point of the crawl.
$spider->setURL("http://www.kugou.com/yy/html/special.html");

// Only fetch documents whose content-type is "text/html".
$spider->addContentTypeReceiveRule("#text/html#");

// Follow index pages and song-list detail pages only.
$spider->addURLFollowRule("#http://www\.kugou\.com/yy/special/index/1-\d+\-0\.html# i");
$spider->addURLFollowRule("#http://www\.kugou\.com/yy/special/single/\d+\.html# i");

// Do not even request image files.
$spider->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie data like a browser does.
$spider->enableCookieHandling(true);

// 0 = no traffic limit.
$spider->setTrafficLimit(0);

// Here we go.
$spider->go();

// After the process has finished, print a short report
// (see getProcessReport() for more information).
$summary = $spider->getProcessReport();
$eol = (PHP_SAPI == "cli") ? "\n" : "<br />";
echo "Summary:" . $eol;
echo "Links followed: " . $summary->links_followed . $eol;
echo "Documents received: " . $summary->files_received . $eol;
echo "Bytes received: " . $summary->bytes_received . " bytes" . $eol;
echo "Process runtime: " . $summary->process_runtime . " sec" . $eol;
?>
百度郵編抓取地區編碼(201807附數據表)
/**
 * Set the postal code ("code") for every region row.
 *
 * Walks all region ids; for city (level 2) and district (level 3) rows it
 * queries Baidu for the postal code of the region's full name. Districts
 * whose lookup came back empty are afterwards filled with their parent
 * city's code. Other levels get an empty code.
 *
 * @return array|false summary counters: all_num / update_res_num / code_num / empty_num
 * @throws \think\db\exception\DataNotFoundException
 * @throws \think\db\exception\ModelNotFoundException
 * @throws \think\exception\DbException
 * @throws \Exception
 * @author llj <[email protected]>
 */
public function setAreaCode()
{
    // BUG FIX: the original issued an extra searchByBaidu('東莞') call here
    // and discarded the result — a leftover debug lookup wasting one HTTP
    // request per run; removed.
    $Region = new Region();
    $area_ids = $Region->column('id');
    $update_data = [];
    $update_empty = [];
    $code_num = 0;
    foreach ($area_ids as $value) {
        $area_info = $Region->getAllByArea($value, false);
        $level = $Region->where('id', $value)->value('level');
        // NOTE(review): loose in_array is kept on purpose — the DB may
        // return levels as strings.
        if (in_array($level, [2, 3])) {
            $search = str_replace('/', '', $area_info['area_str']);
            $search_res = $this->searchByBaidu($search);
            $code_num++;
            // Empty district (level 3) results fall back to the parent's code below.
            if (empty($search_res) && $level == 3) {
                $update_empty[] = $value;
            }
        } else {
            $search_res = '';
        }
        $update_data[] = [
            'id'   => $value,
            'code' => $search_res,
        ];
    }
    $update_res = $Region->saveAll($update_data);
    // Fill empty district codes with the parent (city) code.
    if (!empty($update_empty)) {
        $update_empty_data = [];
        foreach ($update_empty as $value) {
            $pid = $Region->where('id', $value)->value('pid');
            $pid_code = $Region->where('id', $pid)->value('code') ?: '';
            $update_empty_data[] = [
                'id'   => $value,
                'code' => $pid_code,
            ];
        }
        // Result intentionally unused (the original stored it in a dead variable).
        $Region->saveAll($update_empty_data);
    }
    return [
        'all_num'        => count($update_data),
        'update_res_num' => count($update_res),
        'code_num'       => $code_num,
        'empty_num'      => count($update_empty),
    ];
}
/**
 * Look up the postal code of an area via Baidu's opendata endpoint.
 *
 * @param string $area_name area name (UTF-8); converted to GB2312 for the query
 * @return mixed 6-digit postal code string, or '' when none was found
 * @throws \Exception when the HTTP request fails
 * @author llj <[email protected]>
 */
public function searchByBaidu($area_name)
{
    // The endpoint expects a GB2312-encoded, URL-encoded query string.
    $area_name = urlencode(mb_convert_encoding($area_name, 'gb2312'));
    $url = 'http://opendata.baidu.com/post/s?wd=' . $area_name . '&rn=20';
    $ch = curl_init();
    $timeout = 10;
    curl_setopt($ch, CURLOPT_URL, $url);
    // curl resolves IPv6 before IPv4; force IPv4 to avoid DNS-related hangs.
    curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36');
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // BUG FIX: only the connect timeout was set, so a stalled transfer could
    // hang forever; bound the whole request as well.
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    $html = curl_exec($ch);
    if (curl_errno($ch)) {
        // BUG FIX: close the handle before throwing so it is not leaked.
        $error = curl_error($ch);
        curl_close($ch);
        throw new \Exception('ERROR:' . $error);
    }
    curl_close($ch);
    // The response is usually GBK; normalise to UTF-8 before matching.
    $coding = mb_detect_encoding($html);
    if ($coding != "UTF-8" || !mb_check_encoding($html, "UTF-8")) {
        $html = mb_convert_encoding($html, 'utf-8', 'GBK,UTF-8,ASCII');
    }
    // The postal code sits inside the "baidu-tc" <ul> block.
    $pattern = '/<ul><!-- baidu-tc begin.*?baidu-tc end --><\/ul>/';
    preg_match($pattern, $html, $matches);
    if (empty($matches)) {
        return '';
    }
    // The first 6-digit run inside that block is the postal code.
    preg_match('/[0-9]{6}/', $matches[0], $matches_res);
    return (is_array($matches_res) && !empty($matches_res)) ? $matches_res[0] : '';
}