PHP獲取郵政編碼【附2018地區行政表】

爬蟲

從互聯網上按照一定的邏輯和算法抓取和下載互聯網的網頁,是搜索引擎的一個重要組成部分。一般的爬蟲從一部分start url 開始,按照一定的策略開始爬取,爬取到的新url再放到爬取隊列中,然後進行新一輪的爬取,直到抓取完畢爲止。

PHP snoopy

  • 頁面解析(字符串處理)

phpcrawler庫

參考以下網址:
http://blog.csdn.net/flitrue/article/details/51004491

<?php

// Crawling a site may take a while, so raise the script's execution time limit.
set_time_limit(10000);

// Load the PHPCrawl main class. require_once stops immediately with a clear
// error when the library file is missing, instead of only emitting a warning
// and failing later when CuteCrawler tries to extend the undefined class.
require_once "libs/PHPCrawler.class.php";

// Extend the class and override the handleDocumentInfo()-method 
/**
 * Crawler that reports every requested document and extracts song-list data.
 *
 * For each document a status line is printed. When the document URL matches
 * the single song-list pattern, the list's metadata and song titles are
 * extracted from the page and dumped with print_r().
 */
class CuteCrawler extends PHPCrawler
{
    /**
     * Overrides PHPCrawler::handleDocumentInfo() to report and parse a document.
     *
     * @param object $DocInfo Document information; this method reads its url,
     *                        http_status_code, received and bytes_received
     *                        properties.
     */
    function handleDocumentInfo($DocInfo) {
        // Line break for output: "\n" in CLI mode, "<br />" otherwise.
        if (PHP_SAPI == "cli") {
            $lb = "\n";
        } else {
            $lb = "<br />";
        }

        // Print the URL and the HTTP status code.
        echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . $lb;

        // Print whether the content of the document was received.
        if ($DocInfo->received == true) {
            echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
        } else {
            echo "Content not received" . $lb;
        }

        // Only single song-list pages are parsed; every other page is just reported.
        $url = $DocInfo->url;
        $pat = "/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/";
        if (preg_match($pat, $url) > 0) {
            $this->parseSonlistDetail($DocInfo);
        }

        flush();
    }

    /**
     * Extracts name, creator, update time, description and song titles from a
     * song-list page and prints the resulting array with print_r().
     *
     * A field that cannot be found is reported with an "error:get ... fail"
     * line and left empty ('' for text fields, an empty array for songs).
     *
     * @param object $DocInfo Document information; this method reads its url
     *                        and content properties.
     */
    public function parseSonlistDetail($DocInfo) {
        $page = $DocInfo->content;

        // The captures use [^<]+ ("everything up to the next tag"). The earlier
        // form [^(<br)]+ was a negated character class, so it rejected any value
        // that contained one of the letters or parentheses listed inside it.
        $songlistArr = array();
        $songlistArr['raw_url'] = $DocInfo->url;
        // Name
        $songlistArr['title'] = $this->matchFirstGroup(
            '/<span>名稱:<\/span>([^<]+)<br \/>/', $page, 'title'
        );
        // Creator
        $songlistArr['creator'] = $this->matchFirstGroup(
            '/<span>創建人:<\/span>([^<]+)<br \/>/', $page, 'creator'
        );
        // Update time
        $songlistArr['create_date'] = $this->matchFirstGroup(
            '/<span>更新時間:<\/span>([^<]+)<br \/>/', $page, 'create_time'
        );
        // Description
        $songlistArr['info'] = $this->matchFirstGroup(
            '/<p><span>簡介:<\/span>([^<]+)<\/p>/', $page, 'info'
        );

        // Songs: one entry per matched title; stays an array even when nothing matches.
        $songlistArr['songs'] = array();
        $matches = array();
        $res = preg_match_all('/<span class="text"><i>([^<]+)<\/i><\/span>/', $page, $matches);
        if ($res > 0) {
            foreach ($matches[1] as $song_title) {
                $songlistArr['songs'][] = array('title' => $song_title);
            }
        } else {
            print "error:get songs fail\n";
        }

        print_r($songlistArr);
    }

    /**
     * Returns the first capture group of $pat in $page, or '' when there is no
     * match; a failed match is reported as "error:get <label> fail".
     *
     * @param string $pat   PCRE pattern containing one capture group.
     * @param string $page  Text to search.
     * @param string $label Field name used in the error line.
     * @return string
     */
    private function matchFirstGroup($pat, $page, $label) {
        $matches = array();
        if (preg_match($pat, $page, $matches) > 0) {
            return $matches[1];
        }
        print "error:get " . $label . " fail\n";
        return '';
    }
}

// Instantiate the crawler subclass, configure its behaviour (the PHPCrawl
// class reference lists further options) and then start the crawl.

$crawler = new CuteCrawler();

// Entry point of the crawl.
$start_url = "http://www.kugou.com/yy/html/special.html";
$crawler->setURL($start_url);

// Receive content only for documents with content-type "text/html".
$crawler->addContentTypeReceiveRule("#text/html#");

// Links to follow: the song-list index pages and the single song-list pages.
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/index/1-\d+\-0\.html# i");
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/single/\d+\.html# i");

// Picture links are filtered out and never requested.
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");

// Store and send cookie data the way a browser does.
$crawler->enableCookieHandling(true);

// Traffic limit in bytes: 0 means unlimited. For a test run that should not
// download the whole site, pass e.g. 1000 * 1024 (about 1 MB) instead.
$crawler->setTrafficLimit(0);

// Run the crawl.
$crawler->go();

// When the process has finished, print a short report
// (see getProcessReport() for the available fields).
$report = $crawler->getProcessReport();

$lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

echo "Summary:" . $lb
    . "Links followed: " . $report->links_followed . $lb
    . "Documents received: " . $report->files_received . $lb
    . "Bytes received: " . $report->bytes_received . " bytes" . $lb
    . "Process runtime: " . $report->process_runtime . " sec" . $lb;
?>

百度郵編抓取地區編碼(201807附數據表)

    /**
     * Looks up and stores a postal code for every region.
     *
     * Regions at level 2 or 3 get the result of searchByBaidu() for their area
     * string (with the '/' separators removed); every other region is saved
     * with an empty code. Level-3 regions whose lookup returned nothing are
     * afterwards given the code stored on their parent (pid) region.
     *
     * @return array Counters: 'all_num' (rows submitted for saving),
     *               'update_res_num' (count of the saveAll() result),
     *               'code_num' (lookups performed) and 'empty_num'
     *               (level-3 regions that fell back to the parent code).
     * @throws \think\db\exception\DataNotFoundException
     * @throws \think\db\exception\ModelNotFoundException
     * @throws \think\exception\DbException
     * @throws \Exception
     * @author llj <[email protected]>
     */
    public function setAreaCode()
    {
        $Region = new Region();
        $area_ids = $Region->column('id');
        $update_data = [];
        $update_empty = [];
        $code_num = 0;
        foreach ($area_ids as $value) {
            $level = $Region->where('id', $value)->value('level');
            $search_res = '';
            if (in_array($level, [2, 3])) {
                // The area string is only needed for regions that are searched.
                // NOTE(review): assumes getAllByArea() is a side-effect-free lookup — confirm.
                $area_info = $Region->getAllByArea($value, false);
                $search = str_replace('/', '', $area_info['area_str']);
                $search_res = $this->searchByBaidu($search);
                $code_num++;
                // A level-3 region without a result falls back to its parent's code below.
                if (empty($search_res) && $level == 3) {
                    $update_empty[] = $value;
                }
            }
            $update_data[] = [
                'id' => $value,
                'code' => $search_res,
            ];
        }
        $update_res = $Region->saveAll($update_data);

        // Fill regions that have no code with the code of their parent (pid).
        // This runs after the save above so the parents' new codes can be read back.
        if (!empty($update_empty)) {
            $update_empty_data = [];
            foreach ($update_empty as $value) {
                $pid = $Region->where('id', $value)->value('pid');
                $pid_code = $Region->where('id', $pid)->value('code') ?: '';
                $update_empty_data[] = [
                    'id' => $value,
                    'code' => $pid_code,
                ];
            }
            $Region->saveAll($update_empty_data);
        }

        return [
            'all_num' => count($update_data),
            'update_res_num' => count($update_res),
            'code_num' => $code_num,
            'empty_num' => count($update_empty),
        ];
    }

    /**
     * Searches Baidu's open-data postcode service for an area name.
     *
     * @param string $area_name Area name to search for, as UTF-8 text.
     * @return string The first six-digit number found in the result list, or ''
     *                when the result list is missing or contains none.
     * @throws \Exception When the cURL request fails (including timeouts).
     * @author llj <[email protected]>
     */
    public function searchByBaidu($area_name)
    {
        // The query is sent GB2312-encoded; the source encoding is named explicitly
        // so the conversion does not depend on the configured mbstring internal encoding.
        $area_name = urlencode(mb_convert_encoding($area_name, 'gb2312', 'UTF-8'));
        $url = 'http://opendata.baidu.com/post/s?wd='.$area_name.'&rn=20';
        $timeout = 10;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // cURL resolves IPv6 before IPv4, so force IPv4; if requests still time
        // out, check that DNS resolution is working.
        curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36');
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        // Bound the whole transfer as well: without it a connection that stalls
        // after connecting would block the caller indefinitely.
        curl_setopt($ch, CURLOPT_TIMEOUT, 3 * $timeout);

        $html = curl_exec($ch);
        // Read the error state before closing so the handle is released on
        // the failure path too.
        $errno = curl_errno($ch);
        $error = curl_error($ch);
        curl_close($ch);
        if ($errno) {
            throw new \Exception('ERROR:'. $error);
        }

        // Normalise the response to UTF-8 when it is not already valid UTF-8.
        $coding = mb_detect_encoding($html);
        if ($coding != "UTF-8" || !mb_check_encoding($html, "UTF-8")){
            $html = mb_convert_encoding($html, 'utf-8', 'GBK,UTF-8,ASCII');
        }

        // Isolate the result list, then take the first six-digit run inside it.
        $pattern = '/<ul><!-- baidu-tc begin.*?baidu-tc end --><\/ul>/';
        if (preg_match($pattern, $html, $matches) !== 1) {
            return '';
        }
        if (preg_match('/[0-9]{6}/', $matches[0], $matches_res) !== 1) {
            return '';
        }
        return $matches_res[0];
    }

地區行政表

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章