php生成百度換量 XML文件

百度開放平臺提交資源的方式主要有兩種,一種是將數據生成xml文件提交,另一種是使用開放平臺的所見即所得方式直接提交。

下面說明如何用 PHP 生成符合開放平臺規範要求的 xml 文件。

  1. 首先看下我們都需要提交的東西,在百度的分佈情況

這裏寫圖片描述

  2. 然後再看具體的xml文件格式

這裏寫圖片描述

  3. 然後我們進行php程序的書寫
<?php
// The script fetches remote pages one by one and pauses between requests,
// so the execution time limit is lifted entirely.
set_time_limit(0);
// Report only fatal run-time errors and parse errors; warnings and notices
// are not reported.
error_reporting(E_ERROR | E_PARSE);
// "Etc/GMT-8" uses the POSIX sign convention, i.e. it denotes UTC+8.
// It determines the value date() produces for the <date> element.
date_default_timezone_set('Etc/GMT-8');

/**
 * Fetch a URL with cURL and return the response body.
 *
 * Despite its name, this helper issues an HTTP GET: CURLOPT_POST is
 * explicitly disabled. The name is kept so existing callers keep working.
 *
 * @param string $sUrl Absolute URL to download.
 * @return string|false The response body, or false when the handle cannot be
 *                      created or the transfer fails / times out.
 */
function post($sUrl){
    $oCurl = curl_init();
    if ($oCurl === false) {
        return false;
    }

    $header = array("Content-type: application/x-www-form-urlencoded");
    // Browser-like user agent sent with every request.
    $user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36";

    curl_setopt($oCurl, CURLOPT_URL, $sUrl);
    curl_setopt($oCurl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($oCurl, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($oCurl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($oCurl, CURLOPT_POST, false);
    // The script runs with set_time_limit(0); without these limits a single
    // unresponsive server would block the whole run indefinitely.
    curl_setopt($oCurl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($oCurl, CURLOPT_TIMEOUT, 30);

    $sContent = curl_exec($oCurl);
    curl_close($oCurl);

    return $sContent;
}

// Build the XML document.
// Opening part of the output: the XML declaration and the <DOCUMENT> root
// element, followed by a CRLF (the \r\n escapes are expanded by the heredoc).
$xml = <<<isEOF
<?xml version="1.0" encoding="utf-8" ?>
<DOCUMENT>\r\n
isEOF;

// Keyword / landing-page pairs to export. Each entry must be its own array:
// repeating the 'name' and 'url' keys inside one array literal makes later
// pairs overwrite earlier ones, leaving a single two-element map.
$lines = array(
    array('name'=>'日喀則地區裝修公司排名','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name'=>'日喀則地區裝修設計公司','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name'=>'日喀則地區裝飾','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangshi/'),
    array('name'=>'日喀則地區裝飾公司','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
);

// Escapes a value for use as XML element text. ENT_SUBSTITUTE replaces
// invalid UTF-8 sequences instead of turning the whole string into "".
$xmlEscape = function ($text) {
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
};

// For every keyword/URL pair: read title and description from the page's
// <head>; if that fails, fall back to the first Baidu search result for the
// URL; then append one <item> element to $xml.
foreach ($lines as $loop) {
    $key = $loop['name'];
    $thisurl = $loop['url'];

    // Reset per-entry state so nothing leaks in from the previous iteration.
    $edengTitle = "";
    $edengDesc = "";
    $baiduTitle = "";
    $baiduContent = "";
    $baiduCache = "";

    $edengStr = post($thisurl); // fetch the target page

    $edengPreg = '#<head>[\s\S.]*?<title>(.*?)</title>[\s\S.]*?<meta name="Description" content="(.*?)" />';
    $edengPreg .= '[\s\S.]*?</head>#i';

    // Extract title and description; only the first match is needed.
    $edengFound = is_string($edengStr) && preg_match($edengPreg, $edengStr, $edengMat) === 1;

    if ($edengFound) {
        // The captured text is HTML, so decode entities once here and
        // escape for XML once when writing the item.
        $edengTitle = html_entity_decode($edengMat[1], ENT_QUOTES, 'UTF-8');
        $edengDesc = html_entity_decode($edengMat[2], ENT_QUOTES, 'UTF-8');
    } else {
        // The URL is a query-string value and must be URL-encoded.
        $baiduUrl = "http://www.baidu.com/s?wd=".urlencode($thisurl);
        $baiduUrl .= "&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=baiduhome_pg&rsv_n=2&rsv_sug3=1&rsv_sug4=251&inputT=3150";

        $sContent = post($baiduUrl);

        $infoPreg = '#<h3 class="t"><a[^>]*?>(.*?)</a>.*?</h3>.*?<div class=\"c-abstract\">(.*?)</div>';
        $infoPreg .= '<div class="f13">.*?<span class="g">(.*?)&nbsp;(\d{4}-\d{1,2}-\d{1,2})&nbsp;</span>';
        $infoPreg .= '.*?<a data-nolog href="([^>]*?)"[^>]*?>百度快照</a>#i';

        if (is_string($sContent) && preg_match($infoPreg, $sContent, $matches) === 1) {
            $baiduTitle = html_entity_decode(strip_tags($matches[1]), ENT_QUOTES, 'UTF-8');
            $baiduContent = html_entity_decode(strip_tags($matches[2]), ENT_QUOTES, 'UTF-8');
            // The snapshot link is an HTML attribute value; decode it so the
            // query string can be split on plain "&".
            $baiduCache = html_entity_decode($matches[5], ENT_QUOTES, 'UTF-8');

            // Reduce the snapshot link to its "m" query parameter when it
            // has one; otherwise the full link is kept.
            $baiduCacheArr = explode("?", $baiduCache);
            if (isset($baiduCacheArr[1])) {
                foreach (explode("&", $baiduCacheArr[1]) as $loopCacheParm) {
                    $loopArr = explode("=", $loopCacheParm);
                    if ($loopArr[0] == "m") {
                        $baiduCache = "m=".(isset($loopArr[1]) ? $loopArr[1] : "");
                    }
                }
            }
        }
    }

    // Fall back to the Baidu values when the page yielded nothing usable.
    if (trim((string) $edengTitle) === "") {
        $edengTitle = $baiduTitle;
    }
    if (trim((string) $edengDesc) === "") {
        $edengDesc = $baiduContent;
    }
    $baiduDate = date("Y-m-d");

    // Escape everything that is interpolated into the XML so characters
    // such as "&" or "<" cannot produce a malformed document.
    $xmlKey = $xmlEscape($key);
    $xmlUrl = $xmlEscape($thisurl);
    $xmlTitle = $xmlEscape($edengTitle);
    $xmlDesc = $xmlEscape($edengDesc);
    $xmlCache = $xmlEscape($baiduCache);

    $xml .= <<<isEOF
<item>
            <key>{$xmlKey}</key>
            <display>
                <url>{$xmlUrl}</url>
                <title>{$xmlTitle}</title>
                <content>{$xmlDesc}</content>
                <showurl>{$xmlUrl}</showurl>
                <date>{$baiduDate}</date>
                <capture>{$xmlCache}</capture>
            </display>
        </item>

isEOF;
    // Pause between entries to avoid sending requests back to back.
    sleep(1);
}

// Close the root element and write the finished document to disk.
$xml .= <<<isEOF
</DOCUMENT>
isEOF;
// Warnings are not reported by this script, so a failed write would go
// unnoticed unless the return value is checked.
if (file_put_contents("baidu_open.xml", $xml) === false) {
    error_log("Failed to write baidu_open.xml");
    exit(1);
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章