php生成百度换量 XML文件

百度开放平台提交资源的方式主要有两种,一种是将数据生成xml文件提交,另一种是使用开放平台的所见即所得方式直接提交。

下面介绍如何用 PHP 生成符合开放平台规范要求的 XML 文件。

  1. 首先看下我们都需要提交的东西,在百度的分布情况

这里写图片描述

  2. 然后再看具体的xml文件格式

这里写图片描述

  3. 然后我们进行php程序的书写
<?php
// Runtime setup for a long-running command-line style job.

// No execution time limit: the script fetches remote pages one by one.
set_time_limit(0);
// Report only fatal run-time errors and parse errors.
error_reporting(E_PARSE | E_ERROR);
// Timezone used by date() further down the script.
// NOTE(review): POSIX-style "Etc/GMT-8" identifiers have an inverted sign,
// so this should be UTC+8 — confirm against the PHP timezone list.
date_default_timezone_set('Etc/GMT-8');

/**
 * Fetch a URL with cURL and return the response body.
 *
 * Despite the function name, CURLOPT_POST is explicitly set to false, so the
 * request is sent as a regular (non-POST) request with a desktop browser
 * User-Agent.
 *
 * @param string $sUrl Address to fetch.
 *
 * @return string|false The response body, or false when the cURL handle
 *                      cannot be created or the transfer fails/times out.
 */
function post($sUrl){
    $oCurl = curl_init();
    if ($oCurl === false) {
        return false;
    }

    $header = array("Content-type: application/x-www-form-urlencoded");
    $user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36";

    curl_setopt($oCurl, CURLOPT_URL, $sUrl);
    curl_setopt($oCurl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($oCurl, CURLOPT_USERAGENT, $user_agent);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($oCurl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($oCurl, CURLOPT_POST, false);
    // The calling script disables PHP's own time limit, so without these
    // bounds a single unresponsive host would stall the whole run forever.
    curl_setopt($oCurl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($oCurl, CURLOPT_TIMEOUT, 30);

    $sContent = curl_exec($oCurl);
    curl_close($oCurl);

    return $sContent;
}

// Start of the XML output: the declaration line followed by the opening
// <DOCUMENT> root element, terminated with CRLF.
$xml = '<?xml version="1.0" encoding="utf-8" ?>' . "\n"
     . '<DOCUMENT>' . "\r\n";
// Keyword/URL pairs to export, one record per <item>.
// Each record must be its own array: repeating the 'name' and 'url' keys in
// a single flat array keeps only the last pair, and the loop below would then
// iterate over two plain strings instead of records.
$lines = array(
    array('name'=>'日喀则地区装修公司排名','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name'=>'日喀则地区装修设计公司','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name'=>'日喀则地区装饰','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangshi/'),
    array('name'=>'日喀则地区装饰公司','url'=>'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
);

// Converts a scraped HTML fragment to plain text: tags are removed first,
// then entities such as &amp; or &nbsp; are decoded.
$toPlainText = function ($html) {
    return html_entity_decode(strip_tags((string) $html), ENT_QUOTES, 'UTF-8');
};

// Escapes plain text so it can be placed inside an XML element. Invalid byte
// sequences are substituted rather than making the whole value empty.
$toXml = function ($text) {
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

// For every keyword/URL record: read the page's <title> and meta description,
// fall back to Baidu's search result for that URL when the page cannot be
// parsed, then append one <item> element to $xml.
foreach ($lines as $loop) {
    // Ignore malformed records instead of failing on a missing index.
    if (!is_array($loop) || !isset($loop['name'], $loop['url'])) {
        continue;
    }

    $key = $loop['name'];
    $thisurl = $loop['url'];

    // Reset all per-record state so nothing leaks from the previous record.
    $edengTitle = "";
    $edengDesc = "";
    $baiduTitle = "";
    $baiduContent = "";
    $baiduCache = "";

    $edengStr = (string) post($thisurl); // fetch the target page

    $edengPreg = '#<head>[\s\S.]*?<title>(.*?)</title>[\s\S.]*?<meta name="Description" content="(.*?)" />';
    $edengPreg .= '[\s\S.]*?</head>#i';

    if (preg_match($edengPreg, $edengStr, $edengMat) === 1) {
        $edengTitle = $toPlainText($edengMat[1]);
        $edengDesc = $toPlainText($edengMat[2]);
    } else {
        // The page itself gave nothing usable: ask Baidu about the URL.
        // The URL is encoded so its own characters cannot alter the query.
        $baiduUrl = "http://www.baidu.com/s?wd=".urlencode($thisurl);
        $baiduUrl .= "&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=baiduhome_pg&rsv_n=2&rsv_sug3=1&rsv_sug4=251&inputT=3150";

        $sContent = (string) post($baiduUrl);

        // Captures: 1 title, 2 abstract, 3 display URL, 4 date, 5 snapshot link.
        $infoPreg = '#<h3 class="t"><a[^>]*?>(.*?)</a>.*?</h3>.*?<div class=\"c-abstract\">(.*?)</div>';
        $infoPreg .= '<div class="f13">.*?<span class="g">(.*?)&nbsp;(\d{4}-\d{1,2}-\d{1,2})&nbsp;</span>';
        $infoPreg .= '.*?<a data-nolog href="([^>]*?)"[^>]*?>百度快照</a>#i';

        if (preg_match($infoPreg, $sContent, $matches) === 1) {
            $baiduTitle = $toPlainText($matches[1]);
            $baiduContent = $toPlainText($matches[2]);

            // The href is an HTML attribute, so decode entities before
            // splitting its query string on "&".
            $baiduCache = html_entity_decode($matches[5], ENT_QUOTES, 'UTF-8');

            // Reduce the snapshot link to its "m=<value>" parameter when one
            // is present; otherwise the full link is kept.
            $baiduCacheArr = explode("?", $baiduCache);
            if (isset($baiduCacheArr[1])) {
                foreach (explode("&", $baiduCacheArr[1]) as $loopCacheParm) {
                    $loopArr = explode("=", $loopCacheParm);
                    if ($loopArr[0] == "m" && isset($loopArr[1])) {
                        $baiduCache = "m=".$loopArr[1];
                    }
                }
            }
        }
    }

    // Use Baidu's title/abstract only when the page's own value is blank.
    if (trim($edengTitle) === "") {
        $edengTitle = $baiduTitle;
    }
    if (trim($edengDesc) === "") {
        $edengDesc = $baiduContent;
    }
    $baiduDate = date("Y-m-d");

    // Escape every interpolated value so "&", "<" or quotes in scraped text
    // cannot produce a malformed XML document.
    $xmlKey = $toXml($key);
    $xmlUrl = $toXml($thisurl);
    $xmlTitle = $toXml($edengTitle);
    $xmlDesc = $toXml($edengDesc);
    $xmlCache = $toXml($baiduCache);

    $xml .= <<<isEOF
<item>
            <key>{$xmlKey}</key>
            <display>
                <url>{$xmlUrl}</url>
                <title>{$xmlTitle}</title>
                <content>{$xmlDesc}</content>
                <showurl>{$xmlUrl}</showurl>
                <date>{$baiduDate}</date>
                <capture>{$xmlCache}</capture>
            </display>
        </item>

isEOF;
    // Pause between records to avoid hammering the remote servers.
    sleep(1);
}

// Close the root element and write the finished document to baidu_open.xml
// in the current working directory.
$xml .= '</DOCUMENT>';
file_put_contents('baidu_open.xml', $xml);
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章