百度開放平臺提交資源的方式主要有兩種,一種是將數據生成xml文件提交,另一種是使用開放平臺的所見即所得方式直接提交。
下面介紹如何用PHP來生成符合開放平臺要求規範的xml文件:
- 首先看下我們都需要提交的東西,在百度的分佈情況
- 然後再看具體的xml文件格式
- 然後我們進行php程序的書寫
<?php
// Report only fatal and parse errors; warnings and notices are silenced.
error_reporting(E_ERROR | E_PARSE);

// Remove the execution time limit for this script.
set_time_limit(0);

// All date() calls use this zone. Etc/GMT±N names carry an inverted sign,
// so 'Etc/GMT-8' is eight hours ahead of UTC.
date_default_timezone_set('Etc/GMT-8');
/**
 * Fetch a URL through cURL and return the response body.
 *
 * Despite its name, this function does not send a POST: CURLOPT_POST is
 * explicitly disabled, so the request goes out as a GET with a desktop
 * Chrome user agent.
 *
 * @param string $sUrl URL to request.
 * @return string|false Response body, or false when the cURL handle cannot
 *                      be created or the transfer fails.
 */
function post($sUrl){
    $oCurl = curl_init();
    if ($oCurl === false) {
        return false;
    }

    curl_setopt_array($oCurl, array(
        CURLOPT_URL => $sUrl,
        CURLOPT_HTTPHEADER => array("Content-type: application/x-www-form-urlencoded"),
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36",
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_POST => false,
        // The script runs with no PHP time limit, so bound each transfer
        // here; otherwise a single stalled host blocks the run forever.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
    ));

    $sContent = curl_exec($oCurl);
    curl_close($oCurl);

    return $sContent;
}
// Build the XML: start with the declaration and the opening <DOCUMENT> root.
// The heredoc expands \r\n, so the root tag is followed by a CRLF.
$xml = <<<XML_HEAD
<?xml version="1.0" encoding="utf-8" ?>
<DOCUMENT>\r\n
XML_HEAD;
// Keyword / landing-page pairs to submit. Each entry is its own record:
// a single flat array would repeat the 'name' and 'url' keys, and PHP keeps
// only the last value for a duplicated key.
$lines = array(
    array('name' => '日喀則地區裝修公司排名', 'url' => 'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name' => '日喀則地區裝修設計公司', 'url' => 'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
    array('name' => '日喀則地區裝飾', 'url' => 'http://xizang.edeng.cn/73/rikaze/73/zhuangshi/'),
    array('name' => '日喀則地區裝飾公司', 'url' => 'http://xizang.edeng.cn/73/rikaze/73/zhuangxiugongsi/'),
);
// Flags used when writing text into XML element content.
// NOTE(review): assumes the fetched pages are UTF-8. ENT_SUBSTITUTE replaces
// invalid byte sequences instead of blanking the whole value; confirm the
// real page encoding.
$xmlFlags = ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE;

// For every keyword entry: fetch the landing page and extract its <title> and
// meta description. When the page yields no match, fall back to the Baidu
// search result for that URL. Then append one <item> element to $xml.
foreach ($lines as $loop) {
    // Skip malformed entries (anything that is not an array with both keys).
    if (!is_array($loop) || !isset($loop['name'], $loop['url'])) {
        continue;
    }
    $key = $loop['name'];
    $thisurl = $loop['url'];

    // Reset per-entry state so nothing leaks in from the previous iteration.
    $edengTitle = "";
    $edengDesc = "";
    $baiduTitle = "";
    $baiduContent = "";
    $baiduCache = "";

    // Fetch the landing page and look for <title> plus the Description meta tag.
    $edengStr = post($thisurl);
    $edengPreg = '#<head>[\s\S.]*?<title>(.*?)</title>[\s\S.]*?<meta name="Description" content="(.*?)" />';
    $edengPreg .= '[\s\S.]*?</head>#i';

    // post() may return false; the cast keeps the regex subject a string.
    if (preg_match($edengPreg, (string) $edengStr, $edengMat) === 1) {
        $edengTitle = $edengMat[1];
        $edengDesc = $edengMat[2];
    } else {
        // Nothing usable on the page itself: search Baidu for the URL and
        // take the title, abstract and snapshot link from the first result.
        $baiduUrl = "http://www.baidu.com/s?wd=" . urlencode($thisurl);
        $baiduUrl .= "&rsv_spt=1&issp=1&rsv_bp=0&ie=utf-8&tn=baiduhome_pg&rsv_n=2&rsv_sug3=1&rsv_sug4=251&inputT=3150";
        $sContent = post($baiduUrl);

        $infoPreg = '#<h3 class="t"><a[^>]*?>(.*?)</a>.*?</h3>.*?<div class=\"c-abstract\">(.*?)</div>';
        $infoPreg .= '<div class="f13">.*?<span class="g">(.*?) (\d{4}-\d{1,2}-\d{1,2}) </span>';
        $infoPreg .= '.*?<a data-nolog href="([^>]*?)"[^>]*?>百度快照</a>#i';

        if (preg_match($infoPreg, (string) $sContent, $matches) === 1) {
            $baiduTitle = strip_tags($matches[1]);
            $baiduContent = strip_tags($matches[2]);
            $baiduCache = $matches[5];

            // Reduce the snapshot link to its "m" query parameter when it has
            // one; otherwise the full link is kept.
            $baiduCacheArr = explode("?", $baiduCache);
            if (isset($baiduCacheArr[1])) {
                foreach (explode("&", $baiduCacheArr[1]) as $loopCacheParm) {
                    $loopArr = explode("=", $loopCacheParm);
                    if ($loopArr[0] === "m") {
                        $nowVal = isset($loopArr[1]) ? $loopArr[1] : "";
                        $baiduCache = "m=" . $nowVal;
                    }
                }
            }
        }
    }

    // Prefer the page's own title/description; use Baidu's when blank.
    if (trim($edengTitle) === "") {
        $edengTitle = $baiduTitle;
    }
    if (trim($edengDesc) === "") {
        $edengDesc = $baiduContent;
    }
    $baiduDate = date("Y-m-d");

    // Scraped values are HTML text: decode their entities first, then escape
    // for XML, so "&amp;" does not become "&amp;amp;" and a bare "&" or "<"
    // cannot break the document. Literal values only need escaping.
    $xmlKey = htmlspecialchars($key, $xmlFlags, 'UTF-8');
    $xmlUrl = htmlspecialchars($thisurl, $xmlFlags, 'UTF-8');
    $xmlTitle = htmlspecialchars(html_entity_decode($edengTitle, ENT_QUOTES | ENT_HTML5, 'UTF-8'), $xmlFlags, 'UTF-8');
    $xmlDesc = htmlspecialchars(html_entity_decode($edengDesc, ENT_QUOTES | ENT_HTML5, 'UTF-8'), $xmlFlags, 'UTF-8');
    $xmlCache = htmlspecialchars(html_entity_decode($baiduCache, ENT_QUOTES | ENT_HTML5, 'UTF-8'), $xmlFlags, 'UTF-8');

    // The heredoc body and its closing label stay at column 0 so the emitted
    // bytes carry no indentation and the code parses on older PHP versions.
    $xml .= <<<isEOF
<item>
<key>{$xmlKey}</key>
<display>
<url>{$xmlUrl}</url>
<title>{$xmlTitle}</title>
<content>{$xmlDesc}</content>
<showurl>{$xmlUrl}</showurl>
<date>{$baiduDate}</date>
<capture>{$xmlCache}</capture>
</display>
</item>
isEOF;

    // Pause one second before the next entry (presumably to throttle requests).
    sleep(1);
}
// Close the <DOCUMENT> root element.
$xml .= '</DOCUMENT>';

// Write the result to disk. Warnings are masked by the script's
// error_reporting level, so a failed write would otherwise go unnoticed;
// raise an error instead.
if (file_put_contents("baidu_open.xml", $xml) === false) {
    throw new RuntimeException('Unable to write baidu_open.xml');
}