多餘的話先不說,先貼代碼:
spi.php
<?php
#SPI Version 1.0
#Author: mrn6 from csdn.net--https://me.csdn.net/qq_21264377
if ($_SERVER["REQUEST_METHOD"] == "POST") {
require_once "document.php";
# Matches an image URL ending in .jpg, .jpeg, .png, .gif, .webp or .bmp.
# The dot before the extension is escaped so that it only matches a literal
# "." (previously "/upload/ajpg" was accepted as an image URL).
$imgSrcPattern = '/[^>\"\']+\.jpg|[^>\"\']+\.jpeg|[^>\"\']+\.png|[^>\"\']+\.gif|[^>\"\']+\.webp|[^>\"\']+\.bmp/';
# Matches an absolute http/https URL.
$linkSrcPattern = '/http[s]?:\/\/[^>\"\']+/';
# Matches a quoted href attribute, e.g. href="...".
$linkTagPattern = '/href=[\'"]{1}[^<>"\']+[\'"]{1}/';
# Matches an absolute URL of a JavaScript file (literal ".js").
$jsSrcPattern = '/http[s]?:\/\/[^>\"\']+\.js[^>\"\']*/';
# Matches an <img> tag.
$imgPattern = '/<img[^>]*?>/';
# Matches an <img> tag that carries an alt attribute.
$altPattern = '/<img[^>]*alt=[^>]+>/';
# Matches a <meta> tag that declares a charset.
$metaCharsetPattern = '/<meta[^>]*charset=[^>]+>/';
# Extracts the charset=... part from a match of the previous pattern.
$charsetPattern = '/charset=[a-zA-Z0-9]+/';
# Matches the <title> element.
$titlePattern = '/<title>[^>]*<\/title>/';
# Extracts the text content from a match of the previous pattern.
$titleSrcPattern = '/[^<>]+/';
# Matches the ETag response header, which is used as a file name.
$tagFilePattern = '/ETag:[ ]*"[^<\"\']+"/';
# Meant to extract the file name from a match of the previous pattern.
# NOTE(review): this is an empty pattern and is not referenced by the code
# visible here - looks unfinished.
$filePattern = '//';
# Matches the Content-Type response header (type/subtype).
$contentTypePattern = '/Content-Type:[ ]*[a-zA-Z0-9]+[\/][a-zA-Z0-9]+/';
/**
 * Return the scheme and host part of a URL, e.g. "https://www.abc.com" for
 * "https://www.abc.com/pic/1.html".
 *
 * A URL without an http/https prefix is assumed to be plain http. The host
 * ends at the first "/", "?" or "#"; a URL with no path (such as
 * "https://www.abc.com") is returned unchanged instead of being reduced to
 * the bare scheme.
 *
 * @param string $source URL to inspect.
 * @return string Scheme plus host, without a trailing slash.
 */
function getHost($source)
{
    $schema = 'http://';
    $host = (string) $source;
    if (strpos($host, 'https://') === 0) {
        $schema = 'https://';
        $host = (string) substr($host, 8);
    } elseif (strpos($host, 'http://') === 0) {
        $host = (string) substr($host, 7);
    }
    // strcspn() yields the full length when no delimiter exists, so a URL
    // without a path keeps its whole host name.
    $length = strcspn($host, '/?#');
    return $schema . substr($host, 0, $length);
}
/**
 * Fetch a URL with cURL and return the raw response (headers + body)
 * converted to UTF-8.
 *
 * The HTTP status code is echoed (green for 200, red otherwise) as progress
 * output. Only http/https transfers are allowed, including after redirects.
 *
 * @param string   $url   URL to fetch; also sent as the Referer header.
 * @param int|bool $https Truthy when $url is an https URL.
 * @return string Response text, or 'unknown' when the transfer failed.
 */
function getUrlContent($url, $https = 0)
{
    $output = 'unknown';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    $headers = array(
        'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Referer: ' . $url
    );
    if ($https) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        // 2 = verify that the certificate matches the host name. The value
        // 1 (what "true" is coerced to) is not a valid setting any more.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    }
    // The URL comes from user input / scraped pages: never let cURL follow
    // it to file://, gopher:// and similar schemes.
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    // CURLOPT_AUTOREFERER is a boolean switch, not a URL.
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);

    $response = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($response !== false) {
        $output = mb_convert_encoding($response, 'UTF-8', 'UTF-8, GBK, GB2312, BIG5');
    }
    if ($response !== false && $code === 200) {
        echo ":=<font color=green>200</font>";
    } else {
        echo ":=<font color=red>" . $code . "</font>";
    }
    curl_close($ch);

    // ob_flush() raises a notice when no output buffer is active, and the
    // PHP buffer has to be flushed before the SAPI buffer.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
    return $output;
}
/**
 * Download one resource and store it in the cache directory.
 *
 * The target file name is "<$tmpfile>_<Ym>_<name>[.<subtype>]", where <name>
 * is taken from the ETag response header or, failing that, from the last
 * path segment of the URL (md5 of the URL as a last resort). Bodies smaller
 * than 10 KiB and files that already exist are skipped. A "." is echoed per
 * call as progress output.
 *
 * @param string   $source  Resource URL; relative URLs are resolved against
 *                          the directory of the global $origin page.
 * @param string   $header  URL sent as the Referer header.
 * @param string   $tmpfile Path prefix of the file to write.
 * @param int|bool $https   Truthy when $source is an https URL.
 * @return void
 */
function wget($source, $header, $tmpfile, $https = 0)
{
    global $origin;
    $source = urlpadding($source, getCurrentDirectory($origin));

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $source);
    $headers = array(
        'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Referer: ' . $header
    );
    if ($https) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        // 2 = verify that the certificate matches the host name.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    }
    // Scraped URLs are untrusted: restrict transfers to http/https.
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    // CURLOPT_AUTOREFERER is a boolean switch, not a URL.
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_setopt($ch, CURLOPT_NOBODY, false); // the response body is needed
    $response = curl_exec($ch);

    // Split the response into header and body.
    $responseHeader = '';
    $body = '';
    if ($response !== false && (int) curl_getinfo($ch, CURLINFO_HTTP_CODE) === 200) {
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $responseHeader = (string) substr($response, 0, $headerSize);
        $body = (string) substr($response, $headerSize);
        // After redirects the header section holds one block per hop; only
        // the last block describes the body that was actually received.
        $blocks = preg_split('/\r?\n\r?\n/', trim($responseHeader));
        if (is_array($blocks) && count($blocks) > 0) {
            $responseHeader = $blocks[count($blocks) - 1];
        }
    }
    curl_close($ch);

    // File extension from the Content-Type subtype. Header names are
    // case-insensitive (HTTP/2 sends them in lower case).
    $ctype = '';
    if (preg_match('/^Content-Type:[ ]*[a-zA-Z0-9]+\/([a-zA-Z0-9]+)/mi', $responseHeader, $found)) {
        $ctype = $found[1];
    }
    if ($ctype == 'jpeg') {
        $ctype = 'jpg';
    }

    // File name: ETag value if present, otherwise the last URL segment.
    if (preg_match('/^ETag:[ ]*"([^<"\']+)"/mi', $responseHeader, $found)) {
        $file = str_replace(':0', '', $found[1]);
    } else {
        $p = strrpos($source, '/');
        // A slash at offset <= 6 belongs to "http://" / "https://".
        $file = ($p > 6) ? (string) substr($source, $p + 1) : '';
        if ($file === '') {
            $file = md5($source);
        }
        $file = urlencode($file);
    }
    // Replace characters that are awkward or unsafe in a file name. "/" and
    // "\" are included because the ETag is server-controlled and must not be
    // able to point outside the cache directory.
    $file = str_replace(array(':', '!', ';', '-', '~', '%2F', '/', '\\'), '_', $file);
    if (strlen($ctype) > 1) {
        $file = $file . '.' . $ctype;
    }
    $file = date('Ym') . '_' . $file;
    $tmpfile = $tmpfile . "_" . $file;

    // Skip cached files and anything below 10 KiB (icons, spacers, errors).
    if (! file_exists($tmpfile) && strlen($body) >= 1024 * 10) {
        file_put_contents($tmpfile, $body);
    }

    echo ".";
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
/**
 * Tell whether a countable collection object is present and non-empty.
 *
 * @param object|null $sources Collection exposing count(), or null.
 * @return bool True when $sources is not null and holds at least one item.
 */
function isArrayObjectSet($sources)
{
    if ($sources === null) {
        return false;
    }
    return $sources->count() > 0;
}
/**
 * Tell whether an array is present and non-empty.
 *
 * @param array|null $sources Array to test, or null.
 * @return bool True when $sources is not null and holds at least one item.
 */
function isArraySet($sources)
{
    if ($sources === null) {
        return false;
    }
    return count($sources) > 0;
}
/**
 * Check whether $source is contained in the collection $targets, i.e.
 * whether the link is already part of the visited history.
 *
 * Elements are read by integer index 0..count-1 and compared loosely (==).
 *
 * @param mixed       $source  Value to look for.
 * @param object|null $targets Indexed collection to search.
 * @return bool True when an equal element exists.
 */
function inArray($source, $targets)
{
    if (! isArrayObjectSet($targets)) {
        return false;
    }
    $total = $targets->count();
    $position = 0;
    while ($position < $total) {
        if ($source == $targets[$position]) {
            return true;
        }
        $position ++;
    }
    return false;
}
/**
 * Convert a mobile-site URL into the desktop URL by turning a leading
 * "m." host label into "www." ("http(s)://m." becomes "http(s)://www.").
 *
 * Only the start of the URL is rewritten. The pattern is anchored, so an
 * "m." later in the string (for example in "item.html") is left alone.
 *
 * @param string $source URL, with or without an http/https prefix.
 * @return string Rewritten URL, or $source unchanged when it is not a
 *                mobile URL.
 */
function mobile2pc($source)
{
    $rewritten = preg_replace('/^(https?:\/\/)?m\./', '${1}www.', $source, 1);
    // preg_replace() returns null on a PCRE error; keep the input then.
    return $rewritten === null ? $source : $rewritten;
}
/**
 * Expand a relative URL into a full URL.
 *
 * - "http://..." / "https://..." are returned unchanged.
 * - "//host/path" (protocol-relative) inherits the scheme of the global $host.
 * - "/path" is appended to the global $host.
 * - "./path", "../path" and "path" are resolved against $currentDirectory;
 *   each leading "../" climbs one directory but never above the host root.
 *
 * @param string      $source           URL or path found in a page.
 * @param string|null $currentDirectory Directory URL of the page that
 *                                      contained $source.
 * @return string Absolute URL.
 */
function urlpadding($source, $currentDirectory)
{
    global $host;
    if (strpos($source, 'http://') === 0 || strpos($source, 'https://') === 0) {
        return $source;
    }
    if (strpos($source, '//') === 0) {
        $scheme = (strpos((string) $host, 'https://') === 0) ? 'https:' : 'http:';
        return $scheme . $source;
    }
    if (strpos($source, '/') === 0) {
        return $host . $source;
    }

    $base = (string) $currentDirectory;
    // Avoid "dir//file", but do not damage a bare "scheme://".
    if (substr($base, - 3) !== '://') {
        $base = rtrim($base, '/');
    }
    // Length of "scheme://host": the directory may not be cut shorter.
    $rootLength = preg_match('/^https?:\/\/[^\/]+/', $base, $root) ? strlen($root[0]) : 0;
    while (true) {
        if (strpos($source, './') === 0) {
            $source = (string) substr($source, 2);
            continue;
        }
        if (strpos($source, '../') === 0) {
            $source = (string) substr($source, 3);
            $cut = strrpos($base, '/');
            if ($cut !== false && $cut > 0 && $cut >= $rootLength) {
                $base = substr($base, 0, $cut);
            }
            continue;
        }
        break;
    }
    return $base . '/' . $source;
}
/**
 * Return the directory part of the URL $source (everything before the last
 * "/", without a trailing slash).
 *
 * Special cases: NULL when the only slash is the first character; $source
 * itself when it has no slash, a single slash, or when the last slash
 * directly follows the first one (as in "https://host").
 *
 * @param string $source URL of the current page.
 * @return string|null Directory URL, $source, or NULL as described above.
 */
function getCurrentDirectory($source)
{
    $tail = strrpos($source, '/');
    if ($tail === 0) {
        return NULL;
    }
    $head = strpos($source, '/');
    if ($tail == $head || $tail == $head + 1) {
        return $source;
    }
    return substr($source, 0, $tail);
}
/**
 * Pick the next link to crawl.
 *
 * Scans the global $unloadedLinks in order and returns the first entry that
 * is not yet in the global $loadedLinks, appending it to $loadedLinks on the
 * way. Nothing is removed from $unloadedLinks.
 *
 * @return string|null Next link, or NULL when every link has been handed out.
 */
function arrayPop()
{
    global $unloadedLinks, $loadedLinks;
    if (isArrayObjectSet($unloadedLinks)) {
        $total = $unloadedLinks->count();
        $position = 0;
        while ($position < $total) {
            $candidate = $unloadedLinks[$position];
            if (! inArray($candidate, $loadedLinks)) {
                // Mark as handed out before returning it.
                $loadedLinks->append($candidate);
                return $candidate;
            }
            $position ++;
        }
    }
    return NULL;
}
/**
 * Append $source to the collection $sources.
 *
 * The collection only has to exist and offer append(); it may be empty.
 * (The previous guard required a non-empty collection, so the first element
 * could never be pushed.)
 *
 * @param mixed       $source  Value to append; null is ignored.
 * @param object|null $sources Collection with an append() method.
 * @return void
 */
function arrayPush($source, $sources)
{
    if (isset($source) && is_object($sources) && method_exists($sources, 'append')) {
        $sources->append($source);
    }
}
/**
 * Collect every link found in the HTML text $source.
 *
 * Two passes are made: first all absolute http/https URLs matched by the
 * global $linkSrcPattern, then all href attributes matched by the global
 * $linkTagPattern ("javascript:" links are skipped). Each link is expanded
 * with urlpadding() and converted with mobile2pc(). Duplicates are kept.
 *
 * @param string $source           HTML text to scan.
 * @param string $currentDirectory Directory URL of the page being scanned.
 * @return ArrayObject List of link URLs.
 */
function getLinks($source, $currentDirectory)
{
    global $linkSrcPattern, $linkTagPattern;
    $found = new ArrayObject();
    $currentDirectory = mobile2pc($currentDirectory);

    // Pass 1: absolute URLs anywhere in the text.
    preg_match_all($linkSrcPattern, $source, $absolute);
    foreach ($absolute[0] as $url) {
        $found->append(urlpadding(mobile2pc($url), $currentDirectory));
    }

    // Pass 2: href attributes; strip the attribute name and the quotes.
    preg_match_all($linkTagPattern, $source, $attributes);
    foreach ($attributes[0] as $attribute) {
        $href = str_replace(array('href=', '\'', '"'), '', $attribute);
        if (strpos($href, 'javascript:') === 0) {
            continue;
        }
        $found->append(mobile2pc(urlpadding($href, $currentDirectory)));
    }
    return $found;
}
/**
 * Score how likely $target is the "next page" of $source.
 *
 * Returns 100 for equal URLs. Otherwise the similar_text() percentage is
 * computed; values outside the open range (80, 100) are returned as they
 * are. Inside that range the result is 99 when $target contains the file
 * name of $source without its extension (e.g. "/2323" for ".../2323.html"),
 * and 81 otherwise.
 *
 * @param string $source Reference URL.
 * @param string $target Candidate URL.
 * @return int|float 100, 99, 81, or the raw similarity percentage.
 */
function distinct($source, $target)
{
    if ($source == $target) {
        return 100;
    }
    $percent = 0;
    similar_text($source, $target, $percent);
    if (! ($percent > 80 && $percent < 100)) {
        return $percent;
    }
    $dot = strrpos($source, '.');
    if (! ($dot > 6)) {
        return 81;
    }
    // "/name" of the reference page, extension removed.
    $stem = substr($source, 0, $dot);
    $name = substr($stem, strrpos($stem, '/'));
    return strpos($target, $name) === false ? 81 : 99;
}
/**
 * Compare every link in $targets with the global $origin URL and queue the
 * ones that look like a "next page" in the global $unloadedLinks.
 *
 * A link is considered only when it is longer than $origin but shorter than
 * twice its length. It is queued when distinct() scores it in [98, 100) and
 * it is not queued already. A summary line is echoed at the end.
 *
 * @param string      $source  URL of the page the links came from.
 * @param ArrayObject $targets Links found on that page.
 * @return void
 */
function compare($source, $targets)
{
    global $unloadedLinks;
    global $origin;
    $total = $targets->count();
    $position = 0;
    while ($position < $total) {
        $candidate = $targets[$position];
        $position ++;
        $originLength = strlen($origin);
        $candidateLength = strlen($candidate);
        if (! ($originLength < $candidateLength && $candidateLength < $originLength * 2)) {
            continue;
        }
        $candidate = urlpadding($candidate, getCurrentDirectory($source));
        $candidate = mobile2pc($candidate);
        $score = distinct($origin, $candidate);
        if ($score >= 98 && $score < 100 && ! inArray($candidate, $unloadedLinks)) {
            $unloadedLinks->append($candidate);
        }
    }
    echo "<br/> --compare with " . $origin . " --mixed unloaded " . $unloadedLinks->count() . "<br>";
}
/**
 * Tell whether $source points at a file, i.e. is neither a bare host URL
 * (isHost) nor a directory URL (isDirectory).
 *
 * @param string $source URL to classify.
 * @return bool True when $source is neither a host nor a directory.
 */
function isHtmlFile($source)
{
    if (isHost($source)) {
        return false;
    }
    return ! isDirectory($source);
}
/**
 * Tell whether a URL or path looks like a directory rather than a file.
 *
 * It is a directory when the last "/" is the first or the last character,
 * or when the last path segment has no "." (no file extension). All offsets
 * are byte offsets, so the result is also correct for URLs containing
 * multi-byte characters (the previous code compared a byte offset with a
 * character count).
 *
 * @param string $source URL or path to classify.
 * @return bool True for a directory-like value, false for a file-like one.
 */
function isDirectory($source)
{
    $slash = strrpos($source, '/');
    if ($slash === 0) {
        return true;
    }
    if ($slash === false) {
        // No slash at all: treat the whole string as the last segment.
        $slash = - 1;
    }
    if ($slash === strlen($source) - 1) {
        return true;
    }
    $dot = strrpos($source, '.');
    if ($dot === false || $dot === 0) {
        return true;
    }
    // A dot after the last slash means the last segment has an extension.
    return $dot < $slash;
}
/**
 * Tell whether $source is a relative URL, i.e. anything that does not
 * start with "http://" or "https://".
 *
 * @param string $source URL or path to classify.
 * @return bool False for absolute http/https URLs, true otherwise.
 */
function isRelative($source)
{
    foreach (array("http://", "https://") as $prefix) {
        if (strpos($source, $prefix) === 0) {
            return false;
        }
    }
    return true;
}
/**
 * Turn $source into an absolute URL, using the page URL $current as base.
 *
 * - absolute http/https URLs are returned unchanged;
 * - "./path" and "path" are appended to the directory of $current;
 * - "/path" is appended to the host of $current;
 * - "../path" has its "../" parts removed and is appended to the host of
 *   $current (the same simplification urlpadding() uses).
 *
 * Strings are joined with "."; the previous code used the arithmetic "+"
 * operator, which does not concatenate in PHP.
 *
 * @param string $source  URL or path found in a page.
 * @param string $current URL of the page that contained $source.
 * @return string Absolute URL.
 */
function getAbsolutePath($source, $current)
{
    if (! isRelative($source)) {
        return $source;
    }
    if (strpos($source, "./") === 0) {
        return getCurrentDirectory($current) . "/" . substr($source, 2);
    }
    if (strpos($source, "../") === 0) {
        return getHost($current) . "/" . preg_replace('/\.\.\//', '', $source);
    }
    if (strpos($source, "/") === 0) {
        // Root-relative paths belong to the host, not the current directory.
        return getHost($current) . $source;
    }
    return getCurrentDirectory($current) . "/" . $source;
}
/**
 * Tell whether $source is a bare host URL such as "https://www.abc.com":
 * an http/https URL with a non-empty host and no "/" after it.
 *
 * strpos() reports "not found" as false, never as a negative number, so the
 * check must compare with false (the previous "< 0" test could never
 * succeed). The https branch now strips "https://" rather than "http://".
 *
 * @param string $source URL to classify.
 * @return bool True for a bare host URL, false otherwise.
 */
function isHost($source)
{
    if (strpos($source, "http://") === 0) {
        $rest = (string) substr($source, 7);
    } elseif (strpos($source, "https://") === 0) {
        $rest = (string) substr($source, 8);
    } else {
        return false;
    }
    return $rest !== '' && strpos($rest, "/") === false;
}
/**
 * Remove every <script>...</script> element from the HTML text $source.
 *
 * The text length before and after is echoed as progress output.
 *
 * @param string $source HTML text.
 * @return string $source without script elements; the input is returned
 *                unchanged if the regular expression fails.
 */
function escapeJs($source)
{
    echo 'Before escape js::' . strlen($source) . '<br/>';
    // Case-insensitive, "." matches newlines, lazy so each element is
    // removed separately. preg_replace() replaces all matches in one call
    // (the previous code passed preg_match() its arguments in the wrong
    // order and never matched).
    $stripped = preg_replace('/<script\b[^>]*>.*?<\/script\s*>/is', '', $source);
    if ($stripped !== null) {
        $source = $stripped;
    }
    echo 'After escape js::' . strlen($source) . '<br/>';
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
    return $source;
}
/**
 * Reduce HTML to plain text: script elements are removed with escapeJs(),
 * then everything that looks like a tag is deleted - first any "<...>",
 * then any "</...>".
 *
 * @param string $source HTML text.
 * @return string Text with tags removed.
 */
function getHtmlText($source)
{
    // Script content has to go first so its code is not left behind as text.
    $source = escapeJs($source);
    $tagPatterns = array(
        '/<[^>]+>/',
        '/<[\/]{1}[^>]+>/'
    );
    foreach ($tagPatterns as $tagPattern) {
        while (true) {
            preg_match($tagPattern, $source, $hit);
            if (count($hit) <= 0) {
                break;
            }
            $source = preg_replace($tagPattern, '', $source);
        }
    }
    return $source;
}
/**
 * Main worker: process one page.
 *
 * Fetches $source, queues "next page" links via compare(), echoes the page
 * title, and downloads every image referenced by an <img> tag into
 * ./tmp/<Y_m_d>/ through wget().
 *
 * @param string $source URL of the page to process.
 * @return string ':=ok' on success, ':=error' when an exception was caught.
 */
function loadResources($source)
{
    global $titlePattern, $imgPattern, $imgSrcPattern;
    global $loadChk, $loadedResources;
    try {
        $https = (strpos($source, "https://") === 0) ? 1 : 0;
        // $source comes from the request or from scraped pages: escape it
        // before writing it into the HTML output.
        echo "<br>-->content URL::" . htmlspecialchars($source, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $html = getUrlContent($source, $https);

        // Queue follow-up pages.
        $links = getLinks($html, getCurrentDirectory($source));
        compare($source, $links);

        // Page title; empty when the page has no <title> element.
        $title = '';
        if (preg_match($titlePattern, $html, $match)) {
            $title = strip_tags($match[0]);
        }
        echo '<title>' . htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . '</title>';

        // Download directory, created with its parents on first use.
        $tmpdir = "./tmp/" . date('Y_m_d');
        if (! is_dir($tmpdir) && ! mkdir($tmpdir, 0777, true) && ! is_dir($tmpdir)) {
            throw new Exception('Cannot create download directory ' . $tmpdir);
        }

        preg_match_all($imgPattern, $html, $match);
        foreach ($match[0] as $imgtag) {
            preg_match_all($imgSrcPattern, $imgtag, $srcMatch);
            foreach ($srcMatch[0] as $img) {
                // Protocol-relative URL.
                if (strpos($img, '//') === 0) {
                    $img = 'http:' . $img;
                }
                $doc = new Document();
                $doc->setSource($img);
                $doc->setTitle($title);
                $doc->setContent('unknown');
                $doc->setAuthor($source);
                $imgHttps = (strpos($doc->getSource(), "https://") === 0) ? 1 : 0;

                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();

                // NOTE(review): $loadChk is not defined in the code visible
                // here - confirm where it is created. The check is keyed on
                // the page URL ($source), not on the image URL, so it
                // presumably lets only the first image of each page through;
                // verify whether $doc->getSource() was intended.
                if ($loadChk->checkIfResourceLoaded($source, $loadedResources)) {
                    echo ' loaded';
                } else {
                    // "[] =" also works while $loadedResources is still unset.
                    $loadedResources[] = $source;
                    wget($doc->getSource(), $source, $tmpdir . "/tmp_", $imgHttps);
                }
            }
        }
        return ':=ok';
    } catch (Exception $e) {
        echo $e->getMessage();
        return ':=error';
    }
} // main function end:: loadResources()
// MAIN entry of the POST branch.
// Major data sets: links still to crawl and links already handed out.
$unloadedLinks = new ArrayObject();
$loadedLinks = new ArrayObject();
// Bookkeeping lists used by the $loadChk checks; created here unless an
// included file has already defined them.
if (! isset($loadedResources)) {
    $loadedResources = array();
}
if (! isset($loadedSources)) {
    $loadedSources = array();
}
$host = '';
$source = "https://www.***.com/gq/m*/hy1693.html";
if (isset($_POST["source"])) {
    // SECURITY: this URL is user-controlled and is fetched by the server.
    $source = trim((string) $_POST["source"]);
}
// Crawling many pages takes a while.
set_time_limit(500);
$host = getHost($source);
$origin = $source;
// Load the resources of the first page; this also fills $unloadedLinks.
loadResources($source);
// Work through the queued follow-up pages. arrayPop() returns NULL once
// every queued link has been handed out.
// NOTE(review): $loadChk is not defined in the code visible here - confirm
// where it is created.
$unloadedlink = arrayPop();
while ($unloadedlink !== NULL) {
    if ($loadChk->checkIfSourceLoaded($unloadedlink, $loadedSources)) {
        echo 'Source' . htmlspecialchars($unloadedlink, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . ' loaded';
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    } else {
        // array_push() takes the array first; the arguments were swapped.
        array_push($loadedSources, $unloadedlink);
        loadResources($unloadedlink);
    }
    // Be polite to the remote server.
    sleep(1);
    $unloadedlink = arrayPop();
}
} // POST method
elseif ($_SERVER["REQUEST_METHOD"] == "GET") {
// GET request: print the start page, a centred form that posts the URL to
// crawl back to spi.php as the "source" field.
// NOTE(review): the URL field is declared as type="password", so the typed
// URL is masked - confirm that this is intended.
echo "<!DOCTYPE html>
<html><head><title></title>
<style type='text/css'>
html, body, div{
padding:0;
margin:0 auto;
overflow:hidden;
}
.post-container{
display:table;
position:absolute;
width:480px;
height:240px;
top:50%;
left:50%;
margin:-136px 0 0 -240px;
background-color:#fefefe;
}
.post-form{
display:table-cell;
vertical-align:middle;
text-align:center;
}
.lbl-title{
margin-bottom:1em;
display:block;
font-size:2em;
}
.input-source{
height:2em;
line-height:2em;
width:16em;
margin-left:5px;
margin-right:5px;
display:inline;
font-size:1em;
border:1px solid gray;
border-radius:4px;
}
.input-btn{
width:6em;
height:2.2em;
line-height:2.2em;
display:inline;
font-weight:bold;
font-size:1em;
}
</style>
</head>
<body>
<div class='post-container'>
<form class='post-form' method=\"post\" action=\"spi.php\">
<label class='lbl-title'>SPI Search</label>
<input class='input-source' type=\"password\" name=\"source\">
<input class='input-btn' type=\"submit\" value=\"OK\">
</form></div>
</body></html>";
}
#SPI Version 1.0
#Author: mrn6 from csdn.net--https://me.csdn.net/qq_21264377
?>
document.php
<?php
/**
 * Plain data holder describing one collected resource (for the crawler: one
 * image), with a getter/setter pair per field.
 */
class Document
{
    /** Title of the page the resource was found on. */
    public $title;
    /** Free-text content or description. */
    public $content;
    /** Creation time. */
    public $created;
    /** Author; the crawler stores the URL of the containing page here. */
    public $author;
    /** Last editor. */
    public $editor;
    /** URL of the resource. */
    public $source;
    /** Time of the last update. */
    public $updated;
    /** Free-text comment. */
    public $comment;
    /** Document type. */
    public $doctype;

    public function __construct()
    {}

    public function __destruct()
    {
        $this->title = null;
        $this->source = null;
        $this->content = null;
    }

    public function setTitle($title)
    {
        $this->title = $title;
    }

    public function getTitle()
    {
        return $this->title;
    }

    public function setSource($source)
    {
        $this->source = $source;
    }

    public function getSource()
    {
        return $this->source;
    }

    public function setContent($content)
    {
        $this->content = $content;
    }

    public function getContent()
    {
        return $this->content;
    }

    public function setAuthor($author)
    {
        $this->author = $author;
    }

    public function getAuthor()
    {
        return $this->author;
    }

    public function setEditor($editor)
    {
        $this->editor = $editor;
    }

    public function getEditor()
    {
        return $this->editor;
    }

    /** Added for symmetry: "created" previously had a getter but no setter. */
    public function setCreated($created)
    {
        $this->created = $created;
    }

    public function getCreated()
    {
        return $this->created;
    }

    public function setUpdated($updated)
    {
        // Was "= updated" (a bare constant name), which never stored the
        // argument.
        $this->updated = $updated;
    }

    public function getUpdated()
    {
        return $this->updated;
    }

    public function setComment($comment)
    {
        $this->comment = $comment;
    }

    public function getComment()
    {
        return $this->comment;
    }

    public function setDoctype($type)
    {
        $this->doctype = $type;
    }

    public function getDoctype()
    {
        return $this->doctype;
    }
}
?>
這是一個從某個網址自動蒐集連續網頁的採集圖片集的代碼。本來有兩種匹配連續網頁的方式:1)判斷網址相似率;2)判斷是否使用相同字符串前綴的網址。第一種方式誤差較大,在某些情況下,某些高相似率的網址只是同一欄目下的,而不是同一網頁集合的子元素。故此暫時摒棄此法。第二種當前只能識別http(s)://***.com/pic/2323.html--http(s)://***.com/pic/2323_2.html這類規則的網頁集。對於此類規則識別,在我的另一篇文章裏略有介紹--https://blog.csdn.net/qq_21264377/article/details/104934580。在這裏不做討論。因爲過於具體的規則識別,深入討論的意義不大,除非用在大規模採集較爲穩定資源的情況下考慮。相當於修建大樓的"添磚加瓦"。
值得注意的是,許多網站爲了方便部署維護,網頁中的本地地址都採用相對路徑或“簡化絕對路徑”如 “/pic/2342.html”。在這種情況下,需要對採集的地址進行填充。對於相對路徑,需要從獲取源地址--輸入地址的當前目錄然後拼接成完整的網址。“簡化絕對路徑”的情況,需從源地址獲取域名網址進行拼接。使用表達式expression描述爲:
1) Assume: $sourceUrl="https://www.abc.com/pic/202003/1235.html", $currentUrl="1235_2.html"
Target: $targetUrl="https://www.abc.com/pic/202003/1235_2.html"
Process: getCurrentDirectory()->$currentDirectoryUrl="https://www.abc.com/pic/202003",
$targetUrl=$currentDirectoryUrl+"/"+$currentUrl
2) Assume: $sourceUrl="https://www.abc.com/pic/202003/1235.html", $currentUrl="/pic/202003/1235_2.html"
Target: $targetUrl="https://www.abc.com/pic/202003/1235_2.html"
Process: getHost()->$host="https://www.abc.com",
$targetUrl=$host+$currentUrl.
因爲URL地址都是以"/"爲分隔符的,目錄層級比較格式化,是按協議規則來定的,與平臺無關,所以劃分比較簡單。從這個亦可看出標準協議的重要性:透明的規則意味着暢行。這也是互聯網盛行的基本原因之一。
定義文件名也是一個令人頭疼的事,特別是對不太瞭解、很少深入接觸Web協議的人來說。一種是直接從URL地址獲取文件名+後綴,一種是從Response響應頭Header信息中提取文件名。這兩種各有優缺點。網址指向的文件名在哪裏定義,怎麼定義,是由網站作者定義的。網站作者有可能按照Web協議來辦事,也有可能完全按照自己的意願來處理。基於這樣的可能性,我們可以“點到即止”的方式來應對:獲取文件名成功即可,不成功或獲取的文件名無效或不符合“我”的意願,才考慮第二種方案。事實上,很多瀏覽器的下載功能似乎很少遇到類似問題。這是值得思考的一個問題。
這裏有個問題,輸入網址需是第一頁的地址。如果是網頁集中之後隨意一頁,利用上面描述的“前綴法”對其展開判斷,很容易失去抽象的共性從而迷失深陷於具象的規則裏。若使算法“貪婪”而又簡潔,則應儘可能的抽象出某類事物的共性,進而提煉其算法的表達式。
注:以上PHP源代碼僅供參考交流。