querylist 下载 composer require jaeger/querylist:V3.2.1
<?php
include "vendor/autoload.php";
use \QL\QueryList;
// Report fatal run-time errors only (1 is the value of E_ERROR); warnings,
// notices and deprecations raised anywhere in this script are not shown.
error_reporting(1);
/**
 * Download a file and store it on disk.
 *
 * @param string $file_url URL (or any location readable by file_get_contents()) to fetch.
 * @param string $save_to  Destination path the fetched bytes are written to.
 * @return int|false Number of bytes written, or false when the source could not
 *                   be read or the destination could not be written.
 */
function dlfile($file_url, $save_to){
    $content = file_get_contents($file_url);

    // A failed read yields false; passing that on to file_put_contents() would
    // create an empty file, so stop before touching the destination.
    if ($content === false) {
        return false;
    }

    return file_put_contents($save_to, $content);
}
// Page that is passed to QueryList::Query() below as the document to scrape.
$url = "https://www.php.cn/course/type/2/level_3.html?p=1";
/**
 * Turn an href value into an absolute https://www.php.cn/ URL.
 *
 * Used as the callback of the 'href' extraction rule.
 *
 * @param string $a Href as found in the page: relative, root-relative,
 *                  protocol-relative or already absolute.
 * @return string Absolute URL.
 */
function check_url( $a){
    $a = trim((string) $a);

    // Already absolute: prefixing the host again would produce a broken URL.
    if (preg_match('#^https?://#i', $a)) {
        return $a;
    }

    // Protocol-relative ("//host/path"): only the scheme is missing.
    if (strncmp($a, '//', 2) === 0) {
        return 'https:'.$a;
    }

    // Drop leading slashes so root-relative hrefs do not yield "php.cn//...".
    return 'https://www.php.cn/'.ltrim($a, '/');
}
// Extraction rules handed to QueryList: field name => array(selector, attribute
// [, tag filter, callback]). The 'href' rule runs each value through check_url().
$reg = array(
    "img" => array('.layui-col-md12 ul li a img','data-original'),
    "title" => array('.layui-col-md12 ul li a h3','text'),
    'href' =>array('.course-list-col ul li a','href','','check_url')
);
$data = QueryList::Query($url,$reg)->data;

// A failed scrape must not break the loop or the summary below.
if (!is_array($data)) {
    $data = array();
}

// Directory the images are saved into (single-quoted so no backslash can ever
// be read as an escape sequence; the value is unchanged).
$save_dir = 'E:\WWW\bin\PHPTutorial\WWW\querylist\imgs\\';

$success = 0;
$fail = 0;
foreach ($data as $v) {
    $img_url = isset($v['img']) ? $v['img'] : '';

    // Name the local file after the last segment of the URL *path*, so query
    // strings or a missing extension cannot leak into the file name.
    $img_path = parse_url($img_url, PHP_URL_PATH);
    $file_name = is_string($img_path) ? basename($img_path) : '';

    // Rows without a usable image URL count as failures instead of being fetched.
    $result = ($file_name !== '') ? dlfile($img_url, $save_dir.$file_name) : false;

    if ($result) {
        ++$success;
        echo $file_name."下载成功<br/>";
    } else {
        ++$fail;
        echo $file_name."下载失败<br/>";
    }
}
echo "失败".$fail."个"."<br/>";
echo "成功".$success."个"."<br/>";
// Parentheses are required: without them "." and "+" are evaluated left to
// right on PHP < 8, which drops the label and prints the wrong number.
echo "下载总数:".($success + $fail)."<br/>";
echo '<pre>';
print_r($data);
// Stop here; nothing after this statement is executed.
exit;
// NOTE(review): this call sits after an unconditional `exit;`, so it does not
// run as the file stands.
// Multi-page collection through QueryList's 'Multi' plugin.
QueryList::run('Multi', [
    // URLs to collect; append more as needed.
    'list' => [
        'http://cms.querylist.cc/news/it/547.html',
        'http://cms.querylist.cc/news/it/545.html',
        'http://cms.querylist.cc/news/it/543.html',
    ],
    'curl' => [
        // cURL options; adjust to your own needs.
        'opt' => [
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_AUTOREFERER => true,
        ],
        // Number of threads.
        'maxThread' => 100,
        // Maximum number of attempts.
        'maxTry' => 3,
    ],
    // Callback that receives a fetched page; its 'content' entry is parsed
    // with the rules below.
    'success' => function ($page) {
        $rules = [
            // Article title.
            'title' => ['h1', 'text'],
            // Article body; the filter strips hyperlinks while keeping their
            // text and removes the copyright block and script tags.
            'content' => ['.post_content', 'html', 'a -.content_copyright -script'],
        ];

        // Restrict extraction to the '.content' range.
        $parsed = QueryList::Query($page['content'], $rules, '.content');

        // Print the result; a real job would store it in a database here.
        print_r($parsed->getData());
    },
]);
?>