環境要求:
PHP 7 及以上
PhantomJS 引擎(下載地址:https://phantomjs.org/download.html)
QueryList 4.0
核心思想:
根據目標網頁的佈局,編寫相應的採集規則;規則中的選擇器語法與 jQuery 選擇器一致,參考 jQuery 即可。
抓取示例:
京東商城產品列表
抓取規則:
// JD.com (Jingdong Mall) scraping rule configuration.
//
// Each top-level key names one scraping task. Presumably consumed by a
// QueryList-based loader (not shown here) — confirm the exact semantics there:
//   url   - address of the page for this task
//   range - selector string scoping the per-item rules
//   rules - field name => [selector, attribute name]
//   desc  - human-readable label for the task
$rules = [
    'product_list' => [
        'url'   => 'https://list.jd.com/list.html?cat=670,671,672',
        'range' => '.gl-item .j-sku-item',
        'rules' => [
            'link'      => ['.p-img a', 'href'],
            'image'     => ['.p-img a img', 'src'],
            // NOTE(review): looks like the deferred-load image URL for thumbnails
            // whose `src` is not populated yet — confirm against the live page.
            'lazyImage' => ['.p-img a img', 'data-lazy-img'],
            'name'      => ['.p-name a em', 'text'],
            'price'     => ['.p-price .J_price:eq(0) i', 'text'],
        ],
        'desc'  => '產品列表',
    ],
];

return $rules;
抓取結果:
一點資訊
抓取規則:
// Yidian Zixun (yidianzixun.com) scraping rule configuration.
//
// Two tasks are defined, 'video' and 'amuse'. They share the same range and
// field rules and differ only in channel URL and description.
$rules = [
    'video' => [
        'url'   => 'http://www.yidianzixun.com/channel/u13746',
        'range' => '.style-content-middle',
        'rules' => [
            // Empty selector: presumably reads `href` from the range element
            // itself — confirm with the code that consumes this config.
            'link'     => ['', 'href'],
            'image'    => ['.doc-image-small-wrapper .doc-image-box img', 'src'],
            'duration' => ['.doc-image-small-wrapper .doc-image-box .video-time', 'text'],
            'title'    => ['.doc-content .doc-content-inline .doc-title', 'text'],
        ],
        'desc'  => '視頻列表',
    ],
    'amuse' => [
        'url'   => 'http://www.yidianzixun.com/channel/s10671',
        'range' => '.style-content-middle',
        'rules' => [
            // Same empty-selector convention as the 'video' task above.
            'link'     => ['', 'href'],
            'image'    => ['.doc-image-small-wrapper .doc-image-box img', 'src'],
            'duration' => ['.doc-image-small-wrapper .doc-image-box .video-time', 'text'],
            'title'    => ['.doc-content .doc-content-inline .doc-title', 'text'],
        ],
        'desc'  => '搞笑列表',
    ],
];

return $rules;
抓取結果:
圖片抓取
抓取規則:
// Image-site scraping rule configuration.
//
// Layout: site host => page key => list of tasks. Each task carries its own
// url / range / rules / desc entries.
$rules = [
    // nipic.com
    'www.nipic.com' => [
        'index' => [
            // Task 1: the "hot topics" section on the home page.
            [
                'url'   => 'http://www.nipic.com/',
                // Empty range: presumably the rules are applied to the whole
                // document rather than to a repeated item — confirm with the loader.
                'range' => '',
                'rules' => [
                    'link'  => ['.newIndex-hotpic', 'href'],
                    'image' => ['.newIndex-hotpic img', 'src'],
                    'title' => ['.newIndex-hotpic .newIndex-textItem', 'text'],
                ],
                'desc'  => '首頁熱門專題',
            ],
            // Task 2: the "featured picks" section on the same page.
            [
                'url'   => 'http://www.nipic.com/',
                'range' => '',
                'rules' => [
                    'link'  => ['.right-choicePic', 'href'],
                    'image' => ['.right-choicePic img', 'src'],
                    'title' => ['.right-choicePic .newIndex-textItem', 'text'],
                ],
                'desc'  => '首頁精選推薦',
            ],
        ],
    ],
];

return $rules;
抓取結果: