Nodejs爬蟲實戰(五)

1. 抓取標籤內容

  1. 引入模塊

    引入新模塊 jsdom,並取出其中的 JSDOM:const JSDOM = require('jsdom').JSDOM;

  2. 創建對象

     let DOM = new JSDOM(html);
     let document = DOM.window.document;
    
  3. dom操作

     document.querySelector('.tm-count').innerHTML
    
    完整代碼
     // Number of GetUrl calls made so far; used to label the redirect log lines.
     var index = 0;
     const fs = require('fs');
     const url = require('url');
     const gbk = require('gbk');
     const JSDOM = require('jsdom').JSDOM;
     
     // Fetch the product page, parse it with jsdom and print the inner HTML of
     // the first element matching `.tm-count`.
     GetUrl('https://detail.tmall.com/item.htm?id=548466958386&ali_refid=a3_430583_1006:1103419234:N:%E5%8D%8E%E4%B8%BA:bb84ee4c8f67c7b202d725187b7ad429&ali_trackid=1_bb84ee4c8f67c7b202d725187b7ad429&spm=a230r.1.14.1&sku_properties=5919063:6536025;12304035:116177',(data)=>{

     	// Decode the raw response bytes through the gbk module
     	// (presumably the page is GBK-encoded — confirm against the response headers).
     	const html = gbk.toString('utf-8',data);

     	const DOM = new JSDOM(html);
     	const document = DOM.window.document;

     	// querySelector returns null when nothing matches; guard so a missing
     	// element is reported instead of throwing a TypeError in the callback.
     	const countNode = document.querySelector('.tm-count');
     	if(countNode === null){
     		console.log('.tm-count not found in the fetched page');
     		return;
     	}

     	console.log(countNode.innerHTML);
     });
     /**
      * Request a URL with Node's http/https module, follow redirects, and pass
      * the complete raw response body to a callback.
      *
      * Every call increments the module-level `index` counter, which is used
      * to number the redirect log lines.
      *
      * @param {string} sUrl - Absolute URL to request. A protocol of `http:`
      *   uses the http module; anything else uses https.
      * @param {function(Buffer): void} [success] - Called with the concatenated
      *   body once a 200 response has ended.
      * @param {number} [maxRedirects=10] - How many more redirects may be
      *   followed before giving up (prevents endless redirect loops).
      */
     function GetUrl(sUrl,success,maxRedirects = 10){
     	index++;
     	const urlObj = url.parse(sUrl);
     	const http = urlObj.protocol === 'http:' ? require('http') : require('https');
     	const redirectCodes = [301, 302, 303, 307, 308];

     	const req = http.request({
     		'hostname':urlObj.hostname,
     		// Keep an explicit port from the URL; null falls back to the protocol default.
     		'port':urlObj.port,
     		'path':urlObj.path
     	},res=>{
     		if(res.statusCode === 200){
     			const chunks = [];
     			res.on('data',buffer=>{
     				chunks.push(buffer);
     			});
     			res.on('end',()=>{
     				success && success(Buffer.concat(chunks));
     			});
     			return;
     		}

     		// The body of any non-200 response is not needed; drain it so the
     		// socket is released.
     		res.resume();

     		if(redirectCodes.includes(res.statusCode)){
     			const location = res.headers.location;
     			if(!location){
     				console.error(`redirect (${res.statusCode}) without Location header: ${sUrl}`);
     				return;
     			}
     			if(maxRedirects <= 0){
     				console.error(`too many redirects, giving up at: ${sUrl}`);
     				return;
     			}
     			// Location may be relative; resolve it against the current URL.
     			const target = url.resolve(sUrl,location);
     			console.log(`第${index}次重定向`,target);
     			GetUrl(target,success,maxRedirects - 1);
     			return;
     		}

     		console.error(`unexpected status ${res.statusCode} for: ${sUrl}`);
     	});

     	// Attach the handler before sending, and report the real failure
     	// (DNS, refused connection, TLS, ...) rather than a hard-coded '404'.
     	req.on('error',err=>{
     		console.error(`request failed for ${sUrl}: ${err.message}`);
     	});
     	req.end();
     }
    
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章