【Node.js】爬蟲--抓取新聞標題、圖片、文字描述,支持QQ、iFeng

文章目錄
  1. app.js
  2. img-spider.js
  3. img.gallery.js
  4. ifengImgs.js
  5. ifengPictures.js
  6. qqImgs.js
  7. imgs.html

先上效果圖:

圖片上半部分爲待解析的網頁新聞鏈接,支持一次輸入多個。
圖片下半部分爲解析進度的日誌打印。

qq.img.spider02

點擊’Commit’之後,對比效果圖如下。左邊爲騰訊新聞原網頁,右邊爲抓取後的整合效果。

qq.img.spider

工程結構:

文件名 | 描述
app.js | 程序啓動
img-spider.js | 爬蟲爬取管理
ifengImgs.js | 爬取iFeng下game/fashion的實現
ifengPictures.js | 爬取iFeng下game高清圖的實現
qqImgs.js | 爬取騰訊新聞圖的實現
img.gallery.js | 爬取圖片的彙總
imgs.html | 提交爬取鏈接的html界面

應用到的知識點:

  • express:搭建Web服務
  • cheerio:類似jQuery的快速解析網頁工具
  • iconv-lite:解決中文亂碼問題
  • 正則表達式:網址匹配、內容匹配/過濾
  • Charles:抓包工具

更多細節看源碼吧….

GitHub源碼鏈接:Sodino#ImgSpider


app.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
var fs = require('fs');
var express = require('express');
var img_spider = require('./img-spider.js');
var app = express();
/**
 * GET /imgs.html - serve the URL submission form from disk.
 *
 * Response headers are only written once the file has actually been opened,
 * so an unreadable/missing imgs.html produces a 500 response instead of an
 * unhandled stream 'error' event (which would terminate the process).
 * The charset is declared in the Content-Type header rather than by writing
 * an extra <head> fragment in front of the streamed document.
 */
app.get('/imgs.html', (req, resp) => {
  var file = fs.createReadStream('./imgs.html');

  file.on('open', () => {
    resp.writeHead(200, {'Content-Type' : 'text/html; charset=utf-8'});
    file.pipe(resp);
  });

  file.on('error', (err) => {
    console.log('error');
    console.log(err);
    if (resp.headersSent) {
      // Part of the page is already on the wire; all we can do is stop.
      resp.end();
      return;
    }
    resp.writeHead(500, {'Content-Type' : 'text/plain; charset=utf-8'});
    resp.end('Failed to read imgs.html');
  });
});
/**
 * Escape a value for use as HTML text or inside a double-quoted attribute.
 * Everything written into the result page comes from crawled pages or from
 * the submitted form, so none of it may be emitted verbatim.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Pull the `txtUrls` field out of an application/x-www-form-urlencoded body
 * and split it into non-empty, whitespace-separated URLs.
 */
function parseSubmittedUrls(body) {
  var raw = '';
  body.split('&').forEach((pair) => {
    var idx = pair.indexOf('=');
    var key = idx === -1 ? pair : pair.slice(0, idx);
    if (key === 'txtUrls') {
      raw = idx === -1 ? '' : pair.slice(idx + 1);
    }
  });
  // In urlencoded bodies a space is sent as '+'.
  var text = raw.replace(/\+/g, ' ');
  try {
    text = decodeURIComponent(text);
  } catch (e) {
    // Malformed percent-escape: keep the undecoded text; the spider will
    // report it as an unsupported url.
  }
  return text.split(/\s+/).filter((url) => url.length > 0);
}

/**
 * POST /imgs.html - crawl the submitted URLs and render every gallery
 * (title, then index / description / big image for each picture).
 *
 * The request body is accumulated until 'end' because it may arrive in
 * several 'data' chunks. The response is written at most once, even if the
 * spider invokes its callback more than once.
 */
app.post('/imgs.html', (req, resp) => {
  var chunks = [];
  req.on('data', (chunk) => {
    chunks.push(chunk);
  });
  req.on('end', () => {
    var responded = false;

    // Writes the page skeleton around `writeBody`; later calls are ignored.
    var sendPage = (writeBody) => {
      if (responded) {
        return;
      }
      responded = true;
      resp.writeHead(200, {'Content-Type' : 'text/html'});
      resp.write('<head><meta charset="utf-8"/></head>');
      resp.write('<body>');
      writeBody();
      resp.write('</body>');
      resp.end();
    };
    var sendError = (err) => {
      sendPage(() => {
        resp.write(escapeHtml(String(err)));
      });
    };

    var arrUrl = parseSubmittedUrls(Buffer.concat(chunks).toString());
    try {
      var imgSpider = new img_spider();
      imgSpider.spider(arrUrl, (err, arrImgGallery) => {
        if (err) {
          sendError(err);
          return;
        }
        sendPage(() => {
          arrImgGallery.forEach((gallery) => {
            resp.write('<p>============================================</p>');
            resp.write('<p>' + escapeHtml(gallery.title) + '</p>');
            gallery.arrImgs.forEach((img, idx) => {
              resp.write('<p>idx=' + idx + "</p>");
              resp.write('<p>' + escapeHtml(img.desc) + '</p>');
              resp.write('<p><img id="bigPic" src="' + escapeHtml(img.imgBig) + '" style="opacity: 1;"></img></p>');
              resp.write('<p>------------------------</p>');
            });
          });
        });
      });
    } catch (err) {
      // spider() throws synchronously for an empty url list.
      sendError(err);
    }
  });
});
// Start the HTTP server on port 1024. The banner is printed from the listen
// callback so that it only appears once the port has actually been bound.
app.listen(1024, () => {
  console.log('server running on http://localhost:1024/imgs.html');
});

img-spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
var ifengImgs = require('./ifengImgs.js');
var ifengPictures = require('./ifengPictures.js');
var qqImgs = require('./qqImgs.js');
/**
 * Crawl coordinator. Holds the URLs that have been dispatched, the
 * galleries collected so far and the completion callback.
 */
function ImgSpider() {
  Object.assign(this, {
    arrUrls: [],
    arrImgGallery: [],
    callback: null
  });
}
/**
 * Crawl every URL in `arrUrl` with the spider whose pattern matches it.
 *
 * Entries are trimmed and blank ones ignored. The whole list is validated
 * before any request is started, so an unsupported URL is reported through
 * `callback` exactly once and no crawl is launched for that call. If the
 * list holds only blank entries the callback receives an error instead of
 * never being invoked.
 *
 * @param {string[]} arrUrl - URLs to crawl.
 * @param {function(Error, Array)} callback - receives (err, null) or
 *        (null, galleries).
 * @throws {Error} if `arrUrl` is not an array or is empty.
 */
ImgSpider.prototype.spider = function(arrUrl, callback){
  if (!Array.isArray(arrUrl)) {
    throw new Error("arrUrl isn't a array!");
  }
  if (arrUrl.length == 0) {
    throw new Error("arrUrl is empty.");
  }
  this.callback = callback;

  var report = (err) => {
    if (typeof callback === 'function') {
      callback(err, null);
    }
  };

  // The more specific picture-gallery pattern is tried first so that such
  // URLs can never be claimed by the more general article pattern.
  var handlers = [ifengPictures, ifengImgs, qqImgs];
  var jobs = [];
  var unsupported = null;

  arrUrl.forEach((element) => {
    var url = String(element).trim();
    if (url.length === 0) {
      return;
    }
    var handler = handlers.filter((candidate) => candidate.prototype.RegExp.test(url))[0];
    if (handler) {
      jobs.push({ url: url, handler: handler });
    } else if (unsupported === null) {
      unsupported = url;
    }
  });

  if (unsupported !== null) {
    report(new Error("Can't support this url:[" + unsupported + ']'));
    return;
  }
  if (jobs.length === 0) {
    report(new Error("arrUrl contains no url."));
    return;
  }

  jobs.forEach((job) => {
    runSpider(job.url, job.handler, this);
  });
};
/**
 * Forget everything from previous crawls: dispatched URLs, collected
 * galleries and the registered callback.
 */
ImgSpider.prototype.clean = function () {
  Object.assign(this, {
    arrUrls: [],
    arrImgGallery: [],
    callback: null
  });
};
/**
 * Run one site-specific spider for `url` and record its gallery on
 * `imgSpider`.
 *
 * When the number of collected galleries equals the number of dispatched
 * URLs, `imgSpider.callback(null, galleries)` is invoked. If the spider
 * fails, the error is logged and passed to `imgSpider.callback` once; the
 * callback is then cleared so no further result is delivered for this crawl.
 *
 * @param {string} url - page to crawl.
 * @param {Function} constructor - spider constructor whose instances expose
 *        `spider(url, cb)`.
 * @param {Object} imgSpider - owner providing `arrUrls`, `arrImgGallery`
 *        and `callback`.
 */
function runSpider(url, constructor, imgSpider) {
  imgSpider.arrUrls.push(url);
  // Declared locally: an undeclared assignment would create a shared global
  // (and is a ReferenceError in strict mode).
  var spider = new constructor();
  spider.spider(url, (err, imgGallery) => {
    if (err) {
      console.log('error');
      console.log(err);
      var onError = imgSpider.callback;
      imgSpider.callback = null;
      if (typeof onError === 'function') {
        onError(err, null);
      }
      return;
    }
    console.log('Done:', imgGallery.url, imgGallery.title);
    imgSpider.arrImgGallery.push(imgGallery);
    if (imgSpider.arrImgGallery.length == imgSpider.arrUrls.length) {
      if (typeof imgSpider.callback === 'function') {
        imgSpider.callback(null, imgSpider.arrImgGallery);
      }
    }
  });
}
module.exports = ImgSpider;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * One picture of a gallery: big/small image URLs, a description and the
 * position it was given when it was added.
 */
function Img(idx, imgBig, imgSmall, desc) {
  Object.assign(this, {
    imgBig: imgBig,
    imgSmall: imgSmall,
    desc: desc,
    index: idx
  });
}

/**
 * The pictures collected from one page. `title` starts empty and is filled
 * in by whoever builds the gallery.
 */
function ImgGallery(url) {
  this.url = url;
  this.title = '';
  this.arrImgs = [];
}

/**
 * Append a picture and return the new number of pictures.
 */
ImgGallery.prototype.push = function (idx, imgBig, imgSmall, desc) {
  return this.arrImgs.push(new Img(idx, imgBig, imgSmall, desc));
};
module.exports = ImgGallery;

ifengImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
var url = 'http://games.ifeng.com/a/20160504/41603363_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
/**
 * Spider for iFeng games/fashion article galleries. Instances are stateless.
 */
var SpiderIfengImgs = function() {
};
// Supported URLs look like:
// http://games.ifeng.com/a/20160504/41603363_0.shtml
// http://fashion.ifeng.com/a/20160519/40162307_0.shtml#p=1
// The two host names are grouped so the alternation applies to the host only
// (ungrouped, "http://games" on its own would match), and every literal dot
// is escaped. The pattern is intentionally not anchored at the end so a
// trailing "#p=1" fragment is accepted.
SpiderIfengImgs.prototype.RegExp = /http:\/\/(games|fashion)\.ifeng\.com\/a\/\d{8}\/\d+_\d+\.shtml/;
/**
 * Download an iFeng article page and build a gallery from the
 * `G_listdata` literal found in the page source.
 *
 * Network errors, a missing `G_listdata` block and unparsable data are all
 * reported through `callback` instead of being thrown from inside the
 * response handlers.
 *
 * @param {string} url - page to download.
 * @param {function(Error, Object)} callback - invoked once with
 *        (null, gallery) or (err, null).
 */
SpiderIfengImgs.prototype.spider = function(url, callback){
  var finished = false;
  // Single exit point: guarantees the callback runs at most once.
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(err, imgGallery);
    }
  };
  var fail = function(err) {
    finish(err, null);
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      var imgGallery;
      try {
        imgGallery = new img_gallery(url);
        var html = iconv.decode(Buffer.concat(arrBuf),'utf-8');
        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = 'var G_listdata= ';
        var strEnd = '</script>';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, idxStart + strStart.length);
        if (idxEnd < 0) {
          throw new Error("Can't find G_listdata in:[" + url + ']');
        }

        // The literal is JavaScript, not JSON: turn single quotes into double
        // quotes and quote the known keys so JSON.parse accepts it.
        // NOTE(review): the key replacements are global, so a value that
        // itself contains e.g. the word "title" would be altered and most
        // likely surface as a parse error - confirm against real pages.
        var jsListData = html.slice(idxStart + strStart.length, idxEnd)
          .replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/big_img/g, '\"big_img\"')
          .replace(/originalimg/g, '\"originalimg\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/img:/g, '\"img\":')
          .replace('];', ']');

        JSON.parse(jsListData).forEach(function(element, index){
          imgGallery.push(index, element.big_img, element.img, element.title);
        });
      } catch (err) {
        fail(err);
        return;
      }
      // Outside the try block so an exception thrown by the callback itself
      // is not mistaken for a parse failure.
      finish(null, imgGallery);
    });
  }).on("error", fail);
};
module.exports = SpiderIfengImgs;

ifengPictures.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
var url = 'http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
/**
 * Spider for iFeng games high-resolution picture galleries. Instances are
 * stateless.
 */
var SpiderIfengPictures = function(){
};
// Example: http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml
// Literal dots are escaped so they no longer match arbitrary characters.
SpiderIfengPictures.prototype.RegExp = /http:\/\/games\.ifeng\.com\/picture\/gaoqing\/detail_\d{4}_\d{2}\/\d{2}\/\d+_\d+\.shtml/;
/**
 * Download an iFeng picture-gallery page and build a gallery from the
 * `_listdata[n] = {...};` assignments found between `_listdata[0] = ` and
 * `new ifeng.Gallery` in the page source.
 *
 * Network errors, a missing data block and unparsable data are reported
 * through `callback` instead of being thrown from the response handlers.
 * The debug dump of the complete page HTML has been removed.
 *
 * @param {string} strUrl - page to download.
 * @param {function(Error, Object)} callback - invoked once with
 *        (null, gallery) or (err, null).
 */
SpiderIfengPictures.prototype.spider = function (strUrl, callback) {
  var finished = false;
  // Single exit point: guarantees the callback runs at most once.
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(err, imgGallery);
    }
  };
  var fail = function(err) {
    finish(err, null);
  };

  http.get(strUrl, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      var imgGallery;
      try {
        imgGallery = new img_gallery(strUrl);
        var html = iconv.decode(Buffer.concat(arrBuf),'utf-8');
        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = '_listdata[0] = ';
        var strEnd = 'new ifeng.Gallery';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, idxStart + strStart.length);
        if (idxEnd < 0) {
          throw new Error("Can't find _listdata in:[" + strUrl + ']');
        }

        // Turn the sequence of JavaScript object literals into one JSON
        // array: quote the known keys, replace the "};_listdata[n] = "
        // separators with commas and drop the final "};".
        // NOTE(review): the key replacements are global, so a value that
        // itself contains e.g. the word "title" would be altered and most
        // likely surface as a parse error - confirm against real pages.
        var jsListData = html.slice(idxStart + strStart.length, idxEnd)
          .replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/listimg/g, '\"listimg\"')
          .replace(/timg:/g, '\"timg\":')
          .replace(/img:/g, '\"img\":')
          .replace(/\};_listdata\[\d*\] = /g, '},')
          .replace('\};', '}');

        JSON.parse('[' + jsListData + ']').forEach(function(element, index){
          imgGallery.push(index, element.timg, element.img, element.title);
        });
      } catch (err) {
        fail(err);
        return;
      }
      // Outside the try block so an exception thrown by the callback itself
      // is not mistaken for a parse failure.
      finish(null, imgGallery);
    });
  }).on("error", fail);
};
module.exports = SpiderIfengPictures;

qqImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
var img_gallery = require('./img.gallery.js');
var http = require("http");
var iconv = require('iconv-lite');
var cheerio = require("cheerio");
var url = 'http://news.qq.com/a/20160512/009639.htm';
var url = 'http://news.qq.com/a/20160512/009639.hdBigPic.js';
/**
 * Spider for Tencent (news.qq.com) picture news. The page title and the
 * picture list are fetched by two separate requests; the instance keeps the
 * partial results until both are available.
 */
var SpiderQQImgs = function() {
  this.title = null;      // page title, set by spiderTitle
  this.imgGallery = null; // gallery, set by spiderImgGallery
  this.callback = null;   // completion callback, set by spider
};
// Example: http://news.qq.com/a/20160512/009639.htm
// Literal dots are escaped so they no longer match arbitrary characters; the
// pattern is not anchored at the end so a "#p=1" fragment is accepted.
SpiderQQImgs.prototype.RegExp = /http:\/\/news\.qq\.com\/a\/\d{8}\/\d+\.htm/;
/**
 * Deliver the gallery to the callback, but only once everything is ready:
 * a non-empty string title, a gallery object and a callable callback.
 * Called after each of the two requests finishes; does nothing until the
 * last prerequisite is in place.
 */
SpiderQQImgs.prototype.send2callback = function() {
  var titleReady = (typeof this.title == 'string')
    && this.title.constructor == String
    && this.title.length > 0;
  if (!titleReady) {
    return;
  }
  if (this.imgGallery == null) {
    return;
  }
  if (Object.prototype.toString.call(this.callback) !== '[object Function]') {
    return;
  }
  this.imgGallery.title = this.title;
  this.callback(null, this.imgGallery);
};
/**
 * Entry point: remember the callback, then request the page itself (for the
 * title) and the companion ".hdBigPic.js" resource (for the pictures), whose
 * address is derived by replacing the first ".htm" in `url`.
 */
SpiderQQImgs.prototype.spider = function (url, callback) {
  this.callback = callback;
  this.spiderTitle(url);
  this.spiderImgGallery(url.replace('.htm', '.hdBigPic.js'));
};
/**
 * Fetch the news page, decode it as GB2312 and store its <title> text on
 * the spider, then let send2callback decide whether the result is complete.
 *
 * Request, response and parse failures are passed to the registered
 * callback as (err, null); the callback is cleared first so that nothing
 * else is delivered for this crawl afterwards.
 *
 * @param {string} url - news page address.
 */
SpiderQQImgs.prototype.spiderTitle = function (url) {
  var spider = this;
  var fail = function (err) {
    var onError = spider.callback;
    spider.callback = null;
    if (typeof onError === 'function') {
      onError(err, null);
    }
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      try {
        var html = iconv.decode(Buffer.concat(arrBuf),'gb2312');
        var $ = cheerio.load(html);
        spider.title = $("title").text();
      } catch (err) {
        fail(err);
        return;
      }
      spider.send2callback();
    });
  }).on("error", fail);
};
/**
 * Fetch the ".hdBigPic.js" resource, convert it to JSON and build the
 * gallery from it, then let send2callback decide whether the result is
 * complete.
 *
 * Request, response, parse and structure failures are passed to the
 * registered callback as (err, null); the callback is cleared first so
 * that nothing else is delivered for this crawl afterwards.
 *
 * @param {string} url - address of the picture-list resource.
 */
SpiderQQImgs.prototype.spiderImgGallery = function (url) {
  var spider = this;
  var fail = function (err) {
    var onError = spider.callback;
    spider.callback = null;
    if (typeof onError === 'function') {
      onError(err, null);
    }
  };

  http.get(url, function(res){
    var arrBuf = [];
    res.on("data", function(chunk){
      arrBuf.push(chunk);
    })
    .on("error", fail)
    .on("end", function(){
      var imgGallery;
      try {
        imgGallery = new img_gallery(url);
        var strJson = iconv.decode(Buffer.concat(arrBuf),'gb2312') // GB2312 so Chinese text is not garbled
          .replace(/\/\*[\s\S]+?\*\//,'') // strip the first block comment
          .replace(/\'/g, '"'); // single -> double quotes so JSON.parse accepts it
        var objJson = JSON.parse(strJson);
        deleteEmptyProperty(objJson);

        var arr = objJson.Children[0].Children;
        // The first child is skipped (the original code read an image count
        // from it but never used it); the second one holds the pictures.
        arr.shift();
        var arrImgs = arr.shift().Children;
        arrImgs.forEach(function (element, index) {
          // Children[1] = small image, [2] = big image, [3] = caption.
          var fields = element.Children;
          var smallUrl = fields[1].Children[0].Content;
          var bigUrl = fields[2].Children[0].Content;
          var strText = fields[3].Children[0].Content;
          imgGallery.push(index, bigUrl, smallUrl, strText);
        });
      } catch (err) {
        fail(err);
        return;
      }
      spider.imgGallery = imgGallery;
      spider.send2callback();
    });
  }).on("error", fail);
};
/**
 * Recursively prune "empty" members from `object` in place: empty strings,
 * null, undefined, empty arrays and objects/arrays that have no enumerable
 * members left after pruning. Array slots are removed with `delete`, so the
 * array keeps its length and the remaining elements keep their indices.
 */
function deleteEmptyProperty(object){
  for (var key in object) {
    var current = object[key];

    if (typeof current !== 'object') {
      // Primitive (or function): only blank values are dropped.
      if (current === '' || current === null || current === undefined) {
        delete object[key];
      }
      continue;
    }

    if (Array.isArray(current) && current.length == 0) {
      delete object[key];
      continue;
    }

    // Prune the children first, then drop the container if nothing is left.
    deleteEmptyProperty(current);
    if (isEmpty(current)) {
      delete object[key];
    }
  }
}

/**
 * True when `object` has no enumerable property at all (own or inherited).
 * null and undefined count as empty.
 */
function isEmpty(object) {
  var found = false;
  for (var name in object) {
    found = true;
    break;
  }
  return !found;
}
module.exports = SpiderQQImgs;

imgs.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
<!DOCTYPE html><html lang="zh-CN">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Images spider</title>
</head>
<body>
<!-- Submits the whitespace-separated URLs in "txtUrls" to imgs.html via POST.
     The attribute was misspelled "actoin", so the form had no explicit target. -->
<form id="form1" action="imgs.html" method="POST">
Please input urls:<br/>
<textarea name="txtUrls" style="width:500px;height:120px;">http://news.qq.com/a/20160531/018019.htm#p=1
http://games.ifeng.com/a/20160530/41615842_0.shtml#p=1
</textarea><br/>
<br/>
<input type="submit" value="commit"/><br/>
<br/>
</form>
</body>
</html>

About Sodino

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章