一、代碼
var https = require('https');
var cheerio = require('cheerio');
var mysql = require('mysql');
// Table name. Nothing in the visible listing reads this variable; the INSERT
// statement in filterJobItem hard-codes `job`.
var table = "job";
var moment = require('moment');
// Timestamp taken once at start-up; the two strings below are derived from it.
var today = moment();
// Year prefix such as "2017年", prepended to the date text scraped from a posting.
var year = today.format('YYYY年');
// moment objects are mutable: subtract() changes the instance it is called on.
// Work on a clone so that `today` keeps meaning "today".
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日');
var urlTool = require("url");
var qs = require('querystring');
// Settings handed to mysql.createConnection() by handleDisconnect().
// NOTE(review): credentials are hard-coded here; consider loading them from
// the environment before using this outside a local experiment.
var db_config = {
    host: '127.0.0.1',
    port: '3306',
    user: 'root',
    password: '123456',
    database: 'node'
};

// Current MySQL connection; assigned (and re-assigned) by handleDisconnect().
var connection;
/**
 * Creates a fresh MySQL connection from db_config, stores it in the shared
 * `connection` variable and wires up reconnect handling.
 *
 * - If the initial connect fails, another attempt is scheduled 2 seconds later.
 * - If an established connection later reports PROTOCOL_CONNECTION_LOST, a new
 *   connection is created immediately; any other error is rethrown.
 */
function handleDisconnect() {
    var conn = mysql.createConnection(db_config);
    connection = conn;

    conn.connect(function onConnect(connectErr) {
        if (!connectErr) {
            // Connected: nothing further to do.
            return;
        }
        console.log('進行斷線重連:' + new Date());
        // Retry every 2 seconds until a connection can be opened.
        setTimeout(handleDisconnect, 2000);
    });

    conn.on('error', function onError(dbErr) {
        console.log('db error', dbErr);
        if (dbErr.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw dbErr;
        }
        handleDisconnect();
    });
}
/**
 * Parses one job-list results page and inserts every posting into the `job` table.
 *
 * Each `<li>` under `.job-list ul` is read as one posting:
 *   - the id is taken from the third path segment of the title link's href,
 *     minus its file extension;
 *   - the link text is split on a space into title and salary;
 *   - the `<p>` in `.info-primary` is split on the vline separator into
 *     city / work years / education;
 *   - the `<p>` in `.info-company` is split the same way into
 *     industry / financing / company size.
 *
 * Items missing the link href or either `<p>` are skipped instead of raising a
 * TypeError. The shared MySQL connection is opened lazily, once, rather than
 * once per posting, so a page no longer leaks one connection per row.
 *
 * @param {string} html - Raw HTML of a results page.
 * @returns {Array<Object>} One record per posting queued for insertion, keyed by
 *     the column names of the INSERT statement.
 */
function filterJobItem(html){
    var SEPARATOR = '<em class="vline"></em>';
    var addJob = 'insert into job(titleId, title, salary, city, workYear, education, companyName, industry, financing, companySize,date) values(?,?,?,?,?,?,?,?,?,?,?)';
    var $ = cheerio.load(html, {decodeEntities: false});
    var jobData = [];

    $('.job-list').find('ul').children('li').each(function(){
        var item = $(this);
        var primaryInfo = item.find('.job-primary').find('.info-primary');
        var companyInfo = item.find('.job-primary').find('.info-company');
        var link = primaryInfo.find('.name').find('a');
        var href = link.attr('href');
        var msg = primaryInfo.find('p').html();
        var companyMsg = companyInfo.find('p').html();
        var idSegment = href ? href.split('/')[2] : undefined;

        // Not a regular posting (no usable link or info paragraphs): skip it
        // and carry on with the next <li>.
        if (!idSegment || msg == null || companyMsg == null) {
            return;
        }

        var id = idSegment.split('.')[0];
        var titleAndSalarys = link.text().split(' ');
        var msgs = msg.split(SEPARATOR);
        var companyMsgs = companyMsg.split(SEPARATOR);
        var companyShortName = companyInfo.find('.company-text').find('.name').find('a').text();

        // The first three characters of the .job-time text are dropped
        // (presumably a label such as "發佈於" - confirm against the live markup).
        // "昨天" becomes yesterday's full date; anything else gets the current
        // year prepended.
        var date = item.find('.job-time').text().slice(3);
        date = ('昨天' === date) ? yesterday : year + date;

        jobData.push({
            titleId: id,
            title: titleAndSalarys[0],
            salary: titleAndSalarys[1],
            city: msgs[0],
            workYear: msgs[1],
            education: msgs[2],
            companyName: companyShortName,
            industry: companyMsgs[0],
            financing: companyMsgs[1],
            companySize: companyMsgs[2],
            date: date
        });

        // Open the shared connection only if none exists yet.
        if (!connection) {
            handleDisconnect();
        }

        var param = [id, titleAndSalarys[0], titleAndSalarys[1], msgs[0], msgs[1], msgs[2], companyShortName, companyMsgs[0], companyMsgs[1], companyMsgs[2], date];
        connection.query(addJob, param, function(error, result){
            if (error) {
                console.log(error.message);
            } else {
                // The mysql driver reports the generated key as `insertId`.
                console.log('insert id: ' + result.insertId);
            }
        });
    });

    return jobData;
}
// Base URL that robot() appends the query string to; it is declared but not
// assigned in this listing.
var url;

/**
 * Downloads one page over HTTPS and passes the complete body to filterJobItem.
 *
 * @param {string} url - Address of the page to fetch.
 */
function httpGet(url){
    https.get(url, function(res){
        var html = '';
        // Decode at the stream level. Appending raw Buffer chunks to a string
        // decodes each chunk on its own, which corrupts any multi-byte UTF-8
        // character that straddles a chunk boundary.
        res.setEncoding('utf8');
        res.on('data', function(chunk){
            html += chunk;
        });
        res.on('end', function(){
            filterJobItem(html);
        });
    }).on('error', function(err){
        // Keep the original message and add the cause, so failures can be diagnosed.
        console.log('出錯了!', err && err.message);
    });
}
// Page numbers to crawl; fill this array before running the script.
var pages = [];

/**
 * Requests every page listed in `pages`.
 *
 * Each page URL is built from the shared base `url` plus a query string.
 * The result is kept in a local variable: writing it back to `url` would make
 * every later page inherit the query strings of all earlier pages.
 *
 * NOTE(review): `url` must hold the site's base address (ending in "?") before
 * this runs; the listing never assigns it.
 */
function robot(){
    pages.forEach(function(page){
        var pageUrl = url + qs.stringify({ query: 'Java', page: page, ka: 'page-1' }, '&');
        httpGet(pageUrl);
    });
}

robot();
二、操作
1.1、引入依賴
var https = require('https');// use http or https, depending on the scheme of the site being crawled
var cheerio = require('cheerio');// usage is similar to jQuery
var mysql = require('mysql');
var moment = require('moment');// date/time formatting
var urlTool = require("url");
var qs = require('querystring');// url and querystring are Node.js's two main tools for working with URLs
1.2、依賴的簡單使用
1.2.1 moment
var today = moment();
var year = today.format('YYYY年');// e.g. 2017年
// subtract() mutates the moment it is called on, so clone first to keep
// `today` pointing at today.
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日');// yesterday's date, formatted like 2017年11月24日
1.2.2 url和querystring
// Take the raw query string (the part after "?") from the URL.
var getQuery = urlTool.parse(url).query;
// Decode the query string into a key/value object.
var getData = qs.parse(getQuery);//{ query: 'Java', page: '1', ka: 'page-1' }
// Encode an object back into a query string, joining the pairs with "&".
getData = qs.stringify({ query: 'Java', page: '4', ka: 'page-1' },'&');//query=Java&page=4&ka=page-1
1.2.3 mysql
// Reconnecting to MySQL from Node.js after the connection drops.
// Settings handed to mysql.createConnection() by handleDisconnect().
var db_config = {
    host: '127.0.0.1',
    port: '3306',
    user: 'root',
    password: '123456',
    database: 'node'
};

// Current MySQL connection; assigned (and re-assigned) by handleDisconnect().
var connection;
/**
 * Creates a fresh MySQL connection from db_config, stores it in the shared
 * `connection` variable and wires up reconnect handling: a failed connect is
 * retried after 2 seconds, a PROTOCOL_CONNECTION_LOST error reconnects
 * immediately, and any other error is rethrown.
 */
function handleDisconnect() {
    var conn = mysql.createConnection(db_config);
    connection = conn;

    conn.connect(function onConnect(connectErr) {
        if (!connectErr) {
            // Connected: nothing further to do.
            return;
        }
        console.log('進行斷線重連:' + new Date());
        // Retry every 2 seconds until a connection can be opened.
        setTimeout(handleDisconnect, 2000);
    });

    conn.on('error', function onError(dbErr) {
        console.log('db error', dbErr);
        if (dbErr.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw dbErr;
        }
        handleDisconnect();
    });
}
1.2.4 cheerio(用法類似jquery)
// Wrap the element currently being visited so the jQuery-style API can be used on it.
item = $(this);
// Take the third "/"-separated segment of the link's href and drop everything from its first ".".
id = primaryInfo.find('.name').find('a').attr('href').split('/')[2].split('.')[0];
// Read the link's text content.
titleAndSalary = primaryInfo.find('.name').find('a').text();
三、爬蟲的使用
將要爬取的頁碼以數組的形式填入 pages(例如 var pages = [1, 2, 3];),然後在命令行執行 node server.js 即可。