前端學習筆記:Node.js 爬蟲

一、代碼

var https = require('https');
var cheerio = require('cheerio');
var mysql  = require('mysql');
// Table name. NOTE(review): not referenced anywhere in the visible code — the
// INSERT statement in filterJobItem() hard-codes `job`.
var table = "job";
var moment = require('moment');
// Reference "now" for normalising the relative dates shown on the listing page.
var today = moment();
// Year prefix (e.g. "2017年") that filterJobItem() prepends to listing dates.
var year = today.format('YYYY年');
// moment objects are mutable: subtract() changes the instance it is called on.
// Work on a clone so `today` keeps pointing at the current date.
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日');
var urlTool = require("url");
var qs = require('querystring');

// Connection settings passed to mysql.createConnection() by handleDisconnect().
var db_config = {
    host: '127.0.0.1',
    port: '3306',
    user: 'root',
    password: '123456',
    database: 'node'
};

// Current MySQL connection object; (re)assigned by handleDisconnect().
var connection;
/**
 * Creates a new MySQL connection from `db_config`, stores it in the shared
 * `connection` variable and wires up reconnection:
 *  - if the initial connect fails, the whole routine is retried after 2 s;
 *  - if an established connection reports PROTOCOL_CONNECTION_LOST, a new
 *    connection is created immediately;
 *  - any other connection error is rethrown.
 */
function handleDisconnect() {
    // Connect callback: nothing to do on success, schedule a retry on failure.
    function onConnect(connectError) {
        if (!connectError) {
            return;
        }
        console.log('進行斷線重連:' + new Date());
        setTimeout(handleDisconnect, 2000);
    }

    // 'error' event handler: only a lost connection is recoverable here.
    function onError(dbError) {
        console.log('db error', dbError);
        if (dbError.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw dbError;
        }
        handleDisconnect();
    }

    connection = mysql.createConnection(db_config);
    connection.connect(onConnect);
    connection.on('error', onError);
}

/**
 * Parses the HTML of one job-listing page and inserts every posting it finds
 * into the `job` table.
 *
 * Each `<li>` under `.job-list ul` is expected to contain:
 *  - `.info-primary`: a `.name a` link (href supplies the id, text supplies
 *    "title salary") and a `<p>` holding three fields separated by
 *    `<em class="vline"></em>` (stored as city, workYear, education);
 *  - `.info-company`: a `.company-text .name a` link (company name) and a
 *    `<p>` with three fields using the same separator (stored as industry,
 *    financing, companySize);
 *  - `.job-time`: the publication date text.
 * Items missing the link target or either `<p>` are logged and skipped instead
 * of aborting the whole page with a TypeError.
 *
 * @param {string} html - Raw HTML of a listing page.
 * @returns {Array<Array<string>>} One array of column values per posting that
 *     was queued for insertion, in the column order of the INSERT statement.
 */
function filterJobItem(html){
	var $ = cheerio.load(html, {decodeEntities: false});
	var jobItem = $('.job-list').find('ul').children('li');
	var jobData = [];
	var separator = '<em class="vline"></em>';
	var addJob = 'insert into job(titleId, title, salary, city, workYear, education, companyName, industry, financing, companySize,date) values(?,?,?,?,?,?,?,?,?,?,?)';

	if(jobItem.length === 0){
		return jobData;
	}

	// Open the database connection once and reuse it. Calling
	// handleDisconnect() for every item would create a new connection per
	// posting and never close any of them.
	if(!connection){
		handleDisconnect();
	}

	jobItem.each(function(){
		var item = $(this);
		var primaryInfo = item.find('.job-primary').find('.info-primary');
		var companyInfo = item.find('.job-primary').find('.info-company');
		var link = primaryInfo.find('.name').find('a');
		var hrefParts = (link.attr('href') || '').split('/');
		var msg = primaryInfo.find('p').html();
		var companyMsg = companyInfo.find('p').html();

		if(hrefParts.length < 3 || typeof msg !== 'string' || typeof companyMsg !== 'string'){
			console.log('skip job item with unexpected markup');
			return; // returning undefined continues with the next item
		}

		// The id is the third "/"-separated href segment without its extension.
		var id = hrefParts[2].split('.')[0];
		var titleAndSalarys = link.text().split(' ');
		var msgs = msg.split(separator);
		var companyShortName = companyInfo.find('.company-text').find('.name').find('a').text();
		var companyMsgs = companyMsg.split(separator);
		// Drop the first three characters of the time text (presumably a label
		// in front of the date — confirm against the page markup).
		var date = item.find('.job-time').text().slice(3);
		date = ('昨天' === date) ? yesterday : year + date;

		var param = [id,titleAndSalarys[0],titleAndSalarys[1],msgs[0],msgs[1],msgs[2],companyShortName,companyMsgs[0],companyMsgs[1],companyMsgs[2],date];
		jobData.push(param);
		connection.query(addJob, param, function(error, result){
			if(error){
				console.log(error.message);
			}else{
				// The driver reports the generated key as `insertId`.
				console.log('insert id: ' + result.insertId);
			}
		});
	});
	return jobData;
}

// Base URL of the listing pages; robot() appends the query string to it.
// Must be assigned before robot() issues any request.
var url;

/**
 * Downloads one listing page over HTTPS and hands the complete HTML to
 * filterJobItem(). Request errors are logged together with their message.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 */
function httpGet(url){
	https.get(url, function(res){
		var html = '';
		// Decode at the stream level. Concatenating raw Buffer chunks converts
		// each chunk on its own and corrupts a multi-byte character that is
		// split across two chunks. UTF-8 is what the implicit conversion used.
		res.setEncoding('utf8');
		res.on('data', function(chunk){
			html += chunk;
		});
		res.on('end', function(){
			filterJobItem(html);
		});
	}).on('error', function(err){
		console.log('出錯了!', err.message);
	});
}

// Page numbers to crawl; fill this in before running the script.
var pages = [];

/**
 * Requests every listing page whose number appears in `pages`.
 *
 * The shared `url` variable is treated as a read-only base (it must already
 * end where the query string starts, e.g. with "?"). Each page URL is built in
 * a local variable.
 */
function robot(){
	pages.forEach(function(page){
		// Appending to the shared `url` would make every request after the
		// first carry the query strings of all previous pages.
		// NOTE(review): `ka` is fixed to 'page-1' for every page — confirm
		// whether it should follow the page number.
		var pageUrl = url + qs.stringify({ query: 'Java', page: page, ka: 'page-1' }, '&');
		httpGet(pageUrl);
	});
}

robot();

二、操作

1.1、引入依賴

var https = require('https');//取決於你要爬網址是http還是https
var cheerio = require('cheerio');//用法類似jquery
var mysql  = require('mysql');
var moment = require('moment');//格式化時間
var urlTool = require("url");
var qs = require('querystring');//url和querystring是nodejs用來處理url的兩大利器

1.2、依賴的簡單使用

1.2.1  moment

var today = moment();
var year = today.format('YYYY年'); // e.g. 2017年
// subtract() mutates the moment it is called on; clone first so `today` stays unchanged.
var yesterday = today.clone().subtract(1, 'days').format('YYYY年MM月DD日'); // yesterday's date, e.g. 2017年11月24日

1.2.2 url和querystring

// Split the URL into its parts and keep only the query-string portion.
var parsedUrl = urlTool.parse(url);
var getQuery = parsedUrl.query;
// Decode the query string into a plain object.
var getData = qs.parse(getQuery); // { query: 'Java', page: '1', ka: 'page-1' }
// Encode an object back into a query string, joining the pairs with '&'.
var nextPageParams = { query: 'Java', page: '4', ka: 'page-1' };
getData = qs.stringify(nextPageParams, '&'); // query=Java&page=4&ka=page-1

1.2.3 mysql

// Node.js + mysql: reconnect automatically when the connection drops.
// Connection settings passed to mysql.createConnection().
var db_config = {
    host: '127.0.0.1',
    port: '3306',
    user: 'root',
    password: '123456',
    database: 'node'
};

// Current MySQL connection object; (re)assigned by handleDisconnect().
var connection;
/**
 * Creates a new MySQL connection from `db_config`, stores it in the shared
 * `connection` variable and wires up reconnection:
 *  - if the initial connect fails, the whole routine is retried after 2 s;
 *  - if an established connection reports PROTOCOL_CONNECTION_LOST, a new
 *    connection is created immediately;
 *  - any other connection error is rethrown.
 */
function handleDisconnect() {
    // Connect callback: nothing to do on success, schedule a retry on failure.
    function onConnect(connectError) {
        if (!connectError) {
            return;
        }
        console.log('進行斷線重連:' + new Date());
        setTimeout(handleDisconnect, 2000);
    }

    // 'error' event handler: only a lost connection is recoverable here.
    function onError(dbError) {
        console.log('db error', dbError);
        if (dbError.code !== 'PROTOCOL_CONNECTION_LOST') {
            throw dbError;
        }
        handleDisconnect();
    }

    connection = mysql.createConnection(db_config);
    connection.connect(onConnect);
    connection.on('error', onError);
}

1.2.4  cheerio(用法類似jquery)

// Excerpt from a cheerio .each() callback; `$`, `primaryInfo` and the assigned
// names are declared outside this fragment.
// Wrap the element bound to `this` so the jQuery-style API can be used on it.
item = $(this);
// Take the third "/"-separated segment of the link's href and keep the part before its first ".".
id = primaryInfo.find('.name').find('a').attr('href').split('/')[2].split('.')[0];
// Text content of the same link.
titleAndSalary = primaryInfo.find('.name').find('a').text();

三、爬蟲的使用

先把 url 設爲目標列表頁的基礎網址(以 ? 結尾,後面會接上查詢字串),再將要爬取網頁的頁數以數組的形式傳入 pages(例如 var pages = [1, 2, 3];),然後在命令行執行 node server.js 即可。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章