nodejs爬蟲如何設置動態ip以及userAgent

前言

在寫nodejs爬蟲的過程中,原網站可能會對某一時間段內集中訪問該頁面的ip進行封殺。那麼如何動態設置每次爬取使用的ip地址以及瀏覽器頭部信息呢?

動態userAgent

這是我收集到的常用的瀏覽器頭部信息,每次爬取的時候從中隨機選取一個,並使用superagent設置請求頭部的User-Agent字段就好了。

userAgent.js

// Pool of browser User-Agent strings; callers pick one at random per request.
// Every entry is unique so that a uniform random index yields a uniform choice
// (the list previously repeated three entries, which made them twice as likely).
// NOTE(review): two entries carry a trailing ", Lynx/..." segment, which looks like
// two User-Agent strings merged into one — confirm whether that is intended.
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
]

module.exports = userAgents

app.js

import request from 'superagent'
import userAgents from '../src/userAgent'

/**
 * Send a GET request to the target site with a randomly chosen User-Agent header.
 *
 * The request is started with `.end()` and not awaited, so the returned promise
 * resolves as soon as the request has been dispatched.
 *
 * @returns {Promise<void>}
 */
async function doRequest(){
  // Math.floor, not parseInt: parseInt converts its argument to a string first, so a
  // product below 1e-6 (printed as e.g. "3e-8") would parse as 3 and index out of range.
  const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)]
  request.get('http://www.xxx.com')
    .set({ 'User-Agent': userAgent })
    .timeout({ response: 5000, deadline: 60000 })
    .end(async(err, res) => {
      // handle the response data here
    })
}

動態ip

設置動態IP需要用到一個superagent插件—superagent-proxy,除此之外爲了避免每次爬取時都去獲取一次動態IP的列表,我將爬取到的動態IP列表存放在redis中,並設置10分鐘的過期時間。數據過期之後再重新發送獲取動態IP的請求。
ps: 這裏我使用的動態IP是爬蟲網絡科技公司提供的免費代理,因爲免費所以難免會有些缺陷。有時候使用他的代理ip並不能訪問得通,我在後面會做單獨的處理。

package.json

{
  "name": "xxx",
  "version": "1.0.0",
  "description": "xxx",
  "main": "arf.js",
  "scripts": {
    "arf": "nodemon src/app.js --exec babel-node --config package.json"
  },
  "keywords": [
    "爬蟲"
  ],
  "author": "lidikang",
  "license": "MIT",
  "dependencies": {
    "bluebird": "^3.5.1",
    "cheerio": "^1.0.0-rc.2",
    "eventproxy": "^1.0.0",
    "mongoose": "^4.13.6",
    "mongoose-findorcreate": "^2.0.0",
    "progress": "^2.0.0",
    "redis": "^2.8.0",
    "superagent": "^3.8.1",
    "superagent-proxy": "^1.0.2"
  },
  "devDependencies": {
    "babel-cli": "^6.26.0",
    "babel-preset-es2015": "^6.24.1",
    "babel-preset-stage-2": "^6.24.1",
    "nodemon": "^1.12.4"
  },
  "nodemonConfig": {
    "ignore": [
      "ips.json",
      "docs/*"
    ],
    "delay": "2500"
  }
}

app.js

import request from 'superagent'
import requestProxy from 'superagent-proxy'
import redis from 'redis'
// Install the superagent-proxy plugin on superagent.
requestProxy(request)
// Promisify the redis client and multi prototypes with bluebird
// (getProxyIp relies on the resulting getAsync method).
// NOTE(review): `bluebird`, `config` and `userAgents` are not imported in this
// excerpt — confirm they are brought into scope elsewhere in app.js.
const redisPrototypes = [redis.RedisClient.prototype, redis.Multi.prototype]
redisPrototypes.forEach((proto) => {
  bluebird.promisifyAll(proto)
})
// Open the redis connection used for caching the proxy list.
const redisClient = connectRedis()

/**
 * Create the redis client and log its connection status.
 *
 * Connects to `config.REDIS_URL` and registers listeners that log when the
 * client emits "ready" and when it emits "error".
 *
 * @returns {object} the client returned by redis.createClient
 */
function connectRedis() {
  const onReady = () => {
    console.log('redis連接 √')
  }
  const onError = (err) => {
    console.log(`redis錯誤,${err}  ×`);
  }

  const client = redis.createClient(config.REDIS_URL)
  client.on("ready", onReady)
  client.on("error", onError)
  return client
}


/**
 * Pick a random proxy address, preferring the list cached in redis.
 *
 * Reads the comma-separated list stored under the `proxy_ips` key. When that key
 * is missing or empty, a fresh list is requested from the proxy provider, cached
 * for ten minutes, and one of its entries is returned. When the provider response
 * carries no non-empty `data.proxy_list` array, nothing is cached and the
 * fallback 'http://127.0.0.1' is returned.
 *
 * @returns {Promise<string>} one proxy address
 */
async function getProxyIp() {
  // Math.floor, not parseInt: parseInt converts its argument to a string first, so a
  // product below 1e-6 (printed as e.g. "3e-8") would parse as 3 and index out of range.
  const pickRandom = (list) => list[Math.floor(Math.random() * list.length)]

  // Try the cached list first.
  const localIpStr = await redisClient.getAsync('proxy_ips')
  if (localIpStr) {
    return pickRandom(localIpStr.split(','))
  }

  const ipsJson = (await request.get('http://api.pcdaili.com/?orderid=888888888&num=100&protocol=1&method=1&an_ha=1&sp1=1&sp2=1&format=json&sep=1')).body
  // Guard every level: a response without `data` must fall back instead of throwing.
  const proxyList = ipsJson && ipsJson.data && ipsJson.data.proxy_list
  // An empty list is treated like a failed request: caching '' and returning
  // undefined would only hand the caller an unusable proxy.
  if (!Array.isArray(proxyList) || proxyList.length === 0) {
    return 'http://127.0.0.1'
  }

  // Cache the fetched list for 10 minutes (fire-and-forget, as before).
  redisClient.set("proxy_ips", proxyList.join(','), 'EX', 10 * 60)
  return pickRandom(proxyList)
}

/**
 * Send a GET request to the target site through a randomly chosen proxy,
 * using a randomly chosen User-Agent header.
 *
 * The request is started with `.end()` and not awaited, so the returned promise
 * resolves once the proxy has been chosen and the request dispatched.
 *
 * @returns {Promise<void>}
 */
async function doRequest(){
  // Math.floor, not parseInt: parseInt converts its argument to a string first, so a
  // product below 1e-6 (printed as e.g. "3e-8") would parse as 3 and index out of range.
  const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)]
  const ip = await getProxyIp()
  // The proxy must be given as a full URI. Prefix the scheme unless the address
  // already has one (getProxyIp's fallback value 'http://127.0.0.1' does).
  const useIp = /^[a-z][a-z0-9+.-]*:\/\//i.test(ip) ? ip : 'http://' + ip
  request.get('http://www.xxx.com')
    .set({ 'User-Agent': userAgent })
    .timeout({ response: 5000, deadline: 60000 })
    .proxy(useIp)
    .end(async(err, res) => {
      // handle the response data here
    })
}

之前說爬蟲網絡科技的免費ip有些缺陷—代理成功率有些低。這點必須想辦法去修復,原理其實很簡單:既然一次不成功,那我就換個IP再試,直到成功了我才去開始執行解析html的邏輯。

/**
 * Fetch the target page through a random proxy and hand its HTML to parseDivision.
 *
 * Each attempt picks a fresh User-Agent and a fresh proxy. A failed attempt is
 * logged and retried with a new proxy; by default this continues until a
 * request succeeds.
 *
 * @param {number} [maxRetries=Infinity] number of retries allowed after the first
 *   failed attempt; the default retries until success.
 * @returns {Promise<void>} resolves after parseDivision has finished
 * @throws the last request error once `maxRetries` retries have been used up;
 *   errors from getProxyIp and parseDivision propagate to the caller.
 */
async function doRequest(maxRetries = Infinity){
  for (let attempt = 0; ; attempt += 1) {
    // Math.floor, not parseInt: parseInt converts its argument to a string first, so a
    // product below 1e-6 (printed as e.g. "3e-8") would parse as 3 and index out of range.
    const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)]
    const ip = await getProxyIp()
    // The proxy must be given as a full URI. Prefix the scheme unless the address
    // already has one (getProxyIp's fallback value 'http://127.0.0.1' does).
    const useIp = /^[a-z][a-z0-9+.-]*:\/\//i.test(ip) ? ip : 'http://' + ip

    let res
    try {
      // Awaiting the request (it is thenable) sends it and keeps failures
      // inside this function instead of in a detached callback.
      res = await request.get('http://www.xxx.com')
        .set({ 'User-Agent': userAgent })
        .timeout({ response: 5000, deadline: 60000 })
        .proxy(useIp)
    } catch (err) {
      if (attempt >= maxRetries) {
        throw err
      }
      console.log(`爬取頁面失敗,${err},正在重新尋找代理ip... ×`)
      // The proxy may be unreachable: loop again with another proxy.
      continue
    }

    // Parse the HTML only after a successful fetch.
    console.log('爬取頁面  √')
    await parseDivision(res.text)
    return
  }
}

如果你有啥疑問,歡迎寫信到我的郵箱([email protected])與我討論。

原文請查看http://andyliwr.github.io/2017/12/05/nodejs_spider_ip/

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章