nodejs爬蟲的入門

原創

2020-05-03 20:34

首先想清楚你到底用什麼做。
因爲不同的用法有細微的區別。最方便入門的應該是使用request和cheerio。cheerio是nodejs中的jquery。request是更方便的http模塊。
爬蟲其實就是兩步：
第一步從網上把整個html讀下來；其實就是req.get()這一個函數而已。此時的nodejs相當於一個客戶端，向服務端的host發出request請求；請求完成後會調用我們傳入的回調函數callback（這裏寫成箭頭函數的形式），它有三個參數：error、response和body，其中body是返回的html內容。
而req.get()的第一個參數是傳入的配置對象：除了需要爬取的鏈接地址url外，還可以通過headers對http的請求頭進行修改。

// Options for fetching the SCUT news landing page with the `request` module.
// NOTE(review): `json: true` and the JSON content-type/accept headers are sent
// even though the target is an HTML page — confirm this is intended.
const homepageRequest = {
    url: 'https://www.scut.edu.cn/new/',
    json: true,
    encoding: 'utf-8',
    port: 443,
    headers: {
        'content-type': 'application/json',
        'accept': 'application/json',
        'User-Agent': 'SCUT-NIRCourse-TEST',
        'Upgrade-Insecure-Requests': '1',
    },
};

// Issue the GET; on success pass the body to dohtml(), otherwise log the error.
req.get(homepageRequest, (err, res, body) => {
    if (err) {
        console.log(err);
        return;
    }
    dohtml(body);
});

這裏特意和nodejs自帶的http/https模塊區分一下寫法。https.get()的第一個參數可以是完整的url，也可以是由host加上path組成的選項對象；但是注意host是要訪問的主機名，不能帶上http://或者https://的協議名，path則是主機名之後的具體路徑。
先忽略promise的用法，直接看https.get的內容，可以看出結構應該也是https.get(url,callback).on();
裏面的res.on()就像onclick一樣，是註冊一個等待觸發的事件處理函數，事件發生時就執行。比如res.on('data', ...)就是每當收到一塊data數據時，執行回調裏面的內容；res.on('end', ...)則是在數據全部接收完畢時執行。

// Request options addressed by host name (no protocol prefix) rather than by
// a full URL. getPageAsync() reads `uri.host` when logging its progress.
let uri = {
    host: 'scut.edu.cn',
    json: true,
    encoding: 'utf-8',
    port: 443,
    headers: {
        'content-type': 'application/json',
        'accept': 'application/json',
        'User-Agent': 'SCUT-NIRCourse-TEST',
        'Upgrade-Insecure-Requests': '1',
    },
};

/**
 * Download a page over HTTPS and resolve with its whole body as one string.
 *
 * Expects the Node.js `https` module to be in scope under the name `https`.
 *
 * @param {string|URL|Object} url - Whatever `https.get` accepts as its first
 *     argument: a full URL string, a URL object, or an options object with
 *     `host`/`hostname` and `path`.
 * @returns {Promise<string>} Resolves with the response body once the
 *     response ends; rejects with the underlying error when the request or
 *     the response stream emits 'error'.
 */
function getPageAsync(url) {
    return new Promise(function (resolve, reject) {
        // Describe the target that is actually being requested so the log
        // stays truthful for every caller.
        let target;
        if (typeof url === 'string') {
            target = url;
        } else if (url instanceof URL) {
            target = url.href;
        } else {
            target = 'https://' + (url.hostname || url.host || '') + (url.path || '');
        }
        console.log('正在爬取 ' + target);

        const fail = function (e) {
            reject(e);
            console.log('獲取頁面出錯！');
        };

        https.get(url, function (res) {
            // Decode as UTF-8 at the stream level so a multi-byte character
            // split across two chunks is not corrupted by per-chunk decoding.
            res.setEncoding('utf8');

            let html = '';
            res.on('data', function (chunk) {
                html += chunk;
            });
            res.on('end', function () {
                console.log("成功讀取！");
                resolve(html);
            });
            res.on('error', fail);
        }).on('error', fail);
    });
}

第二步是從html中提取自己要的信息。
我這裏是提取網站中指向別的地方的鏈接。也就是filterhtml函數裏面的內容。下面放的是完整的可以跑的代碼。

const req = require('request');
var cheerio = require('cheerio');


/**
 * Extract outbound links from the fetched HTML.
 *
 * Three passes are made, and their results are appended in this order:
 *   1. news section   – `.post-news .item-meta a`, labelled by `title`;
 *   2. other sections – `.post-more a`, labelled by `class`;
 *   3. "more" links   – `.post-more a`, labelled by `title`, with the href
 *      resolved against the site root so relative paths become absolute.
 * NOTE(review): passes 2 and 3 use the same selector, as the original code
 * did — confirm whether the "more" pass was meant to target another element.
 *
 * Anchors without an `href` are skipped, and a missing label becomes an empty
 * string, so no "undefined" text ends up in the output.
 *
 * @param {string} ele - HTML source of the page.
 * @returns {string[]} Entries of the form `label->url`.
 */
function filterhtml(ele) {
    const SITE_ROOT = "https://www.scut.edu.cn";
    const $ = cheerio.load(ele);
    const link_data = [];

    // Run one selector and append a `label->url` entry for every anchor found.
    // Each pass performs its own query, so no pass can accidentally iterate
    // another pass's result set.
    const collect = (selector, labelAttr, makeAbsolute) => {
        $(selector).each((index, element) => {
            const attribs = element.attribs || {};
            const href = attribs.href;
            if (!href) {
                return;
            }
            let link_url = href;
            if (makeAbsolute) {
                try {
                    // URL resolution leaves already-absolute hrefs untouched,
                    // unlike blind string prefixing.
                    link_url = new URL(href, SITE_ROOT).href;
                } catch (e) {
                    // Malformed href: nothing useful to crawl, skip it.
                    return;
                }
            }
            const link_title = attribs[labelAttr] === undefined ? '' : attribs[labelAttr];
            link_data.push(link_title + '->' + link_url);
        });
    };

    collect(".post-news .item-meta a", "title", false);
    collect(".post-more a", "class", false);
    collect(".post-more a", "title", true);

    return link_data;
}

// Options for fetching the SCUT news landing page with the `request` module.
// NOTE(review): `json: true` and the JSON content-type/accept headers are sent
// even though the target is an HTML page — confirm this is intended.
const crawlRequest = {
    url: 'https://www.scut.edu.cn/new/',
    json: true,
    encoding: 'utf-8',
    port: 443,
    headers: {
        'content-type': 'application/json',
        'accept': 'application/json',
        'User-Agent': 'SCUT-NIRCourse-TEST',
        'Upgrade-Insecure-Requests': '1',
    },
};

// Issue the GET; on success pass the body to dohtml(), otherwise log the error.
req.get(crawlRequest, (err, res, body) => {
    if (err) {
        console.log(err);
        return;
    }
    dohtml(body);
});

/**
 * Run link extraction on a downloaded page and print the result.
 *
 * @param {string} pages - HTML source handed over by the request callback.
 * @returns {string[]} The list produced by filterhtml(), returned so callers
 *     can go on to process the links (earlier versions returned nothing).
 */
function dohtml(pages) {
    console.log("接下來是整個文檔");
    const link_data = filterhtml(pages);
    console.log(link_data);
    return link_data;
}

此時就擁有了一個link_data列表：

接下來再處理這些鏈接的 URL：需要能判別提取的 URL 是否已處理過，不重複下載和解析已蒐集過的網頁。

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

nodejs爬蟲的入門

redis的key亂碼問題和值自增問題

CORS error 但是 status code 是200 OK

一個開源且全面的C#算法實戰教程

一款.NET開源、功能強大、跨平臺的繪圖庫 - OxyPlot

壓縮上傳的GPU數據的方案

使用skopeo同步鏡像

vue裏dom節點和window對象

pyinstaller打包exe軟件並解決 Pyinstaller failed to execute script main問題（用了sklearn庫）

Anaconda執行import任何報valueError: failed to parse CPython sys.version錯誤的問題

python3.6anaconda安裝sklearn踩坑實錄

emit朝父組件傳遞參數，並且需要父組件自身的參數

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結