在之前的幾篇博文中,曾經設計實現了單線程爬蟲和多線程爬蟲,但是自己私下想了想,其實在實現多線程爬蟲時,所有的線程都是自己通過調用底層的API來實現的,這樣的調用一般來說並不是很好,並且又鑑於自己最近學習了下boost::thread相關的東西,於是索性想將之前的那個多線程爬蟲程序改爲boost::thread版的多線程爬蟲,好了,廢話說了這麼多,現在該是設計代碼的時候,代碼如下:
// Simple multi-threaded web crawler built on libcurl + boost::thread.
// NOTE(review): defining non-static globals (mut, urlSet, finishUrlSet) in a
// header violates the ODR if this header is ever included from more than one
// translation unit; kept as-is because the project is currently single-TU.
#ifndef HTTP_CURL_H  // renamed: identifiers with leading double underscore are reserved
#define HTTP_CURL_H
#include <boost/smart_ptr.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread.hpp>
#include <boost/function.hpp>
#include <boost/bind.hpp>
#include <curl/curl.h>
#include <iostream>   // std::cout / std::endl used by Spider::displayUrl
#include <string>
#include <set>
#include <cassert>    // assert() used by getHttpResponse() and task()
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// NOTE(review): using-directives in a header leak into every includer;
// left in place because the rest of the file relies on them.
using namespace std;
using namespace boost;
#define MAX_BUFFERSIZE 1024*10  // currently unused
#define MAX_THREAD 10           // currently unused (two threads are hard-coded in main)
// Next output file number. NOTE(review): incremented from multiple threads in
// Spider::start without synchronization in the original code — data race.
static int fileIndex = 1;
boost::mutex mut;               // guards urlSet / finishUrlSet
std::set<string> urlSet;        // frontier: URLs waiting to be crawled
std::set<string> finishUrlSet;  // URLs already handed out to a crawler thread
typedef set<string>::iterator urlSet_Iter;
// Seed the frontier with an initial URL.
#define BEGIN_SPIDER(path) {urlSet.insert(path);}
class HttpCurl
{
public:
HttpCurl()
{
conn = NULL;
}
~HttpCurl()
{
curl_easy_cleanup(conn);
}
static bool HttpCurlInit()
{
urlSet.clear();
finishUrlSet.clear();
CURLcode code;
code = curl_global_init(CURL_GLOBAL_DEFAULT);
if(CURLE_OK != code)
{
printf("Failed to global init default\n");
return false;
}
return true;
}
bool InitCurlObject(string& context)
{
CURLcode code;
conn = curl_easy_init();
if(NULL == conn)
{
printf("Failed to create CURL\n");
return false;
}
if(!setWriteFunc())
{
printf("Failed to set write\n");
return false;
}
if(!setWriteBuff(context))
{
printf("Failed to set buffer\n");
return false;
}
return true;
}
bool setWriteFunc()
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
if(CURLE_OK != code)
{
printf("Failed to set write\n");
return false;
}
return true;
}
bool setWriteBuff(string& context)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
if(CURLE_OK != code)
{
printf("Failed to set write data\n");
return false;
}
return true;
}
bool setUrl(string& url)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
if(CURLE_OK != code)
{
printf("Failed to set URL\n");
return false;
}
return true;
}
bool getHttpResponse()
{
CURLcode code;
assert(conn);
code = curl_easy_perform(conn);
if(CURLE_OK != code)
{
printf("Failed to get response\n");
return false;
}
return true;
}
static long write(void* data,int size,int nmemb,string& context)
{
long sizes = size*nmemb;
std::string temp((char*)data,sizes);
context += temp;
return sizes;
}
bool save(const string& context,string filename)
{
CURLcode code;
int retcode = 0;
code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
if((CURLE_OK == code)&& retcode ==200)
{
int length = strlen(context.c_str());
FILE* file = fopen(filename.c_str(),"w+");
fseek(file,0,SEEK_SET);
fwrite(context.c_str(),1,length,file);
fclose(file);
return true;
}
return false;
}
private:
CURL* conn;
};
// One crawler worker: owns a per-thread HttpCurl handle and a page buffer,
// extracts href="..." links containing "http", and feeds them back into the
// shared frontier (urlSet) under the global mutex.
class Spider
{
public:
	Spider(shared_ptr<HttpCurl>& cul) : httpCurl(cul)
	{
		httpCurlUrlSet.clear();
		context.clear();
		initCurl(httpCurl, context);
	}
	~Spider() {}
	// Bind this spider's context buffer to the curl handle's write callback.
	bool initCurl(shared_ptr<HttpCurl>& httpCurl, string& context)
	{
		return httpCurl->InitCurlObject(context);
	}
	// Scan `context` for href attributes; keep every quoted value that
	// contains "http" in the thread-local httpCurlUrlSet.
	void parseUrl(const string& context)
	{
		const string tag = "href";
		const string tag2 = "\"";
		const string tag3 = "http";
		string::size_type iter = context.find(tag);
		while(iter != string::npos)
		{
			string::size_type tempBegin = context.find(tag2, iter);
			string::size_type tempEnd = string::npos;
			if(tempBegin != string::npos)
			{
				++tempBegin;  // skip the opening quote
				tempEnd = context.find(tag2, tempBegin);
			}
			if(tempBegin != string::npos && tempEnd != string::npos && tempEnd > tempBegin)
			{
				string url = context.substr(tempBegin, tempEnd - tempBegin);
				if(url.find(tag3) != string::npos)
					httpCurlUrlSet.insert(url);
			}
			// Original bug: when no quote pair was found, the next find()
			// restarted from a stale offset and the loop never advanced.
			if(tempEnd == string::npos)
				break;
			iter = context.find(tag, tempEnd);
		}
		// size() is size_t; %d was undefined behaviour in the original
		printf("httpCurlUrlSet.size():%lu\n", (unsigned long)httpCurlUrlSet.size());
	}
	// Save the fetched page through the curl handle (only written on HTTP 200).
	bool write(const string& context, const string& filename)
	{
		return httpCurl->save(context, filename);
	}
	// Fetch one URL: name the output file, perform the transfer, then
	// parse links, persist the body, and publish new URLs to the frontier.
	void start(string url, string& context)
	{
		char filename[64];
		{
			// fileIndex is shared by all spider threads: the read-modify-write
			// must happen under the global mutex (was a data race).
			boost::unique_lock<boost::mutex> lock(mut);
			snprintf(filename, sizeof(filename), "%d.html", fileIndex++);
		}
		// The WRITEDATA callback appends; without this clear() every fetch
		// accumulated all previous pages and re-parsed their links.
		context.clear();
		httpCurl->setUrl(url);
		if(httpCurl->getHttpResponse())
		{
			parseUrl(context);
			write(context, filename);
			insertUrl();
		}
	}
	// Merge this thread's freshly-found URLs into the shared frontier.
	void insertUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		for(urlSet_Iter iter = httpCurlUrlSet.begin(); iter != httpCurlUrlSet.end(); ++iter)
			urlSet.insert(*iter);
		httpCurlUrlSet.clear();
	}
	// Debug helper: dump the frontier.  Takes the mutex because urlSet is
	// mutated concurrently by the worker threads (was an unguarded read).
	void displayUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		for(urlSet_Iter iter = urlSet.begin(); iter != urlSet.end(); ++iter)
		{
			cout << *iter << endl;
		}
	}
	// Expose the page buffer bound to the curl handle in the constructor.
	string& getContext()
	{
		return context;
	}
	// Pop the first not-yet-crawled URL from the frontier (marking it
	// finished), or return "" when nothing is pending.
	string getUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		urlSet_Iter iter = urlSet.begin();
		for(; iter != urlSet.end(); ++iter)
		{
			if(finishUrlSet.find(*iter) == finishUrlSet.end())
				break;  // first URL we have not crawled yet
		}
		if(iter != urlSet.end())
		{
			string url = *iter;
			urlSet.erase(iter);
			finishUrlSet.insert(url);
			return url;
		}
		return "";
	}
private:
	shared_ptr<HttpCurl> httpCurl;      // per-thread curl handle
	std::set<std::string> httpCurlUrlSet;  // links found in the current page
	std::string context;                // page body buffer (curl writes here)
};
// Thread entry point: repeatedly pull a pending URL from the shared frontier
// and crawl it.  Never returns — the threads run until the process exits.
static void task(Spider* spider)
{
	assert(spider);
	for(;;)
	{
		string url = spider->getUrl();
		if(!url.empty())
		{
			printf("url=%s\n", url.c_str());
			spider->start(url, spider->getContext());
		}
		else
		{
			// Frontier momentarily empty: yield instead of busy-spinning at
			// 100% CPU while other threads are still filling it.
			boost::this_thread::yield();
		}
	}
}
#endif
測試程序:
#include "curlTest.h"
int main()
{
HttpCurl::HttpCurlInit();
BEGIN_SPIDER("www.baidu.com");
shared_ptr<HttpCurl> curl1(new HttpCurl());
Spider spider1(curl1);
boost::thread thr1(boost::bind(&task,&spider1));
shared_ptr<HttpCurl> curl2(new HttpCurl());
Spider spider2(curl2);
boost::thread thr2(boost::bind(&task,&spider2));
thr1.join();
thr2.join();
//sleep(100);
return 0;
}
測試結果:
url=http://anquan.baidu.com/bbs/thread-10353-1-1.html
httpCurlUrlSet.size():221
url=http://anquan.baidu.com/bbs/thread-10356-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82280&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82424&ptid=10356
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82455&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82507&ptid=10353
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82703&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/thread-10360-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/thread-10365-1-1.html
httpCurlUrlSet.size():224
url=http://anquan.baidu.com/bbs/thread-10454-1-1.html
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82343&ptid=10365
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82848&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82964&ptid=10454
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82969&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=83171&ptid=10454
總結
本篇博文主要是針對上幾篇博文的修改,主要是將相關的線程部分改爲了boost::thread庫,爲了保持程序的完整性,將所有的程序貼了出來,方便閱讀,設計思想很簡單,主要是爲boost::thread線程提供相應的處理函數即可,在實現的過程中,開始是想使用重載operator()的形式,但是測試下來發現其爬取不到任何的東西,個人感覺可能是在註冊爬取網頁緩存區出了問題,遂將其實現爲函數的形式,然後再顯式將其註冊給線程,發現可行,總之,找到能夠解決的方案就行,不要太追究其中的部分細節,尤其是在時間很緊的情況下,好了,等有時間再看看這個問題,本博文到此結束,多謝
如果需要,請註明轉載,多謝