A Multi-threaded Web Crawler Based on the Boost::Thread Library

In earlier posts I designed and implemented both a single-threaded crawler and a multi-threaded crawler. Looking back at the multi-threaded version, though, every thread was created by calling the underlying OS thread API directly, which is generally not a great approach. Since I have recently been studying boost::thread, I decided to rework that multi-threaded crawler into a boost::thread version. Enough preamble; the code is as follows:
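
Before the full listing, the essence of the change is just the pattern below: instead of creating threads through the raw OS API, a worker function is handed to a boost::thread object, which is joined later. This is only a minimal, self-contained sketch with a made-up worker() function, not part of the crawler itself:

#include <boost/thread.hpp>
#include <boost/bind.hpp>
#include <cstdio>

// Illustrative worker; the real crawler binds a Spider* to its task() function instead.
static void worker(int id)
{
    std::printf("worker %d running\n", id);
}

int main()
{
    // boost::thread launches the thread in its constructor; join() waits for it to finish.
    boost::thread t1(boost::bind(&worker, 1));
    boost::thread t2(boost::bind(&worker, 2));
    t1.join();
    t2.join();
    return 0;
}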

#ifndef HTTP_CURL_H
#define HTTP_CURL_H
#include <boost/smart_ptr.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread.hpp>
#include <boost/function.hpp>
#include <boost/bind.hpp>
#include <curl/curl.h>
#include <string>
#include <set>
#include <iostream>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
using namespace std;
// boost names are qualified explicitly below (boost::shared_ptr and friends)
// so they cannot clash with the std:: versions under C++11.

#define MAX_BUFFERSIZE 1024*10
#define MAX_THREAD 10
static int fileIndex = 1;              // output-file counter shared by all spider threads

boost::mutex mut;                      // guards urlSet, finishUrlSet and fileIndex
std::set<string> urlSet;               // URLs waiting to be crawled
std::set<string> finishUrlSet;         // URLs that have already been crawled
typedef set<string>::iterator urlSet_Iter;

#define BEGIN_SPIDER(path) {urlSet.insert(path);}
class HttpCurl
{
    public:
        HttpCurl()
        {
            conn = NULL;
        }
        ~HttpCurl()
        {
            curl_easy_cleanup(conn);
        }
        static bool HttpCurlInit()
        {
            urlSet.clear();
            finishUrlSet.clear();
            CURLcode code;
            code = curl_global_init(CURL_GLOBAL_DEFAULT);
            if(CURLE_OK != code)
            {
                printf("Failed to global init default\n");
                return false;
            }
            return true;
        }

        bool InitCurlObject(string& context)
        {
            conn = curl_easy_init();
            if(NULL == conn)
            {
                printf("Failed to create CURL\n");
                return false;
            }
            if(!setWriteFunc())
            {
                printf("Failed to set write\n");
                return false;
            }
            if(!setWriteBuff(context))
            {
                printf("Failed to set buffer\n");
                return false;
            }
            return true;
        }

        bool setWriteFunc()
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
            if(CURLE_OK != code)
            {
                printf("Failed to set write\n");
                return false;
            }
            return true;
        }
        bool setWriteBuff(string& context)
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
            if(CURLE_OK != code)
            {
                printf("Failed to set write data\n");
                return false;
            }
            return true;
        }

        bool setUrl(string& url)
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
            if(CURLE_OK != code)
            {
                printf("Failed to set URL\n");
                return false;
            }
            return true;
        }

        bool getHttpResponse()
        {
            CURLcode code;
            assert(conn);
            code = curl_easy_perform(conn);
            if(CURLE_OK != code)
            {
                printf("Failed to get response\n");
                return false;
            }

            return true;
        }

        // libcurl write callback: appends the received chunk to the std::string
        // whose address was registered via CURLOPT_WRITEDATA.
        static size_t write(char* data,size_t size,size_t nmemb,void* userp)
        {
            size_t sizes = size*nmemb;
            string* context = static_cast<string*>(userp);
            context->append(data,sizes);
            return sizes;
        }
        bool save(const string& context,string filename)
        {
            CURLcode code;
            long retcode = 0;              // CURLINFO_RESPONSE_CODE expects a long*
            code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
            if((CURLE_OK == code) && retcode == 200)
            {
                FILE* file = fopen(filename.c_str(),"w+");
                if(NULL == file)
                    return false;
                fwrite(context.c_str(),1,context.size(),file);
                fclose(file);
                return true;
            }
            return false;
        }
    private:
        CURL* conn;
};

class Spider
{
    public:
        Spider(boost::shared_ptr<HttpCurl>& cul):httpCurl(cul)
        {
            httpCurlUrlSet.clear();
            context.clear();
            initCurl(httpCurl,context);
        }
        ~Spider(){}

        bool initCurl(boost::shared_ptr<HttpCurl>& httpCurl,string& context)
        {
            return httpCurl->InitCurlObject(context);
        }
        void parseUrl(const string& context)
        {
            const string tag = "href";
            const string tag2 = "\"";
            const string tag3 = "http";
            string::size_type tempBegin,tempEnd,iter,httpIter;
            iter = context.find(tag);
            while(iter != string::npos)
            {
                tempBegin = context.find(tag2,iter);
                if(tempBegin == string::npos)
                    break;
                ++tempBegin;
                tempEnd = context.find(tag2,tempBegin);
                if(tempEnd == string::npos)
                    break;
                if(tempEnd > tempBegin)
                {
                    string url;
                    url.assign(context,tempBegin,(tempEnd-tempBegin));
                    httpIter = url.find(tag3);
                    if(httpIter != string::npos)
                        httpCurlUrlSet.insert(url);
                }
                iter = context.find(tag,tempEnd);
            }
            printf("httpCurlUrlSet.size():%lu\n",(unsigned long)httpCurlUrlSet.size());
        }

        bool write(const string& context,const string& filename)
        {
            return httpCurl->save(context,filename);
        }

        void start(string url,string& context)
        {
            char filename[64];
            memset(filename,0,sizeof(filename));
            {
                // fileIndex is shared by every spider thread, so bump it under
                // the same mutex that protects the URL sets.
                boost::unique_lock<boost::mutex> lock(mut);
                sprintf(filename,"%d.html",fileIndex++);
            }

            context.clear();               // drop the previously fetched page
            httpCurl->setUrl(url);
            if(httpCurl->getHttpResponse())
            {
               parseUrl(context);
               write(context,filename);
               insertUrl();
            }
        }
        void insertUrl()
        {
            boost::unique_lock<boost::mutex> lock(mut);
            for( urlSet_Iter iter = httpCurlUrlSet.begin();iter != httpCurlUrlSet.end();++iter)
                urlSet.insert(*iter);
            httpCurlUrlSet.clear();
        }

        void displayUrl()
        {
            urlSet_Iter iter = urlSet.begin();
            for(; iter != urlSet.end();++iter)
            {
                cout<<*iter<<endl;
            }
        }

        string& getContext()
        {
            return context;
        }

        string getUrl()
        {
            urlSet_Iter iter;
            string url;
            boost::unique_lock<boost::mutex> lock(mut);
            for(iter = urlSet.begin();iter != urlSet.end();++iter)
            {
                if(finishUrlSet.find(*iter) != finishUrlSet.end())
                    continue;
                break;
            }
            if(iter != urlSet.end())
            {
                url = *iter;
                urlSet.erase(iter);
                finishUrlSet.insert(url);
                return url;
            }
            return "";
        }

    private:
        boost::shared_ptr<HttpCurl> httpCurl;
        std::set<std::string> httpCurlUrlSet;
        std::string context;

};
static void task(Spider* spider)
{
    assert(spider);
    // Worker loop: keep pulling URLs from the shared queue. It never returns,
    // so the join() calls in main() block for as long as the crawl runs.
    for(;;)
    {
        string url = spider->getUrl();
        if(url != "")
        {
            printf("url=%s\n",url.c_str());
            spider->start(url,spider->getContext());
        }
        else
        {
            usleep(100*1000);              // queue temporarily empty; avoid busy-spinning
        }
    }
}

#endif

Test program:

#include "curlTest.h"

int main()
{
    HttpCurl::HttpCurlInit();
    BEGIN_SPIDER("www.baidu.com");

    boost::shared_ptr<HttpCurl> curl1(new HttpCurl());
    Spider spider1(curl1);
    boost::thread thr1(boost::bind(&task,&spider1));

    boost::shared_ptr<HttpCurl> curl2(new HttpCurl());
    Spider spider2(curl2);
    boost::thread thr2(boost::bind(&task,&spider2));

    thr1.join();
    thr2.join();
    //sleep(100);
    return 0;
}
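
For reference, assuming the header above is saved as curlTest.h next to the test program, this typically builds with something along the lines of g++ main.cpp -o spider -lcurl -lboost_thread -lboost_system (the exact Boost library names depend on how Boost was installed).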

Test output:

url=http://anquan.baidu.com/bbs/thread-10353-1-1.html
httpCurlUrlSet.size():221
url=http://anquan.baidu.com/bbs/thread-10356-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82280&amp;ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82424&amp;ptid=10356
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82455&amp;ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82507&amp;ptid=10353
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82703&amp;ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/thread-10360-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/thread-10365-1-1.html
httpCurlUrlSet.size():224
url=http://anquan.baidu.com/bbs/thread-10454-1-1.html
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82343&amp;ptid=10365
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82848&amp;ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82964&amp;ptid=10454
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=82969&amp;ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=83171&amp;ptid=10454

Summary

This post is mainly a revision of the previous ones: the thread-handling code has been replaced with the boost::thread library. To keep the program self-contained, the full source is posted here for easier reading. The design idea is simple: provide a suitable worker function for each boost::thread. During implementation I first tried the functor form, overloading operator(), but in testing it failed to crawl anything; my guess is that something went wrong with the buffer registered for the downloaded pages. So I switched to a plain function that is explicitly bound to the thread, which works. In short, with time being tight, finding a workable solution took priority over chasing every detail; I will revisit the issue when I have time. That is all for this post, thanks for reading.
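
For reference, the operator() version I tried probably looked roughly like the sketch below (reconstructed as an assumption, not the exact code). A plausible explanation for why it fetched nothing is that boost::thread copies the callable it is given: the copy shares the CURL handle, but CURLOPT_WRITEDATA was registered in the constructor with the address of the original object's context string, so the copy's own context member never receives any data and parseUrl() only ever sees an empty buffer.

// Hypothetical sketch only; names are illustrative.
class SpiderFunctor : public Spider
{
    public:
        explicit SpiderFunctor(boost::shared_ptr<HttpCurl>& cul) : Spider(cul) {}
        void operator()()
        {
            for(;;)
            {
                string url = getUrl();
                if(url != "")
                    start(url,getContext());
            }
        }
};

// boost::thread thr(SpiderFunctor(curl1));  // the functor is copied into the thread;
//                                           // the copy's context buffer is never filled.
// Passing boost::ref(functor), or a pointer as task() does above, avoids the copy.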

If you repost this article, please credit the source. Thanks.
