在之前的幾篇博文中,曾經設計實現了單線程爬蟲和多線程爬蟲,但是自己私下想了想,其實在實現多線程爬蟲時,所有的線程都是自己通過調用底層的API來實現的,這樣的調用一般來說並不是很好,並且又鑑於自己最近學習了下boost::thread相關的東西,於是索性想將之前的那個多線程爬蟲程序改爲boost::thread版的多線程爬蟲,好了,廢話說了這麼多,現在該是設計代碼的時候,代碼如下:
// Simple multi-threaded web crawler built on libcurl + boost::thread.
// NOTE(review): defining non-static globals (mut, urlSet, finishUrlSet) in a
// header violates the ODR if this header is ever included from more than one
// translation unit; kept as-is because the project is currently single-TU.
#ifndef HTTP_CURL_H  // renamed: identifiers with leading double underscore are reserved
#define HTTP_CURL_H
#include <boost/smart_ptr.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread.hpp>
#include <boost/function.hpp>
#include <boost/bind.hpp>
#include <curl/curl.h>
#include <iostream>   // std::cout / std::endl used by Spider::displayUrl
#include <string>
#include <set>
#include <cassert>    // assert() used by getHttpResponse() and task()
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// NOTE(review): using-directives in a header leak into every includer;
// left in place because the rest of the file relies on them.
using namespace std;
using namespace boost;
#define MAX_BUFFERSIZE 1024*10  // currently unused
#define MAX_THREAD 10           // currently unused (two threads are hard-coded in main)
// Next output file number. NOTE(review): incremented from multiple threads in
// Spider::start without synchronization in the original code — data race.
static int fileIndex = 1;
boost::mutex mut;               // guards urlSet / finishUrlSet
std::set<string> urlSet;        // frontier: URLs waiting to be crawled
std::set<string> finishUrlSet;  // URLs already handed out to a crawler thread
typedef set<string>::iterator urlSet_Iter;
// Seed the frontier with an initial URL.
#define BEGIN_SPIDER(path) {urlSet.insert(path);}
class HttpCurl
{
public:
HttpCurl()
{
conn = NULL;
}
~HttpCurl()
{
curl_easy_cleanup(conn);
}
static bool HttpCurlInit()
{
urlSet.clear();
finishUrlSet.clear();
CURLcode code;
code = curl_global_init(CURL_GLOBAL_DEFAULT);
if(CURLE_OK != code)
{
printf("Failed to global init default\n");
return false;
}
return true;
}
bool InitCurlObject(string& context)
{
CURLcode code;
conn = curl_easy_init();
if(NULL == conn)
{
printf("Failed to create CURL\n");
return false;
}
if(!setWriteFunc())
{
printf("Failed to set write\n");
return false;
}
if(!setWriteBuff(context))
{
printf("Failed to set buffer\n");
return false;
}
return true;
}
bool setWriteFunc()
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
if(CURLE_OK != code)
{
printf("Failed to set write\n");
return false;
}
return true;
}
bool setWriteBuff(string& context)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
if(CURLE_OK != code)
{
printf("Failed to set write data\n");
return false;
}
return true;
}
bool setUrl(string& url)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
if(CURLE_OK != code)
{
printf("Failed to set URL\n");
return false;
}
return true;
}
bool getHttpResponse()
{
CURLcode code;
assert(conn);
code = curl_easy_perform(conn);
if(CURLE_OK != code)
{
printf("Failed to get response\n");
return false;
}
return true;
}
static long write(void* data,int size,int nmemb,string& context)
{
long sizes = size*nmemb;
std::string temp((char*)data,sizes);
context += temp;
return sizes;
}
bool save(const string& context,string filename)
{
CURLcode code;
int retcode = 0;
code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
if((CURLE_OK == code)&& retcode ==200)
{
int length = strlen(context.c_str());
FILE* file = fopen(filename.c_str(),"w+");
fseek(file,0,SEEK_SET);
fwrite(context.c_str(),1,length,file);
fclose(file);
return true;
}
return false;
}
private:
CURL* conn;
};
// One crawler worker: owns a per-thread HttpCurl handle and a page buffer,
// extracts href="..." links containing "http", and feeds them back into the
// shared frontier (urlSet) under the global mutex.
class Spider
{
public:
	Spider(shared_ptr<HttpCurl>& cul) : httpCurl(cul)
	{
		httpCurlUrlSet.clear();
		context.clear();
		initCurl(httpCurl, context);
	}
	~Spider() {}
	// Bind this spider's context buffer to the curl handle's write callback.
	bool initCurl(shared_ptr<HttpCurl>& httpCurl, string& context)
	{
		return httpCurl->InitCurlObject(context);
	}
	// Scan `context` for href attributes; keep every quoted value that
	// contains "http" in the thread-local httpCurlUrlSet.
	void parseUrl(const string& context)
	{
		const string tag = "href";
		const string tag2 = "\"";
		const string tag3 = "http";
		string::size_type iter = context.find(tag);
		while(iter != string::npos)
		{
			string::size_type tempBegin = context.find(tag2, iter);
			string::size_type tempEnd = string::npos;
			if(tempBegin != string::npos)
			{
				++tempBegin;  // skip the opening quote
				tempEnd = context.find(tag2, tempBegin);
			}
			if(tempBegin != string::npos && tempEnd != string::npos && tempEnd > tempBegin)
			{
				string url = context.substr(tempBegin, tempEnd - tempBegin);
				if(url.find(tag3) != string::npos)
					httpCurlUrlSet.insert(url);
			}
			// Original bug: when no quote pair was found, the next find()
			// restarted from a stale offset and the loop never advanced.
			if(tempEnd == string::npos)
				break;
			iter = context.find(tag, tempEnd);
		}
		// size() is size_t; %d was undefined behaviour in the original
		printf("httpCurlUrlSet.size():%lu\n", (unsigned long)httpCurlUrlSet.size());
	}
	// Save the fetched page through the curl handle (only written on HTTP 200).
	bool write(const string& context, const string& filename)
	{
		return httpCurl->save(context, filename);
	}
	// Fetch one URL: name the output file, perform the transfer, then
	// parse links, persist the body, and publish new URLs to the frontier.
	void start(string url, string& context)
	{
		char filename[64];
		{
			// fileIndex is shared by all spider threads: the read-modify-write
			// must happen under the global mutex (was a data race).
			boost::unique_lock<boost::mutex> lock(mut);
			snprintf(filename, sizeof(filename), "%d.html", fileIndex++);
		}
		// The WRITEDATA callback appends; without this clear() every fetch
		// accumulated all previous pages and re-parsed their links.
		context.clear();
		httpCurl->setUrl(url);
		if(httpCurl->getHttpResponse())
		{
			parseUrl(context);
			write(context, filename);
			insertUrl();
		}
	}
	// Merge this thread's freshly-found URLs into the shared frontier.
	void insertUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		for(urlSet_Iter iter = httpCurlUrlSet.begin(); iter != httpCurlUrlSet.end(); ++iter)
			urlSet.insert(*iter);
		httpCurlUrlSet.clear();
	}
	// Debug helper: dump the frontier.  Takes the mutex because urlSet is
	// mutated concurrently by the worker threads (was an unguarded read).
	void displayUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		for(urlSet_Iter iter = urlSet.begin(); iter != urlSet.end(); ++iter)
		{
			cout << *iter << endl;
		}
	}
	// Expose the page buffer bound to the curl handle in the constructor.
	string& getContext()
	{
		return context;
	}
	// Pop the first not-yet-crawled URL from the frontier (marking it
	// finished), or return "" when nothing is pending.
	string getUrl()
	{
		boost::unique_lock<boost::mutex> lock(mut);
		urlSet_Iter iter = urlSet.begin();
		for(; iter != urlSet.end(); ++iter)
		{
			if(finishUrlSet.find(*iter) == finishUrlSet.end())
				break;  // first URL we have not crawled yet
		}
		if(iter != urlSet.end())
		{
			string url = *iter;
			urlSet.erase(iter);
			finishUrlSet.insert(url);
			return url;
		}
		return "";
	}
private:
	shared_ptr<HttpCurl> httpCurl;      // per-thread curl handle
	std::set<std::string> httpCurlUrlSet;  // links found in the current page
	std::string context;                // page body buffer (curl writes here)
};
// Thread entry point: repeatedly pull a pending URL from the shared frontier
// and crawl it.  Never returns — the threads run until the process exits.
static void task(Spider* spider)
{
	assert(spider);
	for(;;)
	{
		string url = spider->getUrl();
		if(!url.empty())
		{
			printf("url=%s\n", url.c_str());
			spider->start(url, spider->getContext());
		}
		else
		{
			// Frontier momentarily empty: yield instead of busy-spinning at
			// 100% CPU while other threads are still filling it.
			boost::this_thread::yield();
		}
	}
}
#endif
測試程序:
#include "curlTest.h"
int main()
{
HttpCurl::HttpCurlInit();
BEGIN_SPIDER("www.baidu.com");
shared_ptr<HttpCurl> curl1(new HttpCurl());
Spider spider1(curl1);
boost::thread thr1(boost::bind(&task,&spider1));
shared_ptr<HttpCurl> curl2(new HttpCurl());
Spider spider2(curl2);
boost::thread thr2(boost::bind(&task,&spider2));
thr1.join();
thr2.join();
//sleep(100);
return 0;
}
測試結果:
url=http://anquan.baidu.com/bbs/thread-10353-1-1.html
httpCurlUrlSet.size():221
url=http://anquan.baidu.com/bbs/thread-10356-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82280&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82424&ptid=10356
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82455&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82507&ptid=10353
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82703&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/thread-10360-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/thread-10365-1-1.html
httpCurlUrlSet.size():224
url=http://anquan.baidu.com/bbs/thread-10454-1-1.html
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82343&ptid=10365
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82848&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82964&ptid=10454
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82969&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=83171&ptid=10454
總結
本篇博文主要是針對上幾篇博文的修改,主要是將相關的線程部分改爲了boost::thread庫,爲了保持程序的完整性,將所有的程序貼了出來,方便閱讀,設計思想很簡單,主要是爲boost::thread線程提供相應的處理函數即可,在實現的過程中,開始是想使用重載operator()的形式,但是測試下來發現其爬取不到任何的東西,個人感覺可能是在註冊爬取網頁緩存區出了問題,遂將其實現爲函數的形式,然後再顯式將其註冊給線程,發現可行,總之,找到能夠解決的方案就行,不要太追究其中的部分細節,尤其是在時間很緊的情況下,好了,等有時間再看看這個問題,本博文到此結束,多謝
如果需要,請註明轉載,多謝