最近學深度學習,但是無奈很缺數據,所以就寫了一個網頁爬蟲去爬取圖像,(一個一個手動下載的話太煩了)
#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#define _WINSOCK_DEPRECATED_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#include <winsock2.h>
#include <Windows.h>
#include <string>
#include <iostream>
#include <vector>
#include <process.h>
#include <WinInet.h>
#include <assert.h>
#pragma comment(lib, "Wininet.lib")
#pragma comment(lib, "ws2_32.lib")
using namespace std;
// Fetch the source of a web page into szCode (at most nSize-1 bytes).
// szWeb: URL to fetch; szCode: caller-supplied buffer; nSize: buffer capacity;
// nLen: out-param, number of bytes read (0 on any failure).
// The buffer is always null-terminated on success so callers may use strstr().
void GetWebCode(const char* szWeb, char* szCode, int nSize, int& nLen)
{
	HINTERNET hOpen = NULL, hOpenUrl = NULL;
	nLen = 0;
	hOpen = InternetOpen("Testing", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (hOpen)
	{
		hOpenUrl = InternetOpenUrl(hOpen, szWeb, NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (hOpenUrl)
		{
			Sleep(40);  // brief pause before reading, as in the original flow
			DWORD dwByteRead = 0;
			// Reserve one byte so the result can always be terminated.
			if (InternetReadFile(hOpenUrl, szCode, nSize - 1, &dwByteRead))
			{
				nLen = (int)dwByteRead;
				szCode[nLen] = '\0';  // callers run strstr() on this buffer
			}
		}
	}
	// BUG FIX: the original closed hOpen twice and never closed hOpenUrl,
	// leaking one URL handle per call. Close child handle before its parent.
	if (hOpenUrl)
		InternetCloseHandle(hOpenUrl);
	if (hOpen)
		InternetCloseHandle(hOpen);
}
// Naive substring search over raw byte buffers (the buffers here are not
// guaranteed to be null-terminated, hence explicit sizes).
// Returns the index of the first occurrence of szSub (nSubSize bytes) inside
// szMain (nMainSize bytes) at or after nBegin, or -1 when absent or on
// invalid arguments.
int FindSubstr(const char* szMain, int nMainSize, const char* szSub, int nSubSize, int nBegin = 0)
{
	if (!szMain || !szSub)
		return -1;
	if (nMainSize <= 0 || nSubSize <= 0 || nBegin < 0)
		return -1;
	// BUG FIX: only try positions where a full nSubSize-byte window still
	// fits; the original compared szMain[i + j] past the end of the buffer
	// (out-of-bounds read) and could return nBegin when nBegin > nMainSize.
	for (int i = nBegin; i + nSubSize <= nMainSize; i++)
	{
		bool bMatch = true;
		for (int j = 0; j < nSubSize; j++)
		{
			if (szMain[i + j] != szSub[j])
			{
				bMatch = false;
				break;
			}
		}
		if (bMatch)
			return i;
	}
	return -1;
}
// Parse every href="....html" link out of szBuffer (nLen bytes, not
// necessarily null-terminated) and append the quoted target — including the
// "html" suffix — to cHrefList, logging each hit to stdout.
void GetWebHref(vector<string>& cHrefList, const char* szBuffer, int nLen)
{
	const char* szSub = "href=";
	const char* szSubEnd = "html";
	const int nSubLen = (int)strlen(szSub);
	const int nSubEndLen = (int)strlen(szSubEnd);
	int nRet = 0, nEnd = 0;
	int nIndex = 0;
	do
	{
		nRet = FindSubstr(szBuffer, nLen, szSub, nSubLen, nEnd);
		if (nRet != -1)
		{
			nEnd = FindSubstr(szBuffer, nLen, szSubEnd, nSubEndLen, nRet + nSubLen);
			if (nEnd != -1)
			{
				// Link text spans from just after the opening quote
				// (nRet + nSubLen + 1) through the end of "html";
				// that span is nEnd - nRet - 2 bytes long.
				int nCopy = nEnd - nRet - 2;
				char szTemp[1024] = { 0 };
				// BUG FIX: clamp the copy so a malformed page cannot
				// overflow szTemp, and always null-terminate.
				if (nCopy > 0 && nCopy < (int)sizeof(szTemp))
				{
					strncpy(szTemp, szBuffer + nRet + nSubLen + 1, nCopy);
					szTemp[nCopy] = '\0';
					cout << "[" << nIndex++ << "] -> " << szTemp << endl;
					cHrefList.emplace_back(szTemp);
				}
			}
		}
	} while (nRet != -1 && nEnd != -1);
}
// For every gallery URL in cUrl, walk its numbered sub-pages
// (xxx_1.html, xxx_2.html, ...) and collect the .jpg link that follows the
// "paper-down" marker on each page into cJpg.
void GetWebJpg(vector<string>& cUrl, vector<string>& cJpg)
{
	const int Size = 1024 * 40;
	char szCode[Size] = { 0 };
	const char* szPos = "paper-down";
	const char* szBegin = "href=";
	const char* szEnd = "jpg";
	int nIndex = 0;
	for (auto& it : cUrl)
	{
		// BUG FIX: skip URLs too short to strip ".html" from (underflow)
		// or too long for the local buffers (overflow).
		if (it.size() < 5 || it.size() >= 1000)
			continue;
		char szStem[1024] = { 0 };
		strncpy(szStem, it.c_str(), sizeof(szStem) - 1);
		szStem[it.size() - 5] = '\0';  // strip the ".html" suffix
		// The site never has more than 20 images per gallery.
		for (int i = 1; i < 20; i++)
		{
			char szWeb[1024] = { 0 };
			// BUG FIX: the original sprintf'ed szWeb into itself, which is
			// undefined behaviour for overlapping src/dst buffers.
			sprintf(szWeb, "%s_%d.html", szStem, i);
			// Fetch the page source.
			int nLen = 0;
			GetWebCode(szWeb, szCode, Size, nLen);
			if (!nLen)
				continue;
			// "SORRY" marks the page after the last one — stop this gallery.
			const char* szNullPage = "SORRY";
			// BUG FIX: search only the nLen bytes actually read, not the
			// whole buffer (which may hold stale data from a previous page).
			if (FindSubstr(szCode, nLen, szNullPage, (int)strlen(szNullPage)) != -1)
				break;
			cout << "\t" << szWeb << endl;
			int nPos = FindSubstr(szCode, nLen, szPos, (int)strlen(szPos));
			if (nPos == -1)
				continue;
			// BUG FIX: check each marker before feeding its index into the
			// next search (the original passed -1 through as a start index).
			int nB = FindSubstr(szCode, nLen, szBegin, (int)strlen(szBegin), nPos);
			if (nB == -1)
				continue;
			int nE = FindSubstr(szCode, nLen, szEnd, (int)strlen(szEnd), nB);
			if (nE == -1)
				continue;
			// Same span arithmetic as the original: skip the opening quote
			// and include the "jpg" suffix.
			int nCopy = nE - nB - (int)strlen(szEnd);
			char szTemp[1024] = { 0 };
			if (nCopy > 0 && nCopy < (int)sizeof(szTemp))
			{
				strncpy(szTemp, szCode + nB + strlen(szBegin) + 1, nCopy);
				szTemp[nCopy] = '\0';
				cout << "\t [" << nIndex++ << "] -> " << szTemp << endl;
				cJpg.emplace_back(szTemp);
			}
		}
	}
}
long g_nJpg = 0;// total images downloaded so far; incremented with InterlockedIncrement and used to name output files
void DownLoadJpg(vector<string>& cJpg)
{
HINTERNET hOpen = NULL, hOpenUrl = NULL;
HANDLE hFile = INVALID_HANDLE_VALUE;
DWORD dwRecv = 0, dwSend = 0;
const int Size = 1024 * 40;
char szDownLoad[Size] = "0";
char szPath[1024] = "0";
hOpen = InternetOpen("Testing", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hOpen)
{
for (auto& it : cJpg)
{
hOpenUrl = InternetOpenUrl(hOpen, it.c_str(), NULL, 0, INTERNET_FLAG_RELOAD, 0);
if (hOpenUrl)
{
long index = InterlockedIncrement(&g_nJpg);
sprintf(szPath, "Images\\%d.jpg", index);
hFile = CreateFile(szPath, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
if (hFile != INVALID_HANDLE_VALUE)
{
cout << "download [" << index << "] -> " << it.c_str() << endl;
while (true)
{
Sleep(10);
InternetReadFile(hOpenUrl, szDownLoad, Size, &dwRecv);
if (!dwRecv)
break;
WriteFile(hFile, szDownLoad, dwRecv, &dwSend, NULL);
if (!dwSend)
break;
}
CloseHandle(hFile);
hFile = INVALID_HANDLE_VALUE;
}
}
}
}
if(hOpen)
InternetCloseHandle(hOpen);
if(hOpenUrl)
InternetCloseHandle(hOpenUrl);;
}
// Download all images reachable from one index page.
// Returns 1 on success, 0 when the page is the site's "SORRY" empty page
// (signals the caller to stop paging), -1 on failure (caller may retry).
int JpgDownLoader(const char* szWeb)
{
	// Fetch the page source first.
	const int Size = 1024 * 40;
	int nLen = 0;
	char szCode[Size] = { 0 };
	GetWebCode(szWeb, szCode, Size, nLen);
	if (!nLen)
	{
		cout << "獲取網站源碼失敗" << endl;
		return -1;
	}
	// BUG FIX: terminate the buffer before strstr() below — GetWebCode's
	// contract does not guarantee it for partial reads.
	szCode[(nLen < Size) ? nLen : Size - 1] = '\0';
	// BUG FIX: search only the bytes actually read (was the full Size).
	const char* szNullPage = "SORRY";
	if (FindSubstr(szCode, nLen, szNullPage, (int)strlen(szNullPage)) != -1)
	{
		cout << "空網頁" << endl;
		return 0;
	}
	// Narrow parsing to the region between the "Left_bar" and "pages" markers.
	const char* szBegin = strstr(szCode, "Left_bar");
	const char* szEnd = strstr(szCode, "pages");
	if (!szBegin || !szEnd || szEnd <= szBegin)
	{
		cout << "無法解析出目標區域" << endl;
		return -1;
	}
	// Pull every href="....html" link out of that region.
	vector<string> cDownloadWeb;
	GetWebHref(cDownloadWeb, szBegin, (int)(szEnd - szBegin));
	if (cDownloadWeb.empty())
	{
		cout << "獲取圖像目標失敗" << endl;
		return -1;
	}
	// The first link is the gallery page itself, not an image page.
	cDownloadWeb.erase(cDownloadWeb.begin());
	// Resolve each page to its jpg URL.
	vector<string> cJpgList;
	GetWebJpg(cDownloadWeb, cJpgList);
	if (cJpgList.empty())
	{
		cout << "獲取圖像地址失敗" << endl;
		return -1;
	}
	// Download the images.
	DownLoadJpg(cJpgList);
	return 1;  // FIX: was `return true;` from an int-returning function
}
// Interactive driver: reads the root page URL from stdin, strips its ".html"
// suffix and downloads the numbered index pages one by one.
void Downloading()
{
	// Folder that DownLoadJpg() saves into.
	CreateDirectory("Images", NULL);
	// Read the root URL.
	char szBuffer[1024];
	cin.getline(szBuffer, 1024);
	int nLen = (int)strlen(szBuffer);
	// BUG FIX: the original wrote to szBuffer[nLen - 5] unconditionally,
	// indexing before the buffer for inputs shorter than 5 characters.
	if (nLen < 5)
	{
		cout << "輸入的網址太短" << endl;
		return;
	}
	szBuffer[nLen - 5] = '\0';  // strip the ".html" suffix
	int nCount = 10;  // retry budget shared across all pages
	for (int i = 1; i < 10; i++)
	{
		char szTemp[1024];
		sprintf(szTemp, "%s_%d.html", szBuffer, i);
		cout << "[" << i << "] Begin Download -> " << szTemp << endl;
		// Start the download for this index page.
		int nRet = JpgDownLoader(szTemp);
		if (nRet == 0)  // empty page: no more index pages exist
			break;
		else if (nRet == -1 && --nCount >= 0)
			i--;  // transient failure: retry the same page
	}
	cout << "完成下載任務,下載圖像數量爲" << g_nJpg << "...." << endl;
}
// Console entry point: announce where images will be saved, prompt for the
// root page URL, then run the interactive downloader.
int main(int argc, char* argv[])
{
	cout << "圖片下載器" << endl
	     << "圖像將保存到" << argv[0] << "\\Images" << endl
	     << "請輸入根網頁路徑:" << endl;
	Downloading();
	getchar();  // keep the console window open until a key is pressed
	return 0;
}
/************************************************************************
美桌網(http://www.win4000.com/)圖片下載程序
Time:2019-11-30
By fyh
************************************************************************/
#include <Windows.h>
#include <WinInet.h>
#include <time.h>
#pragma comment(lib, "Wininet.lib")
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <future>
using namespace std;
char g_dir[1024];     // output folder name, set once in begin_to_work()
long g_jpg_index = 0; // running image counter; bumped with InterlockedIncrement, used for file names
// Download the full source of a web page; returns an empty string on failure.
string get_web_code(const char* web)
{
	// Use the current timestamp as the user-agent string.
	char agent_name[1024]{ 0 };
	_itoa_s(static_cast<int>(time(NULL)), agent_name, 10);
	HINTERNET internet_open = NULL;
	HINTERNET internet_url = NULL;
	string web_code;
	internet_open = InternetOpenA(agent_name, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (internet_open)
	{
		internet_url = InternetOpenUrlA(internet_open, web, NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (internet_url)
		{
			DWORD read_byte = 0;
			const int max_size = 1024 * 10;
			char code_buffer[max_size];
			// Keep reading until the whole page has been received.
			while (true)
			{
				if (InternetReadFile(internet_url, code_buffer, max_size, &read_byte))
				{
					if (read_byte == 0)
						break;
					// BUG FIX: append exactly read_byte bytes; the original
					// appended the buffer as a C string, which overreads
					// whenever a chunk fills the buffer completely (no
					// terminator) and truncates at embedded null bytes.
					web_code.append(code_buffer, read_byte);
				}
				else
				{
					// A failed read invalidates what we have so far.
					web_code.clear();
					break;
				}
				Sleep(10);
			}
		}
	}
	// BUG FIX: close the URL handle before its parent session handle
	// (the original closed the session first).
	if (internet_url)
		InternetCloseHandle(internet_url);
	if (internet_open)
		InternetCloseHandle(internet_open);
	return web_code;
}
// Report whether the fetched source is the site's "404" error page.
bool is_error_404_page(string& web_code)
{
	// The site marks missing pages with this literal banner text.
	return web_code.find("SORRY 404") != string::npos;
}
// Build the list of index-page URLs (web_1.html .. web_num.html), probing
// each one and stopping early at the first 404 page.
vector<string> get_web_list(const char* web, int num = 10)
{
	vector<string> web_list;
	const int web_len = (int)strlen(web);
	// BUG FIX: need at least the ".html" suffix we are about to strip
	// (the original indexed web_temp[web_len - 5] unconditionally), and the
	// URL must fit in the local buffer.
	if (web_len < 5 || web_len >= 1024)
		return web_list;
	char web_temp[1024];
	strncpy_s(web_temp, web, web_len);
	web_temp[web_len - 5] = '\0';  // drop ".html"
	for (int i = 1; i <= num; i++)
	{
		char temp[1024];
		sprintf_s(temp, "%s_%d.html", web_temp, i);
		string web_code = get_web_code(temp);
		if (is_error_404_page(web_code))
			break;
		web_list.push_back(temp);
	}
	return web_list;
}
// Locate the region of the page source to parse for links.
// beginpos/endpos override the default "Left_bar"/"pages" markers when
// non-null. On a miss the out-params receive -1 (which, after the integer
// conversion in the caller's comparison, still matches string::npos as the
// original truncated value did).
void get_block_pos(const string& web_code, const char* beginpos, const char* endpos, int& begin, int& end)
{
	// FIX: take the page by const reference (the original copied the whole
	// source string) and make the npos -> -1 mapping explicit instead of
	// relying on implementation-defined size_t -> int truncation.
	const string begin_marker = beginpos ? beginpos : "Left_bar";
	const string end_marker = endpos ? endpos : "pages";
	const size_t b = web_code.find(begin_marker);
	begin = (b == string::npos) ? -1 : static_cast<int>(b);
	// As in the original, the end marker is searched from the begin marker;
	// when begin is missing the end search cannot succeed either.
	const size_t e = (b == string::npos) ? string::npos : web_code.find(end_marker, b);
	end = (e == string::npos) ? -1 : static_cast<int>(e);
}
// Extract every href="....html" target (including the "html" suffix) that
// lies between begin_index and end_index in the page source.
vector<string> get_target_list(string& web_code, int begin_index, int end_index)
{
	const string href = "href=";
	const string over = "html";
	vector<string> target_list;
	// FIX: keep find() results in size_t; the original stored them in int
	// and relied on implementation-defined npos truncation for the
	// not-found comparisons.
	size_t search_from = static_cast<size_t>(begin_index);
	const size_t limit = static_cast<size_t>(end_index);
	while (true)
	{
		const size_t pos1 = web_code.find(href, search_from);
		if (pos1 == string::npos)
			break;
		const size_t pos2 = web_code.find(over, pos1);
		if (pos2 == string::npos)
			break;
		// Stop once either marker leaves the requested region.
		if (pos1 >= limit || pos2 >= limit)
			break;
		// Skip the opening quote after "href=" and include "html".
		const size_t start = pos1 + href.size() + 1;
		target_list.push_back(web_code.substr(start, pos2 + over.size() - start));
		search_from = pos2;
	}
	return target_list;
}
// Download one jpg to "<g_dir>//<index>.jpg", where index comes from the
// shared g_jpg_index counter (thread-safe via InterlockedIncrement — this
// function runs on the async workers started by begin_to_work()).
void download_jpg(string& web_jpg)
{
	// Timestamp-based user-agent, as elsewhere in this program.
	char agent_name[1024]{ 0 };
	_itoa_s(static_cast<int>(time(NULL)), agent_name, 10);
	HINTERNET internet_open = NULL;
	HINTERNET internet_url = NULL;
	HANDLE jpg_file = INVALID_HANDLE_VALUE;
	internet_open = InternetOpenA(agent_name, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	if (internet_open)
	{
		internet_url = InternetOpenUrlA(internet_open, web_jpg.c_str(), NULL, 0, INTERNET_FLAG_RELOAD, 0);
		if (internet_url)
		{
			char jpg_temp[1024];
			sprintf_s(jpg_temp, "%s//%d.jpg", g_dir, InterlockedIncrement(&g_jpg_index));
			jpg_file = CreateFile(jpg_temp, GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
			if (jpg_file != INVALID_HANDLE_VALUE)
			{
				DWORD read_byte = 0;
				DWORD write_byte = 0;
				const int max_size = 1024 * 10;
				char code_buffer[max_size];
				// Stream the image to disk chunk by chunk.
				while (true)
				{
					// BUG FIX: the original only broke out of this loop on a
					// successful zero-byte read, so a failing
					// InternetReadFile spun forever. Treat failure like EOF.
					if (!InternetReadFile(internet_url, code_buffer, max_size, &read_byte))
						break;
					if (read_byte == 0)
						break;
					// BUG FIX: stop on a failed write instead of looping on.
					if (!WriteFile(jpg_file, code_buffer, read_byte, &write_byte, NULL))
						break;
					Sleep(20);
				}
			}
		}
	}
	// Release resources in reverse order of acquisition.
	if (jpg_file != INVALID_HANDLE_VALUE)
		CloseHandle(jpg_file);
	if (internet_url)
		InternetCloseHandle(internet_url);
	if (internet_open)
		InternetCloseHandle(internet_open);
}
// Walk the numbered photo pages of one gallery (web_1.html, web_2.html, ...)
// and download the "data-original=....jpg" image found after the
// "pic-large" marker on each page.
void target_download(string& web)
{
	// BUG FIX: need at least the ".html" suffix we are about to strip
	// (the original indexed web_temp[web.size() - 5] unconditionally), and
	// the URL must fit in the local buffer.
	if (web.size() < 5 || web.size() >= 1024)
		return;
	const int num = 20;  // galleries never exceed 20 photos
	char web_temp[1024];
	strncpy_s(web_temp, web.c_str(), web.size());
	web_temp[web.size() - 5] = '\0';  // drop ".html"
	for (int i = 1; i < num; i++)
	{
		char temp[1024];
		sprintf_s(temp, "%s_%d.html", web_temp, i);
		string web_code = get_web_code(temp);
		if (is_error_404_page(web_code))
			break;
		const char* target = "data-original=";
		const char* target_end = "jpg";
		// FIX: keep find() results in size_t instead of truncating npos
		// into an int (implementation-defined behaviour in the original).
		const size_t pos = web_code.find("pic-large");
		if (pos == string::npos) continue;
		const size_t begin_index = web_code.find(target, pos);
		if (begin_index == string::npos) continue;
		const size_t end_index = web_code.find(target_end, begin_index);
		if (end_index == string::npos) continue;
		// Skip the opening quote after the marker; include the "jpg" suffix.
		const size_t start = begin_index + strlen(target) + 1;
		string target_web = web_code.substr(start, end_index + strlen(target_end) - start);
		cout << "\t開始下載圖片 -> " << target_web << endl;
		download_jpg(target_web);
	}
}
//分析網頁
bool analise_web(string web,const char* beginpos,const char* endpos)
{
//獲取網址源代碼
string web_code = get_web_code(web.c_str());
if (web_code.empty())
{
cout << "讀取網頁源代碼失敗 -> " << web << endl;
return false;
}
//判斷是不是404錯誤頁面
if (is_error_404_page(web_code))
{
cout << "當前頁面爲錯誤頁面 -> " << web << endl;
return true;
}
//獲取目標區域
int begin_index = 0;
int end_index = 0;
get_block_pos(web_code, beginpos, endpos, begin_index, end_index);
if (begin_index == string::npos || end_index == string::npos)
{
cout << "解析目標區域失敗 -> " << web << endl;
return false;
}
//獲取目標列表
vector<string> targer_list = get_target_list(web_code, begin_index, end_index);
if (targer_list.empty())
{
cout << "獲取目標連接失敗 -> " << web << endl;
return false;
}
//刪除第一個網頁
targer_list.erase(targer_list.begin());
//對每一個目標網頁進行下載
for (auto& it : targer_list)
{
target_download(it);
}
return false;
}
//開始工作
bool begin_to_work(const char* foler, const char* web, const char* beginpos = nullptr, const char* endpos = nullptr)
{
cout << "美桌網(http://www.win4000.com/)圖片下載程序" << endl;
//創建文件夾保存下載的圖片
strncpy_s(g_dir, foler, strlen(foler));
CreateDirectoryA(foler, NULL);
//獲取網址列表
vector<string> web_list = get_web_list(web);
//analise_web(web_list[0], beginpos, endpos);
vector<shared_future<bool>> statu;
for (int i = 0; i < web_list.size(); i++)
{
shared_future<bool> ret = std::async(launch::async, analise_web, move(web_list[i]), beginpos, endpos);
statu.push_back(ret);
}
for (auto& it : statu)
it.get();
cout << "完成圖片下載,成功下載圖片數量爲:" << g_jpg_index << endl;
return true;
}
// Print command-line usage for the downloader.
void showhelp()
{
	static const char* const usage[] = {
		"圖片下載程序用法簡介:",
		"參數1[必須]\t保存圖片的文件夾名",
		"參數2[必須]\t目標網址",
		"參數3[非必須]\t目標區域開始標識",
		"參數4[非必須]\t目標區域結束標識",
	};
	for (const char* line : usage)
		cout << line << endl;
}
// Entry point: argv[1] = output folder, argv[2] = root URL.
int main(int argc, char* argv[], char* env[])
{
	if (argc < 3)
	{
		// Not enough arguments — explain the usage and exit.
		showhelp();
		return 0;
	}
	begin_to_work(argv[1], argv[2]);
	return 0;
}
代碼有點low,但是能爬取圖像就行,代碼對網頁有針對性,所以要爬取不同網頁的話需要自己根據指定網頁修改代碼