About the asyncio error "ValueError: too many file descriptors in select()"

I recently wrote a crawler in the asyncio + aiohttp style; the code was as follows:

from bs4 import BeautifulSoup
import aiohttp
import asyncio

headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }

async def ss(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url,headers=headers) as resp:
            print(resp.status)
            d = (await resp.text("utf-8","ignore"))
            cc(d)

def cc(v):
    print(v)
    soup = BeautifulSoup(v, "lxml")
    contents = soup.select("div.content")
    for conten in contents:
        articleAuthor = conten.select("div.blog_info > a")
        if articleAuthor:
            # print(articleAuthor)
            articleAuthor = articleAuthor[0]
        else:
            articleAuthor = ""
        print(articleAuthor)

loop = asyncio.get_event_loop()
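# each URL becomes its own task; gather() runs them all at once, so with
# thousands of URLs, thousands of sockets are opened at the same time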
tasks = [ss(url) for url in ["http://www.iteye.com/blogs/tag/java?page="+str(x) for x in range(1,2)] ]
loop.run_until_complete(asyncio.gather(*tasks))

At first glance the code looks fine, and it even runs fine. But once you scale the URL list up to a few thousand, it fails with ValueError: too many file descriptors in select().

Why does this happen?

Because asyncio's default event loop is built on top of select() on some platforms (most notably Windows), and select() can only watch a limited number of file descriptors, roughly 512 in CPython on Windows and 1024 on Linux. The code above turns every URL into a task, dumps them all into one huge list, and hands the whole thing to gather() in one shot, so every request tries to open its own socket at the same moment and the descriptor limit is exceeded. Written this way, the crawler cannot scale to a large number of URLs.
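
(The root cause really is the number of sockets open at the same time: if you cap how many requests are in flight, for example with an asyncio.Semaphore as in the illustrative sketch below, you stay under the limit. This is not the approach used in the rest of this post; the limit of 100 and the names fetch/main are made up for the example, and headers is the dict defined above.)

import aiohttp
import asyncio

async def fetch(session, sem, url):
    # the semaphore is acquired before the request is made, so at most
    # 100 sockets are ever open at the same time
    async with sem:
        async with session.get(url, headers=headers) as resp:
            print(resp.status)
            return await resp.text("utf-8", "ignore")

async def main(urls):
    sem = asyncio.Semaphore(100)   # illustrative cap on concurrency
    # one shared session reuses a single connection pool for all requests
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, sem, u) for u in urls))

loop = asyncio.get_event_loop()
pages = loop.run_until_complete(
    main(["http://www.iteye.com/blogs/tag/java?page=" + str(x) for x in range(1, 1000)]))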

So what do we do?

Use callbacks.

The code looks like this:

from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time

urlss=[]
headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }

async def ss(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url,headers=headers) as resp:
            print(resp.status)
            return await resp.text("utf-8","ignore")

def cc(v):
    print("ssssssss")
    # print(v.result())
    # result() returns the page text fetched by the coroutine
    soup = BeautifulSoup(v.result(), "lxml")
    contents = soup.select("div.content")
    for conten in contents:
        articleUrl = conten.select("h3 > a")
        if articleUrl:
            articleUrl = articleUrl[0].get("href")
            urlss.append(articleUrl)


def cc2(v):
    print("ssssssss222222222222")
    # print(v.result())
    # result() returns the page text fetched by the coroutine
    soup = BeautifulSoup(v.result(), "lxml")
    articleImages_list = soup.select("img")
    if articleImages_list:
        articleImages_list = articleImages_list[0].get("src")
    else:
        articleImages_list = []
    print(articleImages_list)

now = lambda: time.time()
start = now()
loop = asyncio.get_event_loop()

# url = "http://www.iteye.com/blogs/tag/java?page=1"
for url in ["http://www.iteye.com/blogs/tag/java?page="+str(x) for x in range(1,2)]:
    coroutine = ss(url)
    # wrap the coroutine in a Task and schedule it
    task = asyncio.ensure_future(coroutine)
    # attach the callback to run when the task completes
    task.add_done_callback(cc)
    # run the event loop until this single task has finished
    loop.run_until_complete(task)
    
for url in urlss:
    coroutine = ss(url)
    task = asyncio.ensure_future(coroutine)
    task.add_done_callback(cc2)
    loop.run_until_complete(task)

print('TIME: ', now() - start)
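
One thing to notice: because loop.run_until_complete(task) is called inside the for loop, each request finishes before the next one starts, so only one socket is ever open at a time. That is why the error disappears, but it also makes the crawl essentially sequential. A rough middle ground, sketched below under the assumption that ss and cc are the functions defined above (BATCH_SIZE and the variable names are illustrative), is to keep the callback style but submit the URLs in fixed-size batches, so that at most BATCH_SIZE sockets are open at once:

BATCH_SIZE = 100   # illustrative cap on how many sockets are open at once

loop = asyncio.get_event_loop()
page_urls = ["http://www.iteye.com/blogs/tag/java?page=" + str(x) for x in range(1, 1000)]

for i in range(0, len(page_urls), BATCH_SIZE):
    batch = []
    for url in page_urls[i:i + BATCH_SIZE]:
        task = asyncio.ensure_future(ss(url))
        task.add_done_callback(cc)   # same callback as above
        batch.append(task)
    # each batch is awaited before the next one starts, so at most
    # BATCH_SIZE requests are in flight at any moment
    loop.run_until_complete(asyncio.gather(*batch))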


