Python 实现百度网页采集，增加多线程处理，同时对百度返回的内容进行分类统计

原創

2023-06-08 11:59

import asyncio
import threading
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from itertools import cycle

import aiohttp

# Module-level tally of classification results: maps a category label to the
# number of fetched pages assigned to it. Updated by main().
categories: Counter = Counter()

# Assign a category label to a piece of text.
def classify(text):
    """Return the category label for ``text``.

    Classification is a plain case-sensitive substring search. Keywords are
    tried in a fixed order and the first one found wins, so a text that
    mentions several languages is labelled with the earliest keyword in the
    list. Texts containing none of the keywords are labelled ``"Other"``.

    Any classification technique (regular expressions, a trained model, ...)
    could be substituted here; substring matching keeps the example simple.
    """
    for keyword in ("Python", "Java", "C++"):
        if keyword in text:
            return keyword
    return "Other"

async def fetch_page(url, proxy):
    """Download ``url`` through a proxy and return the response body as text.

    Args:
        url: Address of the page to fetch.
        proxy: Proxy address, either a full URL (``http://host:port``) or a
            bare ``host:port`` pair. A bare pair is treated as an HTTP proxy.
            A falsy value is handed to aiohttp unchanged.

    Returns:
        The decoded text of the response body.
    """
    # aiohttp needs a proxy URL with a scheme, while proxy lists commonly
    # store bare "host:port" entries; normalise those to an HTTP proxy URL.
    if proxy and "://" not in proxy:
        proxy = "http://" + proxy

    async with aiohttp.ClientSession() as session:
        # The proxy is supplied per request: aiohttp releases whose
        # ClientSession constructor has no ``proxy`` argument reject it
        # there, whereas the per-request form is the documented one.
        async with session.get(url, proxy=proxy) as response:
            return await response.text()

async def main():
    """Fetch ten Baidu search pages through proxies and tally their categories.

    Steps:
      1. Build ten search URLs (queries ``0`` .. ``9``).
      2. Read proxy addresses from ``16yun.txt``, one ``host:port`` per line
         (for example ``www.16yun.cn:3333``); blank lines are ignored.
      3. Fetch all URLs concurrently, pairing each URL with a proxy.
      4. Print the first 100 characters of every page.
      5. Classify the pages on a thread pool and add the counts to the
         module-level ``categories`` counter, then print it.

    Raises:
        OSError: If ``16yun.txt`` cannot be opened.
        Exception: Any error raised by a fetch propagates out of
            ``asyncio.gather``.
    """
    # Ten Baidu search URLs.
    urls = ["https://www.baidu.com/s?wd=" + str(i) for i in range(10)]

    # Load the proxy list, skipping blank lines so that no request is issued
    # with an empty proxy string.
    with open("16yun.txt", encoding="utf-8") as f:
        proxies = [entry for entry in (line.strip() for line in f) if entry]

    # Pair each URL with a proxy. Cycling through the proxies keeps every URL
    # even when the file lists fewer proxies than there are URLs; a plain
    # zip() would silently drop the surplus URLs. With an empty proxy list
    # no task is created at all.
    tasks = [
        asyncio.create_task(fetch_page(url, proxy))
        for url, proxy in zip(urls, cycle(proxies))
    ]

    # Run every fetch concurrently and collect the page texts in URL order.
    results = await asyncio.gather(*tasks)

    for result in results:
        print(result[:100])  # Preview: first 100 characters of each page.

    # ThreadPoolExecutor lives in concurrent.futures, not in threading.
    # map() hands all pages to the pool at once instead of waiting for each
    # result before submitting the next, and the ``with`` block shuts the
    # pool down (waiting for outstanding work) even if classification raises.
    with ThreadPoolExecutor(max_workers=4) as pool:
        categories.update(pool.map(classify, results))

    # Final tally.
    print(categories)

# Run the main coroutine only when this file is executed as a script, so that
# importing the module does not start the crawl.
if __name__ == "__main__":
    asyncio.run(main())

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Python 实现百度网页采集，增加多线程处理，同时对百度返回的内容进行分类统计

win11关闭自动检测病毒删文件

打開神經網絡的黑箱子

深圳IO 第8關-仿真蜂鳴器

深圳IO 第9關-無線遊戲控制器

動詞算子式通用代碼生成器的根本原理，動詞算子和域對象的笛卡爾積

通用代碼生成器簡介

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結