Python多線程爬蟲：爬取紅盾網

原創

2020-06-20 03:12

import requests
import time
import csv
import pymongo
from lxml import etree
from multiprocessing import Pool

"""
本項目只用於學習，不用於獲取倒賣任何信息
獲取網頁所有地區的鏈接詳情頁信息
Windows10
Pycharm2018
xpath
多線程
數據庫
"""


class RedDum(object):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'}

    """獲取頁面詳情頁鏈接"""

    def get_pages(self, url):
        link_list = []
        response = requests.get(url=url, headers=self.headers)
        items = etree.HTML(response.text)
        hrefs = items.xpath('//*[@class="search-detail"]')
        for href in hrefs[4:]:
            links = href.xpath('./a/@href')
            for link in links:
                link_list.append(link)
        return link_list


    """解析詳情頁內容"""

    def paser_detail(self, url):
        response = requests.get(url=url, headers=self.headers)
        time.sleep(1)
        items = etree.HTML(response.text)
        titles = items.xpath('//a[@class="name"]')
        for titl in titles:  # 地區
            title = titl.xpath('./text()')[0]
        lis = items.xpath('//*[@id="list-container"]/ul/li')
        for li in lis:
            daima = li.xpath('./div/p[1]/a/span[1]/text()')[0]  # 代碼
            person = li.xpath('./div/p[1]/a/span[2]/text()')[0]  # 法人
            address = li.xpath('./div/p[2]/a/span/text()')[0]  # 地址
            data = [title, daima, person, address]
            collection = {
                '地區': title,
                '代碼': daima,
                '法人': person,
                '地址': address
            }
            self.mongodb(collection)
            self.save_details('\n'.join(data).replace(',', '').strip())
            print(title, daima, person, address)

    """多線程"""
    def threads_crawl(self):
        link = self.get_pages('https://www.ubaike.cn/')
        pool = Pool()
        process = pool.map(self.paser_detail, link)
        pool.join()
        pool.close()


    def save_details(self, data):
        with open('hongdun1.csv', 'a', encoding='utf-8')as f:
            write = csv.writer(f)
            write.writerow(data)

    def mongodb(self, collection):
        clinet = pymongo.MongoClient('localhost', 27017)
        db = clinet.hongdun
        table = db.hongdun_table
        table.insert_many(collection)

if __name__ == '__main__':
    k = RedDum()
    k.threads_crawl()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Python多線程爬蟲：爬取紅盾網

pyinstaller打包程序這麼簡單

DIV+CSS佈局（進階篇）

DIV+CSS層疊式佈局（綜合篇）

docker 小白入門

Scrapy ：全站爬取文學文章

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結