利用 Python 爬取博客信息並保存在 Excel 中

只放代碼

import re,json,requests
from lxml import etree
import xlwt

# Scrape pages 1-5 of the blog's article list and store one row per article
# (type, title, date, read count) in an .xls workbook.
#
# The helper class and its imports are defined once, before the page loop,
# instead of being re-created on every iteration.
import os

import xlrd
from xlutils.copy import copy


class ExcelUtils(object):
    """Stateless helpers for creating and appending to .xls workbooks."""

    @staticmethod
    def write_to_excel(filename, sheetname, word_list):
        """Create a new workbook at ``filename`` containing ``word_list``.

        The keys of the first item become the header row; every item is
        looked up with those same keys.

        :param filename: path of the workbook to create
        :param sheetname: name of the single sheet that is added
        :param word_list: list of dicts, one per row (must not be empty)
        :return: None; success or failure is reported on stdout
        """
        try:
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet(sheetname)
            # Header row comes from the first item's keys.
            head = list(word_list[0].keys())
            for col, key in enumerate(head):
                sheet.write(0, col, key)
            # Data rows start right below the header.
            for row, item in enumerate(word_list, start=1):
                for col, key in enumerate(head):
                    sheet.write(row, col, item[key])
            workbook.save(filename)
            print('寫入excle成功!')
        except Exception as e:
            # Best-effort: report the problem and keep the script running.
            print(e)
            print('寫入失敗!')

    @staticmethod
    def write_to_excel_append(filename, infos):
        """Append ``infos`` below the existing rows of the first sheet.

        The existing header row (row 0) decides which key of each item goes
        into which column.

        :param filename: path of an existing workbook
        :param infos: list of dicts, one per row to append
        :return: None
        """
        work_book = xlrd.open_workbook(filename)
        sheets = work_book.sheet_names()
        work_sheet = work_book.sheet_by_name(sheets[0])
        # Number of rows already written; new rows start at this index.
        old_rows = work_sheet.nrows
        keys = work_sheet.row_values(0)
        print('===================',keys)
        # xlrd workbooks are read-only; copy into an xlwt workbook to write.
        new_work_book = copy(work_book)
        new_sheet = new_work_book.get_sheet(0)
        for row, item in enumerate(infos, start=old_rows):
            for col, key in enumerate(keys):
                new_sheet.write(row, col, item[key])

        new_work_book.save(filename)
        print('追加成功!')


filename = 'C:/爬取數據.xls'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}

# Pagination: only the trailing page number of the URL changes.
for i in range(1, 6):
    base_url = 'https://blog.csdn.net/qq_42374697/article/list/%s' % (i)
    try:
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(base_url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(e)
        continue
    html = etree.HTML(response.text)
    div_list = html.xpath('//div[@class="article-item-box csdn-tracking-statistics"]')

    info_list = []
    for div in div_list:
        try:
            type_ = div.xpath('./h4/a/span/text()')[0]
            # The second text node of the link is used as the title.
            title = div.xpath('./h4/a/text()')[1].strip()
            date = div.xpath('.//span[@class="date"]/text()')[0].strip()
            read_num = div.xpath('.//span[@class="read-num"]/text()')[0]
        except IndexError:
            # Skip an article that lacks one of the expected nodes instead of
            # aborting the whole run.
            continue

        info_list.append({
            '文章類型': type_,
            '標題': title,
            '日期': date,
            '閱讀量': read_num,
        })

    if not info_list:
        # Nothing scraped from this page, so there is nothing to write.
        continue

    if os.path.exists(filename):
        # The workbook already exists: append to it.
        ExcelUtils.write_to_excel_append(filename, info_list)
    else:
        # First page written: create the workbook.
        ExcelUtils.write_to_excel(filename, 'sheet', info_list)

在這裏插入圖片描述
在這裏插入圖片描述

多線程

from selenium import webdriver
from lxml import etree
import threading
import os
from queue import Queue
import xlwt
import xlrd
from xlutils.copy import copy

class ExcelUtils(object):
    """Stateless helpers for writing lists of dicts to .xls workbooks.

    Both helpers are static methods, so they are called directly on the
    class (``ExcelUtils.write_to_excel(...)``) without creating an instance.
    """

    @staticmethod
    def write_to_excel(filename,sheetname,word_list):
        """Create a new workbook and fill it with ``word_list``.

        The keys of the first item form the header row; each item is then
        written as one row using those keys as column order.

        :param filename: path the workbook is saved to
        :param sheetname: name of the sheet that is added
        :param word_list: list of dicts, one per row
        :return: None; the outcome is printed to stdout
        """
        try:
            book = xlwt.Workbook(encoding='utf-8')
            table = book.add_sheet(sheetname)

            # Column order is taken from the first item.
            columns = list(word_list[0].keys())
            for col_idx, column in enumerate(columns):
                table.write(0, col_idx, column)

            # Row 0 holds the header, so data starts at row 1.
            for row_idx, record in enumerate(word_list, start=1):
                for col_idx, column in enumerate(columns):
                    table.write(row_idx, col_idx, record[column])

            book.save(filename)
            print('寫入excle成功!')
        except Exception as e:
            print(e)
            print('寫入失敗!')

    @staticmethod
    def write_to_excel_append(filename,infos):
        """Append ``infos`` after the rows already present in the workbook.

        The header row of the first sheet determines which key of each item
        is written to which column.

        :param filename: path of the existing workbook
        :param infos: list of dicts, one per row to append
        :return: None
        """
        # Read side: open the workbook and inspect its first sheet.
        source_book = xlrd.open_workbook(filename)
        first_sheet_name = source_book.sheet_names()[0]
        source_sheet = source_book.sheet_by_name(first_sheet_name)
        next_row = source_sheet.nrows
        keys = source_sheet.row_values(0)
        print('===================',keys)

        # Write side: convert the xlrd workbook into a writable copy.
        target_book = copy(source_book)
        target_sheet = target_book.get_sheet(0)
        for row_idx, record in enumerate(infos, start=next_row):
            for col_idx, key in enumerate(keys):
                target_sheet.write(row_idx, col_idx, record[key])

        target_book.save(filename)
        print('追加成功!')


class SPIDER(threading.Thread):
    """Worker thread that scrapes article-list pages taken from a shared queue.

    Each worker repeatedly takes a page number from ``queue_page``, renders
    the page with a browser driver, extracts one record per article and
    stores the records in the workbook at ``filename``.
    """

    # Shared by all workers. The exists-check followed by the read/copy/save
    # of the workbook is not atomic, so concurrent workers must be serialised
    # or rows written by one worker can be overwritten by another.
    _file_lock = threading.Lock()

    def __init__(self,url,queue_page,name,filename):
        """Store the worker's configuration.

        :param url: URL template with one ``{}`` placeholder for the page number
        :param queue_page: queue of page numbers shared between workers
        :param name: thread name
        :param filename: path of the workbook results are written to
        """
        super().__init__()  # initialise the underlying Thread
        self.url = url
        self.queue_page = queue_page
        self.name = name
        self.filename = filename

    def run(self):
        """Process page numbers until the queue is drained."""
        from queue import Empty

        while True:
            # A non-blocking get replaces "check empty(), then get()": with
            # several workers another thread can take the last item between
            # the check and the get, leaving this thread blocked forever.
            try:
                page = self.queue_page.get_nowait()
            except Empty:
                break
            self.parse_page(page)

    def parse_page(self,page):
        """Fetch one list page, extract its articles and save them.

        :param page: page number substituted into the URL template
        :return: None
        """
        # NOTE(review): PhantomJS support is deprecated in Selenium; confirm
        # the installed Selenium version still provides it.
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.url.format(page))
            html = etree.HTML(driver.page_source)
        finally:
            # Always shut the browser down so no driver process is leaked.
            driver.quit()

        info_list = []
        ii_list = html.xpath('//div[@class="article-item-box csdn-tracking-statistics"]')
        for ii in ii_list:
            try:
                type_ = ii.xpath('./h4/a/span/text()')[0]
                # The second text node of the link is used as the title.
                title = ii.xpath('./h4/a/text()')[1].strip()
                date = ii.xpath('.//span[@class="date"]/text()')[0].strip()
                read_num = ii.xpath('.//span[@class="read-num"]/text()')[0]
            except IndexError:
                # An article lacking one of the expected nodes is skipped.
                continue

            info_list.append({
                '文章類型': type_,
                '標題': title,
                '日期': date,
                '閱讀量': read_num,
            })

        if not info_list:
            # Nothing was extracted, so there is nothing to write.
            return

        with self._file_lock:
            if os.path.exists(self.filename):
                # The workbook already exists: append to it.
                ExcelUtils.write_to_excel_append(self.filename,info_list)
            else:
                # First write: create the workbook.
                ExcelUtils.write_to_excel(self.filename,'sheet1',info_list)
    

if __name__ == '__main__':

    # URL template; the placeholder receives the page number.
    base_url = 'https://blog.csdn.net/qq_42374697/article/list/{}'

    # Step 1: create the task queue and enqueue page numbers 1 and 2.
    queue_page = Queue()
    for page_no in range(1, 3):
        queue_page.put(page_no)

    # Step 2: create and start one worker per name, all sharing the queue.
    workers = []
    for worker_name in ('a', 'b', 'c', 'd', 'e'):
        worker = SPIDER(base_url, queue_page, worker_name, 'C:/爬取數據.xls')
        worker.start()
        workers.append(worker)

    # Step 3: wait until every worker has finished.
    for worker in workers:
        worker.join()

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章