# Code only (只放代碼): single-threaded version.
import re,json,requests
from lxml import etree
import xlwt
# Pagination: the article-list URL differs only in its trailing page number,
# so pages 1-5 are fetched by substituting that number into the URL.

# Request headers are identical for every page, so they are built once.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
# Path of the workbook used by the save step further down the script.
filename = 'C:/爬取數據.xls'
# Rows are accumulated across ALL pages. Re-creating this list per page
# would leave only the last page's rows for the save step.
info_list = []
for i in range(1, 6):
    base_url = 'https://blog.csdn.net/qq_42374697/article/list/%s' % (i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(base_url, headers=headers, timeout=10)
    html = etree.HTML(response.text)
    # One <div> per article entry on the list page.
    div_list = html.xpath('//div[@class="article-item-box csdn-tracking-statistics"]')
    for div in div_list:
        try:
            type_ = div.xpath('./h4/a/span/text()')[0]
            # The title is taken from the second text node of the link
            # (presumably the first is whitespace before the <span> -- confirm).
            title = div.xpath('./h4/a/text()')[1].strip()
            date = div.xpath('.//span[@class="date"]/text()')[0].strip()
            read_num = div.xpath('.//span[@class="read-num"]/text()')[0]
        except IndexError:
            # An entry missing one of the expected nodes is skipped instead
            # of aborting the crawl.
            continue
        item = {}
        item['文章類型'] = type_
        item['標題'] = title
        item['日期'] = date
        item['閱讀量'] = read_num
        info_list.append(item)
import xlwt
import os
import xlrd
from xlutils.copy import copy
class ExcelUtils(object):
    """Helpers for saving lists of dict records to an .xls workbook.

    Both helpers are static methods, so they are called as
    ``ExcelUtils.method(...)`` without creating an instance.
    """

    @staticmethod
    def write_to_excel(filename, sheetname, word_list):
        """Create a new workbook and write ``word_list`` into one sheet.

        The keys of the first record become the header row (row 0); every
        record is then written on its own row in header order. Any exception
        raised along the way is caught and printed rather than propagated.

        :param filename: path the workbook is saved to
        :param sheetname: name of the sheet that is created
        :param word_list: list of dict records, e.g. ``[item, item, {...}]``
        :return: None
        """
        try:
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet(sheetname)
            # Header fields come from the first record.
            head = list(word_list[0].keys())
            for col, title in enumerate(head):
                sheet.write(0, col, title)
            # Data rows start directly below the header.
            for row, record in enumerate(word_list, start=1):
                for col, title in enumerate(head):
                    sheet.write(row, col, record[title])
            workbook.save(filename)
            print('寫入excle成功!')
        except Exception as e:
            print(e)
            print('寫入失敗!')

    @staticmethod
    def write_to_excel_append(filename, infos):
        """Append ``infos`` below the existing rows of the first sheet.

        The header row already stored in the workbook decides which key of
        each record goes into which column.

        :param filename: path of an existing workbook
        :param infos: list of dict records keyed by the header fields
        :return: None
        """
        work_book = xlrd.open_workbook(filename)
        first_sheet_name = work_book.sheet_names()[0]
        work_sheet = work_book.sheet_by_name(first_sheet_name)
        # Number of rows already present; new rows are written after them.
        old_rows = work_sheet.nrows
        # Header fields stored in row 0.
        keys = work_sheet.row_values(0)
        print('===================', keys)
        # Copy the xlrd workbook so that it can be written to.
        new_work_book = copy(work_book)
        new_sheet = new_work_book.get_sheet(0)
        for row, record in enumerate(infos, start=old_rows):
            for col, key in enumerate(keys):
                new_sheet.write(row, col, record[key])
        new_work_book.save(filename)
        print('追加成功!')
# Create the workbook on the first run; append to it on later runs.
if not os.path.exists(filename):
    ExcelUtils.write_to_excel(filename, 'sheet', info_list)
else:
    ExcelUtils.write_to_excel_append(filename, info_list)
# Multi-threaded version (多線程).
from selenium import webdriver
from lxml import etree
import threading
import os
from queue import Queue
import xlwt
import xlrd
from xlutils.copy import copy
class ExcelUtils(object):
    """Helpers for saving lists of dict records to an .xls workbook.

    Both helpers are static methods, so they are called as
    ``ExcelUtils.method(...)`` without creating an instance.
    """

    @staticmethod
    def write_to_excel(filename, sheetname, word_list):
        """Create a new workbook and write ``word_list`` into one sheet.

        The keys of the first record become the header row (row 0); every
        record is then written on its own row in header order. Any exception
        raised along the way is caught and printed rather than propagated.

        :param filename: path the workbook is saved to
        :param sheetname: name of the sheet that is created
        :param word_list: list of dict records, e.g. ``[item, item, {...}]``
        :return: None
        """
        try:
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet(sheetname)
            # Header fields come from the first record.
            head = list(word_list[0].keys())
            for col, title in enumerate(head):
                sheet.write(0, col, title)
            # Data rows start directly below the header.
            for row, record in enumerate(word_list, start=1):
                for col, title in enumerate(head):
                    sheet.write(row, col, record[title])
            workbook.save(filename)
            print('寫入excle成功!')
        except Exception as e:
            print(e)
            print('寫入失敗!')

    @staticmethod
    def write_to_excel_append(filename, infos):
        """Append ``infos`` below the existing rows of the first sheet.

        The header row already stored in the workbook decides which key of
        each record goes into which column.

        :param filename: path of an existing workbook
        :param infos: list of dict records keyed by the header fields
        :return: None
        """
        work_book = xlrd.open_workbook(filename)
        first_sheet_name = work_book.sheet_names()[0]
        work_sheet = work_book.sheet_by_name(first_sheet_name)
        # Number of rows already present; new rows are written after them.
        old_rows = work_sheet.nrows
        # Header fields stored in row 0.
        keys = work_sheet.row_values(0)
        print('===================', keys)
        # Copy the xlrd workbook so that it can be written to.
        new_work_book = copy(work_book)
        new_sheet = new_work_book.get_sheet(0)
        for row, record in enumerate(infos, start=old_rows):
            for col, key in enumerate(keys):
                new_sheet.write(row, col, record[key])
        new_work_book.save(filename)
        print('追加成功!')
class SPIDER(threading.Thread):
    """Worker thread that takes page numbers from a shared queue, scrapes
    each page and stores the extracted rows in an .xls workbook.

    :param url: URL template; ``url.format(page)`` gives the page address
    :param queue_page: queue of page numbers shared by all workers
    :param name: thread name
    :param filename: path of the workbook the rows are saved to
    """

    # Shared by every worker. The exists-check plus read-modify-write on the
    # workbook is not atomic, so concurrent saves could overwrite each other.
    _excel_lock = threading.Lock()

    def __init__(self, url, queue_page, name, filename):
        super().__init__()  # initialise the Thread machinery first
        self.url = url
        self.queue_page = queue_page
        self.name = name
        self.filename = filename

    def run(self):
        """Process pages until the shared queue is drained."""
        from queue import Empty

        while True:
            # A non-blocking get replaces "check empty(), then get()": with
            # several workers, another thread could take the last page between
            # the check and a blocking get(), leaving this one waiting forever.
            try:
                page = self.queue_page.get_nowait()
            except Empty:
                break
            # Request and parse the page.
            self.parse_page(page)

    def parse_page(self, page):
        """Fetch one list page, extract its article rows and save them.

        :param page: page number substituted into ``self.url``
        """
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.url.format(page))
            page_source = driver.page_source
        finally:
            # Always shut the browser down so that no driver process is left
            # behind for each page.
            driver.quit()
        html = etree.HTML(page_source)
        info_list = []
        ii_list = html.xpath('//div[@class="article-item-box csdn-tracking-statistics"]')
        for ii in ii_list:
            try:
                type_ = ii.xpath('./h4/a/span/text()')[0]
                title = ii.xpath('./h4/a/text()')[1].strip()
                date = ii.xpath('.//span[@class="date"]/text()')[0].strip()
                read_num = ii.xpath('.//span[@class="read-num"]/text()')[0]
            except IndexError:
                # Entry lacks one of the expected nodes: skip it.
                continue
            # Collect the extracted fields into one record.
            item = {}
            item['文章類型'] = type_
            item['標題'] = title
            item['日期'] = date
            item['閱讀量'] = read_num
            info_list.append(item)
        # Save: only one worker at a time may touch the workbook.
        with self._excel_lock:
            if os.path.exists(self.filename):
                # The workbook already exists: append to it.
                ExcelUtils.write_to_excel_append(self.filename, info_list)
            else:
                # First write: create the workbook.
                ExcelUtils.write_to_excel(self.filename, 'sheet1', info_list)
if __name__ == '__main__':
    # URL template; each worker substitutes the page number for '{}'.
    base_url = 'https://blog.csdn.net/qq_42374697/article/list/{}'

    # Step 1: create the task queue and fill it with the page numbers to
    # crawl (range(1, 3) yields pages 1 and 2).
    queue_page = Queue()
    for page_no in range(1, 3):
        queue_page.put(page_no)

    # Step 2: start one worker per name; all of them share the same queue.
    thread_list = []
    for worker_name in ['a', 'b', 'c', 'd', 'e']:
        worker = SPIDER(base_url, queue_page, worker_name, 'C:/爬取數據.xls')
        worker.start()
        thread_list.append(worker)

    # Step 3: wait for every worker to finish.
    for worker in thread_list:
        worker.join()