# Only the book title is fetched; add other fields yourself if needed.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He
import re
from urllib import request
from bs4 import BeautifulSoup
# Base URL and request headers for qidian.com. The User-Agent mimics a
# desktop Chrome browser so the site serves the regular HTML pages.
url = 'https://www.qidian.com'
header = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    # BUG FIX: the original literal contained stray backslashes
    # ("\ AppleWeb\Kit/537.36 ... \ ") that ended up *inside* the header
    # value, producing a malformed User-Agent. Rebuilt as a clean UA string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'}
def GetHttp(url, headers=None, charset='utf8'):
    """Fetch *url* and return the response body decoded with *charset*.

    headers: optional dict of HTTP request headers (defaults to none).
    Returns the decoded page text, or '' after printing the error on any
    failure — callers treat the empty string as "no page".
    """
    if headers is None:
        headers = {}
    try:
        # BUG FIX: the original never closed the response object, leaking
        # a socket per request; the context manager closes it reliably.
        with request.urlopen(request.Request(url=url, headers=headers)) as resp:
            return resp.read().decode(charset)
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back ''.
        print(e)
        return ''
# Fetch the home page and collect the top-level book categories.
# top_cat maps category name -> {'name', 'url', 'num'} taken from the
# "classify-list" sidebar; entries whose href starts with '//' are
# protocol-relative external links and are skipped.
soup = BeautifulSoup(GetHttp(url=url, headers=header), 'html.parser')
top_cat = {}
for i in soup.find(id='classify-list').find_all('dd'):  # top-level categories
    if i.a.get('href').find('//') != 0:
        top_cat[i.find('i').text] = {'name': i.find('i').text, 'url': i.a.get('href'), 'num': i.find('b').text}
del soup
t_list = top_cat.copy()
for i in t_list:  # resolve each category's numeric channel id
    soup = BeautifulSoup(GetHttp(url=url + top_cat[i]['url'], headers=header), 'html.parser')
    all_url = soup.find(attrs={'data-eid': 'qd_F58'})
    if not all_url:  # some category pages use a different event id
        all_url = soup.find(attrs={'data-eid': 'qd_F103'})
    del soup
    # BUG FIX: "\D" without the r-prefix is an invalid escape sequence
    # (DeprecationWarning today, SyntaxError in future Python versions);
    # regex patterns must be raw strings.
    ID = re.sub(r"\D", "", str(all_url.get('href')))
    del all_url
    top_cat[i]['cat_id'] = ID
# Listing-URL template: channel id, page size, page number.
all_url = url + '/all?chanId=%d&pageSize=%d&page=%d'
for t in top_cat:  # walk every book in each top-level category
    # The first request only discovers the total page count from the
    # pagination bar (the 8th <li> holds the last page number).
    soup = BeautifulSoup(GetHttp(url=all_url % (int(top_cat[t]['cat_id']), 20, 1), headers=header), 'html.parser')
    page_ul = soup.find(attrs={'class': 'lbf-pagination-item-list'})
    page_li = page_ul.find_all('li')
    page_total = page_li[7].find('a').text
    del soup, page_ul, page_li
    # BUG FIX: range(1, n) stopped one page short, so the last page of each
    # category was never scraped; range(1, n + 1) covers every page.
    for p in range(1, int(page_total) + 1):  # read the book list page by page
        soup = BeautifulSoup(GetHttp(url=all_url % (int(top_cat[t]['cat_id']), 20, p), headers=header), 'html.parser')
        body = soup.find(attrs={'class': 'all-img-list cf'})
        body_li = body.find_all('li')
        del soup
        for li in body_li:  # fetch each book's detail page for its title
            soup = BeautifulSoup(GetHttp(url='https:' + li.find('a').get('href'), headers=header), 'html.parser')
            print('书名:' + soup.find(attrs={'class': 'book-info'}).h1.em.text)