Python獲取起點網圖書信息

只獲取了書名,需要別的信息可自行添加

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He
import re
from urllib import request
from bs4 import BeautifulSoup
# Target site and the request headers used by every request in this script.
url = 'https://www.qidian.com'
header = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    # Fixed: the original User-Agent contained stray backslashes from a bad
    # copy/paste ("\ AppleWeb\Kit/537.36 ... Gecko)\ "), which sent a
    # malformed header. Restored to the standard Chrome 58 UA string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'}
def GetHttp(url, headers=None, charset='utf8'):
    """Fetch *url* and return the response body decoded with *charset*.

    Returns '' on any failure (network error, bad URL, decode error) so
    callers can treat the result as best-effort; the error is printed.
    """
    if headers is None:
        headers = {}
    try:
        # Context manager closes the HTTP response even if read()/decode()
        # raises — the original leaked the connection object.
        with request.urlopen(request.Request(url=url, headers=headers)) as resp:
            return resp.read().decode(charset)
    except Exception as e:  # deliberate best-effort: report and fall through
        print(e)
    return ''
# Scrape the homepage category panel (#classify-list) into top_cat,
# keyed by category name. Links beginning with '//' point at other
# hosts and are skipped; only site-relative hrefs are kept.
soup = BeautifulSoup(GetHttp(url=url, headers=header), 'html.parser')
top_cat = {}
for entry in soup.find(id='classify-list').find_all('dd'):
    link = entry.a.get('href')
    if link.startswith('//'):
        continue
    cat_name = entry.find('i').text
    top_cat[cat_name] = {'name': cat_name, 'url': link, 'num': entry.find('b').text}
del soup
# For each top-level category, fetch its landing page and extract the
# numeric channel id from the "all books" link. data-eid qd_F58 is the
# usual anchor; qd_F103 is the fallback for the alternate page layout.
t_list = top_cat.copy()
for i in t_list:    # iterate a copy; top_cat is mutated in the loop body
    soup = BeautifulSoup(GetHttp(url=url + top_cat[i]['url'], headers=header), 'html.parser')
    all_url = soup.find(attrs={'data-eid': 'qd_F58'})
    if not all_url:
        all_url = soup.find(attrs={'data-eid': 'qd_F103'})
    del soup
    # r"\D": strip every non-digit from the href, leaving the channel id.
    # Raw string fixes the invalid escape sequence ("\D") in the original,
    # which raises a DeprecationWarning and will become a SyntaxError.
    ID = re.sub(r"\D", "", str(all_url.get('href')))
    del all_url
    top_cat[i]['cat_id'] = ID
# URL template: /all?chanId=<category id>&pageSize=<books per page>&page=<n>
all_url = url+'/all?chanId=%d&pageSize=%d&page=%d'
for t in top_cat:   # every top-level category discovered above
    # First request only discovers the total page count from the pager.
    soup = BeautifulSoup(GetHttp(url=all_url % (int(top_cat[t]['cat_id']), 20, 1), headers=header), 'html.parser')
    page_ul = soup.find(attrs={'class': 'lbf-pagination-item-list'})
    page_li = page_ul.find_all('li')
    # NOTE(review): assumes the 8th <li> of the pager always holds the last
    # page number — confirm against the live markup.
    page_total = page_li[7].find('a').text
    del soup, page_ul, page_li
    # Fixed off-by-one: range(1, N) stopped at N-1 and skipped the final
    # page; the upper bound must be int(page_total) + 1 to include it.
    for p in range(1, int(page_total) + 1):  # read every listing page
        soup = BeautifulSoup(GetHttp(url=all_url % (int(top_cat[t]['cat_id']), 20, p), headers=header), 'html.parser')
        body = soup.find(attrs={'class': 'all-img-list cf'})
        body_li = body.find_all('li')
        del soup
        for li in body_li:  # one detail-page request per book
            soup = BeautifulSoup(GetHttp(url='https:'+li.find('a').get('href'), headers=header), 'html.parser')
            print('書名:'+soup.find(attrs={'class': 'book-info'}).h1.em.text)
發佈了51 篇原創文章 · 獲贊 22 · 訪問量 14萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章