Spider古詩詞

# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
import io
import sys
# Re-wrap standard output with the gb18030 codec so that print() does not
# raise UnicodeEncodeError ("'gbk' codec can't encode character ...") when
# the scraped text contains characters the default console codec lacks.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# http://www.shicimingju.com/book/sanguoyanyi.html
# http://www.shicimingju.com/book/sanguoyanyi/1.html
# http://www.shicimingju.com/book/sanguoyanyi/3.html

# Table-of-contents page of the book; it links to every chapter page.
url = "http://www.shicimingju.com/book/sanguoyanyi.html"

# Request headers carrying a desktop Chrome User-Agent string; reused for
# every HTTP request this script makes.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}

request = urllib.request.Request(url=url, headers=headers)
# Use the response as a context manager so the underlying HTTP connection
# is closed as soon as the body has been read (the original never closed it).
with urllib.request.urlopen(request) as response:
    concent = response.read().decode("utf-8")

soup = BeautifulSoup(concent, 'lxml')
# print(soup.prettify())
# print(soup.select('.book-mulu >ul >li >a '))
# `ret` is a list of the <a> tags in the chapter list; each tag carries the
# chapter's relative URL in its `href` attribute and the title as its text.
ret = soup.select('.book-mulu >ul >li >a')
# print(ret['href'])  # TypeError: list indices must be integers or slices, not str
# print(ret[0])  # <a href="/book/sanguoyanyi/1.html">第一回·宴桃園豪傑三結義  斬黃巾英雄首立功</a>
# print(ret[0]['href'])  # /book/sanguoyanyi/1.html

# Download every chapter linked from the index page and append its title and
# body text to a single UTF-8 text file.
with open('三國演義.txt', 'w', encoding='utf-8') as fp:
    for item in ret:
        # The href is site-relative (e.g. /book/sanguoyanyi/1.html), so
        # prefix the host to get the chapter's absolute URL.
        url_1 = 'http://www.shicimingju.com' + item['href']
        # print(url_1)  # URL of each chapter
        # get_text() always returns a str; `.string` is None whenever the
        # anchor has more than one child node, which would break the
        # string concatenation in fp.write() below.
        title = item.get_text()
        # print(title)
        print("正在爬取: %s" % title)
        request_1 = urllib.request.Request(url=url_1, headers=headers)
        # Close each chapter's HTTP response once its body has been read,
        # instead of leaking one open connection per chapter.
        with urllib.request.urlopen(request_1) as response_1:
            concent_1 = response_1.read().decode('utf-8')
        # print(concent_1)
        soup_1 = BeautifulSoup(concent_1, 'lxml')
        # print(soup_1.prettify())
        # print(soup_1.select('.chapter_content')[0].text)
        # ret_1 = soup_1.select('.chapter_content >p')
        # select_one() returns the first match or None, so a page without
        # a .chapter_content element is skipped instead of aborting the
        # whole crawl with an IndexError.
        chapter = soup_1.select_one('.chapter_content')
        if chapter is None:
            print("未找到正文,跳過:%s" % title)
            continue
        string = chapter.text
        fp.write(title + string)
        print("爬取結束:%s" % title)

# item['href'] 列表取url
# fp.write(字符串)
# 空格不用考慮
# print(soup_1.select('.chapter_content'))  打印出來是一個列表(元素為標籤),所以要用 [0] 取出第一個元素再取 .text


報錯:UnicodeEncodeError: 'gbk' codec can't encode character '\xa9' in position 30
解決:
import io  
import sys 
#改變標準輸出的默認編碼 
#utf-8中文亂碼
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
~~~
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章