最近閒來無事,拿來練練手。
注:
由於網站可能會變動,本代碼不保證後面一直都能用,僅講述抓取的思路;
個人純屬研究使用,請不要應用於商業目的;
使用語言:Python
版本:3.4.3
依賴:BeautifulSoup、requests(可以使用pip install進行安裝)
代碼也比較簡單,直接貼上來:
HttpClient.py
# -*- coding: utf-8 -*-
"""Thin HTTP helper around ``requests`` used by the scraper scripts."""
import requests


def make_request(url):
    """Fetch *url* and return the ``requests.Response``.

    Uses a (connect=30s, read=90s) timeout and fails fast on HTTP error
    statuses instead of silently handing an error page to the parser.

    Args:
        url: absolute URL to fetch.

    Returns:
        The ``requests.Response`` for a successful (2xx/3xx) request.

    Raises:
        requests.exceptions.RequestException: on connection/timeout
            failures, or (via ``raise_for_status``) on 4xx/5xx responses.
    """
    print('make_request: ', url)
    r = requests.get(url, timeout=(30, 90))
    # The original left this check commented out ("# if r.status_code == 200:");
    # without it an error page would be scraped as if it were book content.
    r.raise_for_status()
    # Debug output for diagnosing the site's inconsistent encodings.
    # .get() avoids a KeyError when the header is absent.
    print('content-type: ', r.headers.get('content-type'))
    print('encoding: ', r.encoding)
    print('apparent_encoding: ', r.apparent_encoding)
    return r
Kanunu8.py
# -*- coding: utf-8 -*-
"""Scrape an entire e-book (chapter by chapter) into a single UTF-8 text file.

Flow: prompt for the book's index URL, collect all chapter links from the
index page, then fetch each chapter and append its title and text to
``./<book title>.txt``.
"""
import os
import sys
import re
import encodings

# Allow importing helper modules that live beside the parent directory.
sys.path.append("..")
# The site declares gb2312 but actually serves gb18030; aliasing the codec
# keeps BeautifulSoup from producing mojibake when decoding.
encodings.aliases.aliases['gb2312'] = 'gb18030'

from bs4 import BeautifulSoup
from util import *
# NOTE(review): HttpClient.make_request is called below but the original had
# no explicit import (presumably re-exported by ``from util import *``);
# importing it directly is harmless and makes the dependency explicit.
import HttpClient

book_url = ''
book_name = ''
# Matches links that point at an author page rather than a chapter.
writer_link_pattern = re.compile(r'.*/writer/\d+\.html')
# Windows forbids these characters in file names; strip them from titles.
window_illegal_file_name_pattern = re.compile(r'[\\|/|:|\*|\?|"|<|>|\|]')


def find_tbody(tag):
    """bs4 filter: True for the <tbody> holding the chapter list.

    A chapter-list <tbody> either contains no nested <tbody> and a
    <strong>正文</strong> heading, or mentions a publish-time label.
    """
    if tag.name == 'tbody':
        if tag.find('tbody') is None and tag.find('strong').string == '正文':
            return True
        elif '發佈時間' in tag.get_text():
            return True
    return False


def strong_with_no_href(tag):
    """bs4 filter: a <strong> that wraps a <font> but no link."""
    return tag.name == 'strong' and tag.a is None and tag.font is not None


def find_title(tag):
    """Return the chapter title from a chapter page's soup.

    Different books put the title in <h1>, <h2>, or a bare <strong>, so
    try each in turn (see the "問題" notes at the end of the post).
    """
    if tag.h1 is not None:
        return tag.h1.font.string
    elif tag.h2 is not None:
        return tag.h2.font.string
    else:
        return tag.find(strong_with_no_href).font.string


def make_soup(html):
    """Parse *html* bytes with html.parser and log the detected encodings."""
    soup = BeautifulSoup(html, "html.parser")
    print('original_encoding: ', soup.original_encoding,
          ', declared_html_encoding: ', soup.declared_html_encoding,
          ', from_encoding: ', soup.from_encoding)
    return soup


def get_legal_window_file_name(name):
    """Strip Windows-illegal characters from *name*; 'unknown' if None."""
    if name is None:
        return 'unknown'
    return window_illegal_file_name_pattern.sub('', name)


if __name__ == '__main__':
    book_url = input('請輸入電子書URL:')

    # Fetch the book's index page.
    request = HttpClient.make_request(book_url)
    html = request.content
    soup = make_soup(html)

    # The book title becomes the output file name.
    book_name = soup.find('title').string
    path = './' + get_legal_window_file_name(book_name) + '.txt'

    # Collect every chapter link: any <tbody> with more than one row.
    links = []
    for tmp in soup.find_all('tbody'):
        if len(tmp.find_all('tr')) > 1:
            # find_all always returns a list (never None); truthiness is
            # the correct emptiness check here.
            all_link = tmp.find_all('a')
            if all_link:
                links.extend(all_link)

    # Chapter hrefs are relative; resolve them against the index page's dir.
    if book_url.endswith('.html'):
        parent_url = book_url[0:book_url.rindex('/') + 1]
    else:
        parent_url = book_url

    # ``open`` here is the builtin — the original's ``from _pyio import open``
    # pulled in the pure-Python re-implementation for no benefit.
    with open(path, 'w', encoding="utf-8") as f:
        for link in links:
            # Skip links to the author's page.
            if writer_link_pattern.match(link['href']) is not None:
                continue
            print('\n', link.string)
            url = parent_url + link['href']
            print(url)

            response = HttpClient.make_request(url)
            chapter_soup = make_soup(response.content)
            chapter_name = find_title(chapter_soup)

            # Chapter heading (guard against a title bs4 could not find).
            f.write('\n\n')
            f.write(chapter_name or '')
            f.write('\n\n')

            # Chapter body. get_text() never contains markup, so the old
            # .replace('<br/>', '') was a no-op; also guard a missing <p>.
            content = chapter_soup.find('p')
            f.write(content.get_text() if content is not None else '')
            f.flush()
    print('電子書已成功保存: ', path)
遇到的問題:
不同的書(甚至章節)標題內容、字體(h1,h2...)、標籤結構都不同;
編碼問題,抓下來是亂碼,具體原因請參考;
應該是爲了增加爬取的難度吧,不過只能針對遇到的問題進行分析、解決;