import re
from collections import deque
from urllib.error import URLError
from urllib.parse import urldefrag, urljoin, urlsplit
from urllib.request import urlopen

import pymysql
def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
    """Fetch a page and return its decoded text.

    The body is downloaded once per attempt and the raw bytes are then tried
    against each charset in order, so falling back to another charset never
    triggers another request.

    Args:
        start_url: URL to download; passed unchanged to ``urlopen``.
        retry_times: Number of extra attempts made after a ``URLError``.
        charsets: Candidate encodings, tried in the given order.

    Returns:
        The decoded text, or ``None`` when every attempt raised ``URLError``
        or none of the charsets could decode the body.
    """
    raw = None
    # One initial attempt plus ``retry_times`` retries; a non-positive
    # ``retry_times`` still gets the single initial attempt.
    for _ in range(max(retry_times, 0) + 1):
        try:
            # The context manager closes the response even if read() fails.
            with urlopen(start_url) as response:
                raw = response.read()
            break
        except URLError as ex:
            print('Error:', ex)
    if raw is None:
        return None

    for charset in charsets:
        try:
            return raw.decode(charset)
        except UnicodeDecodeError:
            continue
    # Also reached when ``charsets`` is empty.
    return None
def _absolute_links(base_url, hrefs):
    """Resolve hrefs against base_url; return unique http(s) URLs in page order."""
    links = {}
    for href in hrefs:
        try:
            # Resolve relative and protocol-relative hrefs, and drop the
            # fragment so "page#a" and "page#b" count as the same page.
            url = urldefrag(urljoin(base_url, href.strip())).url
            scheme = urlsplit(url).scheme
        except ValueError:
            # urllib.parse rejects some malformed hrefs; skip them.
            continue
        # Filters out javascript:, mailto:, and other schemes urlopen
        # would reject or that are not pages.
        if scheme in ('http', 'https'):
            links[url] = None
    return list(links)


def main():
    """Crawl outward from the seed pages and store linked pages' titles.

    Each queued page is downloaded and its ``<a href>`` targets are extracted,
    resolved to absolute http(s) URLs, and queued for crawling (each URL is
    queued at most once, so link cycles terminate). Every linked page not yet
    checked is downloaded as well; if it contains ``<h1>...<span``, the
    captured title and the link URL are inserted into ``tb_result``.
    """
    charsets = ('utf-8', 'gbk', 'gb2312')
    # Compiled once instead of on every loop iteration.
    link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)

    seed_urls = ['http://sports.sohu.com/nba_a.shtml','http://quote.eastmoney.com/centerv2/hsbk']
    url_queue = deque(seed_urls)  # popleft() is O(1), unlike list.pop(0)
    queued = set(seed_urls)       # URLs ever placed on the crawl queue
    visited_list = set()          # URLs already checked for a title

    while url_queue:
        current_url = url_queue.popleft()
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=charsets)
        if not html:
            continue

        link_list = _absolute_links(current_url, link_regex.findall(html))
        for link in link_list:
            if link not in queued:
                queued.add(link)
                url_queue.append(link)

        conn = pymysql.connect(host='localhost', port=3306,
                               db='bots', user='root',
                               passwd='123456', charset='utf8')
        try:
            for link in link_list:
                if link in visited_list:
                    continue
                visited_list.add(link)
                print(link)
                page = get_page_code(link, charsets=charsets)
                if not page:
                    continue
                match_list = title_regex.findall(page)
                if match_list:
                    title = match_list[0]
                    with conn.cursor() as cursor:
                        cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
                                       (title, link))
                        conn.commit()
        finally:
            conn.close()
    print('執行完成!')
# Start the crawl only when this file is executed directly as a script.
if __name__ == '__main__':
    main()
這一大段洋洋灑灑的代碼,我連照抄都會抄錯,還不知道錯在哪兒。總之,它通過導入第三方包(如 pymysql)並使用正則表達式,從相應的網頁中爬取相應的節點內容,比如超鏈接、標籤、圖片等,然後持久化地存入數據庫中。邏輯上似懂非懂,實際上已經暈了。
當然也有更簡單但效率較低的辦法,比如用 bs4(BeautifulSoup)來解析頁面。
完蛋了,看樣子我得降級。感覺從 Web 知識到 Python 基礎都欠缺太多!最近連博客都寫不出來,全靠抄就太過分了。