分享學習Python時,跟着書籍敲的代碼。
第一本書:《Byte Of Python》,給出代碼筆記鏈接:ByteOfPython筆記代碼,鏈接中有此書的PDF格式資源。
第二本書:《Python網絡數據採集》,給出此書PDF格式的資源鏈接:https://pan.baidu.com/s/1eSq6x5g 密碼:a46q
此篇給出《Python網絡數據採集》第一章:初見網絡爬蟲 第二章:複雜HTML解析 的代碼筆記,供大家參考。
第一章:初見網絡爬蟲
#-*-coding:utf-8-*-
###原生的爬網頁
# import urllib.request
# response = urllib.request.urlopen('http://localhost:8080/zhf/login!index.action')
# print(response.read().decode('utf-8'))
###使用BeautifulSoup模塊爬網頁
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://localhost:8080/zhf/login.jsp")
# bshtml=BeautifulSoup(html.read(),"html.parser")
# print("網頁抓取成功!")
# print(bshtml.title)
# print(bshtml.head)
# print(bshtml.body)
###排誤性(有預見性地處理異常):使用BeautifulSoup模塊爬網頁
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
def getHtmlTitle(url):
    """Fetch *url* and return the ``<title>`` tag found inside its ``<head>``.

    Args:
        url: Address of the page to download.

    Returns:
        The BeautifulSoup tag for ``<head><title>``, or ``None`` when the
        page cannot be fetched, has no ``<head>``, or has no ``<title>``
        inside it.
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as response:
            page = response.read()
    except URLError:
        # HTTPError is a subclass of URLError, so this single handler covers
        # both HTTP error statuses and unreachable servers.
        return None
    try:
        title = BeautifulSoup(page, "html.parser").head.title
    except AttributeError:
        # A missing <head> makes `.head` None, so `.title` raises
        # AttributeError (not ArithmeticError, which was never raised here).
        return None
    return title
# Try to scrape the title of the local login page and report the outcome.
url = "http://localhost:8080/zhf/login.jsp"
title = getHtmlTitle(url)
if title is not None:
    print("抓取成功:\n{0}".format(title))
else:
    # getHtmlTitle returns None for both fetch and parse failures.
    print("抓取失敗")
第二章:複雜HTML解析
#-*-coding:utf-8-*-
########複雜HTML解析
###根據css的class或者id這些標誌屬性,過濾網頁元素,獲取特定的標籤。
### .get_text(),會去掉html中所有的標籤
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
# bshtml=BeautifulSoup(html,"html.parser")
# namelist=bshtml.findAll("span",{"class":"green"})
# for name in namelist:
# print("name:{0}".format(name.get_text()))
# #findAll/find方法
# # findAll(tag, attributes, recursive, text, limit, keywords)
# # find(tag, attributes, recursive, text, keywords)
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
# bshtml=BeautifulSoup(html,"html.parser")
# princelist=bshtml.findAll(text="the prince")
# print("'the prince'出現了:{0} 次".format(len(princelist)))
#
# alltext=bshtml.findAll(id='text')
# print(alltext[0].get_text())
###子標籤以及後代標籤
###子標籤:.children
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/page3.html")
# bshtml=BeautifulSoup(html,"html.parser")
#
# for child in bshtml.find("table",{"id":"giftList"}).children:
# print(child)
###後代標籤:.descendants
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/page3.html")
# bshtml=BeautifulSoup(html,"html.parser")
#
# for child in bshtml.find("table",{"id":"giftList"}).descendants:
# print(child)
###所有的兄弟標籤:.next_siblings
###兄弟標籤:.next_sibling
#previous_sibling:上一個兄弟標籤
#previous_siblings:前面所有的兄弟標籤
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/page3.html")
# bhtml=BeautifulSoup(html,"html.parser")
#
# for tr in bhtml.find("table",{"id":"giftList"}).tr.next_siblings:
# print(tr)
###父標籤:parent 和 parents
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html=urlopen("http://www.pythonscraping.com/pages/page3.html")
# bthtml=BeautifulSoup(html,"html.parser")
#
# obj=bthtml.find("img",{"src":"../img/gifts/img6.jpg"}).parent.previous_sibling.get_text()
# print(obj)
#######正則表達式 re.compile("正則表達式")
# 郵箱的正則:[A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)
#
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import re
#
# html=urlopen("http://www.pythonscraping.com/pages/page3.html")
# bthtml=BeautifulSoup(html,"html.parser")
#
# imges=bthtml.findAll("img",{"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
# for imge in imges:
# print(imge)
# print(imge["src"])
###獲取屬性:myImgTag.attrs["src"]
### myTag.attrs
第三章鏈接:第三章 開始採集