分享關於學習Python,跟着書籍敲的代碼。
第一本書:《Byte Of Python》,給出代碼筆記鏈接:ByteOfPython筆記代碼,鏈接中有此書的PDF格式資源。
第二本書:《Python網絡數據採集》,給出此書PDF格式的資源鏈接:https://pan.baidu.com/s/1eSq6x5g 密碼:a46q
此篇給出《Python網絡數據採集》第三章:開始採集 的代碼筆記,供大家參考。
第三章 開始採集
#-*-coding:utf-8-*-
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
# sys.setdefaultencoding('utf-8')
####遍歷單個域名,(隨機從一個網頁跳轉到另外一個網頁)
###第一版
# beginUrl=urlopen("https://en.wikipedia.org/wiki/Eric_Idle")
# bsBeginUrl=BeautifulSoup(beginUrl)
# for link in bsBeginUrl.findAll("a"):
# if "href" in link.attrs:
# print(link.attrs["href"])
###第二版
# 它們都有三個共同點:
# • 它們都在 id 是 bodyContent 的 div 標籤裏
# • URL 鏈接不包含分號
# • URL 鏈接都以 /wiki/ 開頭
# beginUrl=urlopen("https://en.wikipedia.org/wiki/Eric_Idle")
# bsBeginUrl=BeautifulSoup(beginUrl)
#
# for link in bsBeginUrl.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$")):
# if("href" in link.attrs):
# print(link["href"])
###第三版
# • 一個函數 getLinks,可以用維基百科詞條 /wiki/< 詞條名稱 > 形式的 URL 鏈接作爲參數,
# 然後以同樣的形式返回一個列表,裏面包含所有的詞條 URL 鏈接。
# • 一個主函數, 以某個起始詞條爲參數調用 getLinks,再從返回的 URL 列表裏隨機選擇
# 一個詞條鏈接,再調用 getLinks,直到我們主動停止,或者在新的頁面上沒有詞條鏈接
# 了,程序才停止運行。
# import random
# import datetime
# def getLinks(url):
# links=[]
# bshtml=BeautifulSoup(urlopen("https://en.wikipedia.org{0}".format(url)))
# return bshtml.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))
# links=getLinks("/wiki/Kevin_Bacon")
# while len(links)>0:
# href=links[random.randint(0,len(links)-1)].attrs["href"]
# print(href)
# links=getLinks(href)
###採集整個網站的地址
# pages=set()
#
# def getLinks(url):
# global pages
# try:
# bshtml=BeautifulSoup(urlopen("http://en.wikipedia.org{0}".format(url)),"html.parser")
# except HTTPError:
# print("找不到這個網頁!")
# else:
# for link in bshtml.findAll("a",href=re.compile("^(/wiki/)")):
# if("href" in link.attrs):
# if(link.attrs["href"] not in pages):
# newpage=link.attrs["href"]
# print("第{0}頁面:{1}".format(len(pages)+1,newpage))
# pages.add(newpage)
# getLinks(newpage)
#
#
# getLinks('')
# print("全站的頁面數量:{0}".format(len(pages)))
###收集整個網站的特定內容數據
# pages=set()
# def getLinks(url):
# global pages
# try:
# bshtml = BeautifulSoup(urlopen("http://en.wikipedia.org{0}".format(url)), "html.parser")
# # print("第{0}頁面:{1}".format(len(pages)+1,url))
# print(bshtml.h1.get_text())
# print(bshtml.find(id="mw-content-text").findAll("p")[0])
# print(bshtml.find(id="ca-edit").find("span").find("a").attrs['href'])
# except HTTPError:
# print("404:找不到這個網頁(http://en.wikipedia.org{0})!".format(url))
# except AttributeError:
# print("頁面缺少這個元素")
# except IndexError:
# print("角標越界")
# except URLError:
# print("沒反應")
# for link in bshtml.findAll("a",href=re.compile("^(/wiki/)")):
# if("href" in link.attrs):
# if link.attrs["href"] not in pages:
# newpage=link.attrs["href"]
# print("--------------------\n第{0}網頁:{1}".format(len(pages)+1,newpage))
# pages.add(newpage)
# getLinks(newpage)
#
# getLinks('')
###採集互聯網。不再在一個特定的網站內採集數據,而是跨網站。
# Shared registry of already-visited page URLs. NOTE(review): only the
# commented-out single-site crawlers above use it; the active
# external-link-hopping code below never reads or writes it.
pages=set()
##獲取內鏈
def getInternalLinks(bshtml,includeUrl):
    """Collect the internal links found on a parsed page.

    A link counts as internal when its href starts with "/" or contains
    includeUrl (the site's domain). Duplicates are dropped while keeping
    first-appearance order.

    bshtml     -- BeautifulSoup object of the page
    includeUrl -- domain string identifying the current site
    Returns a list of href strings.
    """
    # Match site-relative links ("/...") or absolute links to this domain.
    pattern = re.compile("^(/|.*" + includeUrl + ")")
    collected = []
    for anchor in bshtml.findAll("a", href=pattern):
        href = anchor.attrs["href"]
        if href is not None and href not in collected:
            collected.append(href)
    return collected
##獲取外鏈
def getExternalLinks(bshtml,excliudeUrl):
    """Collect the external links found on a parsed page.

    A link counts as external when its href starts with "www" or "http"
    and does not contain excliudeUrl (the current site's domain).
    Duplicates are dropped while keeping first-appearance order.

    bshtml      -- BeautifulSoup object of the page
    excliudeUrl -- domain string of the current site to exclude
                   (NOTE(review): parameter name carries a typo; kept
                   to avoid breaking keyword callers)
    Returns a list of href strings.
    """
    # Absolute links that never mention the current domain anywhere.
    pattern = re.compile("^(www|http)((?!" + excliudeUrl + ").)*$")
    collected = []
    for anchor in bshtml.findAll("a", href=pattern):
        href = anchor.attrs["href"]
        if href is not None and href not in collected:
            collected.append(href)
    return collected
def splitAddress(address):
    """Split a URL into its path components, with the host name first.

    BUG FIX: the original only stripped the literal "http://", so an
    "https://" URL returned ["https:", "", "host", ...] and the caller's
    [0] lookup yielded "https:" instead of the domain. Strip either
    scheme with a regex.

    address -- a URL string, with or without an http/https scheme
    Returns a list: [host, path_part1, path_part2, ...].
    """
    stripped = re.sub(r"^https?://", "", address)
    return stripped.split("/")
def getRandomExternalLink(startingPage):
    """Return a randomly chosen external link found on startingPage.

    If the page has no external links, pick a random internal link and
    recurse into it until an external link turns up.

    startingPage -- absolute URL of the page to fetch
    Returns an external URL string.
    Raises HTTPError/URLError if the page cannot be fetched.
    """
    html = urlopen(startingPage)
    bshtml = BeautifulSoup(html, "html.parser")
    domain = splitAddress(startingPage)[0]
    externalLinks = getExternalLinks(bshtml, domain)
    if len(externalLinks) == 0:
        # BUG FIX: the original called getInternalLinks(startingPage) with a
        # single argument, which raises TypeError (the function needs the
        # parsed soup and the domain).
        internalLinks = getInternalLinks(bshtml, domain)
        candidate = random.choice(internalLinks)
        # BUG FIX: internal links are usually site-relative ("/foo");
        # make them absolute before trying to fetch them.
        if candidate.startswith("/"):
            candidate = "http://" + domain + candidate
        return getRandomExternalLink(candidate)
    else:
        return random.choice(externalLinks)
def followExternalOnly(startingSite):
    """Hop along a chain of random external links, printing each one.

    BUG FIX: the original hard-coded "http://oreilly.com" and ignored
    its startingSite parameter, so every step restarted from the same
    site. It also recursed without bound, which eventually raises
    RecursionError; a loop removes that limit.

    startingSite -- absolute URL to start hopping from
    Runs until interrupted (or a fetch fails).
    """
    while True:
        externalLink = getRandomExternalLink(startingSite)
        print("隨機外鏈是:"+externalLink)
        # Continue the walk from the link we just found.
        startingSite = externalLink
# followExternalOnly("http://oreilly.com")
# print(splitAddress("http://oreilly.com"))
####收集網站的所有外鏈鏈接
#
# allExtLinks=[]
# allIntLinks=[]
#
# def getAllExtLinks(url):
# html=urlopen(url)
# bshtml=BeautifulSoup(html,"html.parser")
# internalLinks =getInternalLinks(bshtml,splitAddress(url)[0])
# externalLinks =getExternalLinks(bshtml,splitAddress(url)[0])
#
# for link in externalLinks:
# if link not in allExtLinks:
# allExtLinks.append(link)
# print("第{0}外鏈:{1}".format(len(allExtLinks)+1,link))
#
# for link in internalLinks:
# if link not in allIntLinks:
# allIntLinks.append(link)
# getAllExtLinks(link)
#
# getAllExtLinks("http://oreilly.com")
給出第四章:使用API