import urllib.request
import re
# Example 1: download a page with urllib and list the http:// links it contains.

# Step 1: choose the URL to crawl.
url = "http://www.baidu.com/"

# Step 2: send the request and obtain the response.
# The `with` block closes the underlying connection once the body is read.
with urllib.request.urlopen(url) as response:
    # Step 3: read the raw response bytes and decode them as UTF-8 text.
    html = response.read().decode("utf-8")

# Step 4: print the page source.
print(html)

# Extract the URLs: each match is a 3-tuple (opening quote, URL, closing
# quote), so the URL itself is element 1 of every tuple.
f = re.findall("""(")(http://[^"]+)(")""", html)
for i in f:
    print(i[1])
# 二、User-Agent值的獲取與爬蟲解碼
import urllib.request
# Example 2: send a request with a browser-like User-Agent header.
url = "http://www.baidu.com/"

# The User-Agent value can be copied from your own browser: in Chrome, for
# example, press F12, open the Network tab, click any entry in the Name
# column, and look for User-Agent under Headers.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

# 1. Build the request object, attaching the custom headers.
request = urllib.request.Request(url, headers=headers)

# 2. Send the request and obtain the response object.
# The `with` block closes the underlying connection once the body is read.
with urllib.request.urlopen(request) as response:
    # 3. Read the response body and decode it as UTF-8 text.
    html = response.read().decode("utf-8")

# Show the User-Agent that was attached to the request. Request stores header
# names via str.capitalize(), hence the "User-agent" spelling in the lookup.
print(request.get_header("User-agent"))