簡介
python的一個簡單的小爬蟲示例。用於爬取美美的bing桌面壁紙。
其中使用到
1. 獲取html的urllib模塊
2. 從html中使用正則表達式(regular expression)尋找壁紙路徑的re模塊。
源碼
模塊的使用和如何爬取的過程,一切盡在源碼中。:)
# urllib2 在python3後變成 urllib.request
import urllib.request
import re
# Pages to crawl: the Chinese Bing home page and its English-search variant.
HOST_URL = ["https://cn.bing.com", "https://cn.bing.com/?ensearch=1"]
# Local file the fetched page is written to.
FILE_NAME = "bing.html"
# Matches the `g_img={url: "<path>.jpg"` fragment embedded in the page.
# The path may not contain a double quote, so a match can never run past the
# closing quote, and the dot before "jpg" is escaped so it only matches a
# literal ".jpg" extension.
PATTERN = r'g_img={url: "[^"]{10,90}\.jpg"'
def open_html(hostUrl):
    """Fetch the resource at ``hostUrl`` and return its raw body.

    Args:
        hostUrl: URL to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` propagates
        unchanged to the caller.
    """
    print("open url")
    request = urllib.request.Request(hostUrl)
    # Read inside a ``with`` so the response is closed even if read() fails.
    with urllib.request.urlopen(request) as page:
        html = page.read()
    print("url respose: " + str(len(html)) + " bytes")
    return html
def save_html(html):
    """Write the fetched page to the file named by ``FILE_NAME``.

    Args:
        html: page content as ``bytes``; written unchanged in binary mode,
            replacing any previous content of the file.
    """
    print("save html")
    with open(FILE_NAME, mode="wb") as html_file:
        html_file.write(html)
def find_image_url(hostUrl, html):
    """Locate the wallpaper path in a fetched page and return its full URL.

    Args:
        hostUrl: base URL the matched image path is resolved against.
        html: raw page content as ``bytes``.

    Returns:
        The image URL as ``str``, or ``None`` when ``PATTERN`` matches
        nothing in the page.
    """
    # Function-scope import so the block does not rely on file-level imports.
    from urllib.parse import urljoin

    # Replace undecodable bytes instead of raising: one stray byte elsewhere
    # in the page should not prevent finding the image path.
    text = html.decode('utf-8', errors='replace')
    jpgList = re.findall(PATTERN, text)
    if not jpgList:
        print("find nothing...")
        return None

    print("find jpg: " + str(jpgList))
    # The image path is the text between the first pair of double quotes
    # in the first match.
    jpgUrl = jpgList[0].split('"')[1]
    # urljoin resolves the path against the host, so a base URL that carries
    # its own path or query string no longer yields a malformed result the
    # way plain string concatenation did.
    return urljoin(hostUrl, jpgUrl)
def save_image(imageUrl):
    """Download ``imageUrl`` and save it in the current working directory.

    The file is named after the text following the last ``/`` in the URL.

    Args:
        imageUrl: URL of the image to download.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by opening
        the output file propagates unchanged to the caller.
    """
    print("open image")
    request = urllib.request.Request(imageUrl)
    # Read inside a ``with`` so the response is closed even if read() fails.
    with urllib.request.urlopen(request) as page:
        image = page.read()

    print("save image")
    fileName = imageUrl.split('/')[-1]
    with open(fileName, 'wb') as f:
        f.write(image)
def bing_wallpaper():
    """Crawl every page in ``HOST_URL`` and save the wallpaper it points to.

    For each URL the page is fetched with ``open_html``, stored with
    ``save_html``, searched with ``find_image_url`` and, when an image URL
    is found, downloaded with ``save_image``.
    """
    # NOTE(review): nesting below was reconstructed from a listing that had
    # lost its indentation - confirm against the original script.
    for host_url in HOST_URL:
        print(host_url)
        html = open_html(host_url)
        if html:
            save_html(html)
            # Image paths are always resolved against the first entry
            # (the Chinese Bing host), whichever page was fetched.
            imageUrl = find_image_url(HOST_URL[0], html)
            if imageUrl:
                save_image(imageUrl)
        # Blank line separating the output of one host from the next.
        print()
    print("fin")
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    bing_wallpaper()
彩蛋
恭祝大家2018年新年快樂!!