Crawler workflow
A basic crawler follows four steps: specify the target URL, send the request, read the response, and persist the data.
Introductory crawler code example
import requests

url = "https://www.sogou.com/web"
rep = requests.get(url=url)    # send the GET request
repEncode = rep.encoding       # character encoding detected from the response
filename = "sogou.html"        # output file for the fetched page
with open(filename, "w", encoding=repEncode) as f:
    f.write(rep.text)          # persist the page source
About the response object

| Attribute | Meaning |
| --- | --- |
| response.status_code | HTTP status code; 200 means success |
| response.encoding | character encoding of the original page |
| response.text | response body as a string |
| response.content | response body as bytes |
| response.url | URL of the response (after any redirects) |
| response.headers | response headers |
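A minimal sketch showing these attributes in use:

import requests

rep = requests.get("https://www.sogou.com/web")
if rep.status_code == 200:    # 200 means the request succeeded
    print(rep.encoding)       # encoding detected from the response
    print(rep.url)            # final URL after any redirects
    html = rep.text           # decoded string, for text content
    raw = rep.content         # raw bytes, for images/files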
General crawler - the requests module
Code examples
GET request
"""
帶參數發送get請求, 之後所有代碼都會進行ua僞裝
"""
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
url = "https://www.sogou.com/web"
params = {
"query": "美女圖片",
}
rep = requests.get(url=url, params=params, headers=headers)
repEncode = rep.encoding
with open(filename, "w", encoding=repEncode) as f:
f.write(rep.text)
POST request
import requests

url = "xxx"  # placeholder URL from the original notes
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
data = {
    "keyA": "valueA",
    "keyB": "valueB",
}
# data= sends the payload as a form-encoded request body
rep = requests.post(url=url, data=data, headers=headers)
repText = rep.text
print(repText)
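One point worth noting: data= form-encodes the body, while json= sends a JSON body, which some AJAX endpoints require instead. A quick sketch of the difference, using httpbin.org purely as a neutral echo service:

import requests

# data= form-encodes the payload (Content-Type: application/x-www-form-urlencoded)
requests.post("https://httpbin.org/post", data={"keyA": "valueA"})

# json= serializes the payload as JSON (Content-Type: application/json)
requests.post("https://httpbin.org/post", json={"keyA": "valueA"})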
AJAX GET request
import requests

# Keep the query string out of the URL; passing the same parameters both in
# the URL and via params= would send every parameter twice
url = "http://image.so.com/zjl"
param = {
    "ch": "beauty",
    "sn": "60",
    "listtype": "new",
    "temp": "1",
}
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.get(url=url, params=param, headers=headers)
repEncode = rep.encoding
repText = rep.text
print(repText)
with open("360pic美女.html", "w", encoding=repEncode) as f:
    f.write(repText)
AJAX POST request
import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
data = {
    "cname": "",
    "pid": "",
    "keyword": "北京",
    "pageindex": "3",
    "pageSize": "10",
}
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.post(url=url, data=data, headers=headers)
print(rep.text)
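Both AJAX endpoints above return JSON, so rep.json() is usually more convenient than the raw text. A sketch of parsing the store list; the field names ("Table1", "storeName", "addressDetail") are assumptions about this endpoint's schema, not verified:

# Parse the JSON body instead of treating it as plain text.
# NOTE: the field names below are assumptions about the response schema.
result = rep.json()
for store in result.get("Table1", []):
    print(store.get("storeName"), store.get("addressDetail"))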
Exercise - basic crawler
import requests

url = "https://www.sogou.com/sogou"
keyword = input("input key word: ")
startPage = int(input("input start page: "))
endPage = int(input("input end page: "))
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
for i in range(startPage, endPage + 1):
    params = {
        "query": keyword,
        "page": str(i),
    }
    # proxies= routes the request through an HTTP proxy (see the note below)
    rep = requests.get(url=url, params=params, headers=headers, proxies={"http": "117.127.16.207:8080"})
    repEncode = rep.encoding
    fileName = "{}_{}.html".format(keyword, i)
    with open(fileName, "w", encoding=repEncode) as f:
        f.write(rep.text)
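The proxies argument above maps URL schemes to proxy addresses, which helps avoid IP-based blocking when crawling many pages. A minimal sketch; the proxy address is an illustrative placeholder and is likely dead:

import requests

# One proxy per scheme; requests picks the entry matching the target URL
proxies = {
    "http": "http://117.127.16.207:8080",    # placeholder proxy
    "https": "https://117.127.16.207:8080",  # placeholder proxy
}
rep = requests.get("https://www.sogou.com/web", proxies=proxies, timeout=5)
print(rep.status_code)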
Focused crawler
Regex
Introduction
A focused crawler extracts specific target data from the fetched page; regular expressions are one way to do that.
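The rule used in the example below depends on non-greedy matching (.*?) and the re.S flag. A minimal sketch of both on a hypothetical HTML snippet:

import re

html = '<div class="thumb">\n<img src="//example.com/a.jpg" alt="">\n</div>'

# .*? is non-greedy: it matches as little as possible before the next quote.
# re.S makes "." also match newlines, so the pattern can span multiple lines.
print(re.findall('<div class="thumb">.*?<img src="//(.*?)"', html, re.S))
# -> ['example.com/a.jpg']
# Without re.S the same pattern finds nothing, because "." stops at "\n"
print(re.findall('<div class="thumb">.*?<img src="//(.*?)"', html))
# -> []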
Code example
import requests
import re
import os

url = "https://www.qiushibaike.com/pic/"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
rep = requests.get(url=url, headers=headers)
repText = rep.text
# Extract the protocol-relative image URLs from each thumbnail div
re_rule = '<div class="thumb">.*?<img src="//(.*?)" .*?</div>'
img_url = re.findall(re_rule, repText, re.S)
if not os.path.exists("./qiutu"):
    os.mkdir("qiutu")
for i in img_url:
    url = "http://" + i
    r = requests.get(url=url, headers=headers)
    rCont = r.content  # binary image data, so use .content rather than .text
    filename = i.split("/")[-1]
    file = "./qiutu/" + filename
    with open(file, "wb") as f:
        f.write(rCont)
    print("{} downloaded".format(file))
XPath
XPath expressions
Attribute selection:
# select the div tag whose class attribute is "song"
//div[@class="song"]
Hierarchy & index selection:
# select the a tag that is a direct child of the second li under the ul directly under the div whose class is "tang"
//div[@class="tang"]/ul/li[2]/a
Logical operators:
# select a tags whose href attribute is empty and whose class attribute is "du"
//a[@href="" and @class="du"]
Fuzzy matching:
//div[contains(@class, "ng")]
//div[starts-with(@class, "ta")]
Text extraction:
# /text() returns the text directly inside a tag
# //text() returns the text inside a tag and inside all of its descendant tags
//div[@class="song"]/p[1]/text()
//div[@class="tang"]//text()
Attribute extraction:
//div[@class="tang"]//li[2]/a/@href
Code example
import requests
from lxml import etree
import xlwt


def write_excel(data):
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    for i in range(len(data)):
        for j in range(len(data[i])):
            # xpath() returns lists; flatten each one into a single cell string
            if isinstance(data[i][j], list):
                data[i][j] = ", ".join(data[i][j])
            worksheet.write(i, j, data[i][j])
    workbook.save('Excel_test.xls')


def spider():
    """
    Crawl the target URL, extract each game's fields,
    and return the rows as a list.
    :return: list of rows, one per game
    """
    url = "http://newgame.17173.com/shouyou/ceshi"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    rep = requests.get(url=url, headers=headers)
    rep_text = rep.text
    tree = etree.HTML(rep_text)
    div_list = tree.xpath('//div[@class="g-box4 box"][1]//ul[2]/li')
    ret = [["Game", "Launch date", "Test type", "Genre", "Platform", "Studio"]]
    for i in div_list:
        name = i.xpath('.//h6[@class="c1"]/a/text()')
        time = i.xpath('.//p[@class="c2"]/text()')
        qa_type = i.xpath('.//p[@class="c3"]/text()')
        game_type = i.xpath('.//i[@class="c4"]/text()')
        plate = i.xpath('./p[@class="c5"]/span//text()')
        auth = i.xpath('.//span[@class="c7"]/text()')
        data = [name, time, qa_type, game_type, plate, auth]
        ret.append(data)
    return ret


if __name__ == "__main__":
    data = spider()
    write_excel(data)