爬取百度翻譯處理數據的過程
【表單數據的處理】
form_data = urllib.parse.urlencode(form_data).encode()
import urllib.request
import urllib.parse
# Query Baidu Translate's suggestion endpoint for a word typed by the user
# and print the raw response body.
post_url = 'https://fanyi.baidu.com/sug'
word = input('請輸入你要查詢的英文單詞:')

# Build the POST form fields.
form_data = {
    'kw': word,
}

# Request headers: present a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
}

# Build the request object.
request = urllib.request.Request(url=post_url, headers=headers)

# urlopen() needs the POST body as bytes: URL-encode the dict, then encode
# the resulting str.
form_data = urllib.parse.urlencode(form_data).encode()

# Send the request. Supplying `data` makes urllib issue a POST. The `with`
# block closes the HTTP response even if reading or decoding raises.
with urllib.request.urlopen(request, data=form_data) as response:
    print(response.read().decode('utf-8'))
爬取百度翻譯
import urllib.request
import urllib.parse
# Call Baidu Translate's v2transapi endpoint for a fixed word and print the
# raw response body.
post_url = 'https://fanyi.baidu.com/v2transapi'
word = 'baby'

# POST form fields.
# NOTE: 'sign' and 'token' are hard-coded values that go with this specific
# query; per the author's notes, other words fail with error 998 until the
# way these two fields are generated is worked out.
formdata = {
    'from': 'en',
    'query': word,
    'sign': '814534.560887',
    'simple_means_flag': '3',
    'to': 'zh',
    'token': '02816325ffee1111bf235a46a566198c',
    'transtype': 'realtime',
}

# Request headers, including a hard-coded session cookie.
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': '*/*',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5',
    # Left disabled: urllib does not transparently decompress gzip/deflate
    # bodies, so asking for them would break the plain .decode() below.
    #'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Host': 'fanyi.baidu.com',
    # Left disabled: urllib fills in Content-Length from the POST body.
    #'Content-Length': '121',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'Cookie': 'BAIDUID=0D0DAA575FF991F1F48D12B3753F7622:FG=1; BIDUPSID=0D0DAA575FF991F1F48D12B3753F7622; PSTM=1556349090; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; H_PS_PSSID=1444_28777_21090_28775_28721_28838_28584_22157; PSINO=1; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1556349111; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1556349111; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1',
}

request = urllib.request.Request(url=post_url, headers=headers)

# urlopen() needs the POST body as bytes: URL-encode the dict, then encode
# the resulting str.
formdata = urllib.parse.urlencode(formdata).encode()

# Send the POST. The `with` block closes the HTTP response even if reading
# or decoding raises.
with urllib.request.urlopen(request, formdata) as response:
    print(response.read().decode('utf-8'))
上面這個實例若換成其他單詞,會返回 998 錯誤。主要是因爲 formdata 中的 'sign' 和 'token' 這兩個參數是加密生成的,需要先破解其生成方式,才能查詢其他單詞。