url詳解
1. urlopen函數—打開一個網頁
from urllib import request

# urlopen fetches a URL and returns a file-like response object.
# Read the first 2000 bytes of the Baidu homepage and show them as UTF-8 text.
response = request.urlopen('http://www.baidu.com')
first_chunk = response.read(2000)
print(first_chunk.decode("utf-8"))
2.urlretrieve函數—保存網頁文件
# urlretrieve downloads the response body of a URL straight into a local file
# (here the Baidu homepage is saved as baidu.html; the commented line shows
# the same call downloading an image instead).
request.urlretrieve('http://www.baidu.com', 'baidu.html')
#request.urlretrieve('https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=3254061209,1473183314&fm=11&gp=0.jpg','biaoqing.jpg')
3. 參數編碼與解碼
#urlencode函數---可以把字典轉換成url編碼的數據(編碼)
from urllib import parse

# urlencode turns a dict into an application/x-www-form-urlencoded query
# string: non-ASCII text is percent-encoded, spaces become '+'.
query_fields = {'name':'張三','age':18,'grat':'hello world'}
encoded = parse.urlencode(query_fields)
print(encoded)
# Build a Baidu search URL by appending an encoded query string, then fetch it.
search_url = 'http://www.baidu.com/s'
search_query = parse.urlencode({"wd":"劉德華"})
full_url = search_url + "?" + search_query
r = request.urlopen(full_url)
print(r.read().decode("utf-8"))
# parse_qs reverses urlencode: it decodes a query string back into a dict.
# Note every value comes back as a *list* of strings.
form = {'name':'張三','age':18,'grat':'hello world'}
encoded_qs = parse.urlencode(form)
print(encoded_qs)
decoded = parse.parse_qs(encoded_qs)
print(decoded)
4.urlparse函數
# urlparse splits a URL into six named components:
# scheme, netloc, path, params, query, fragment.
sample_url = 'http://www.baidu.com/s?wd=python&username=abc#1'
parsed = parse.urlparse(sample_url)
print(parsed)
for label, value in (('scheme:', parsed.scheme),
                     ('netloc:', parsed.netloc),
                     ('path', parsed.path),
                     ('params', parsed.params),
                     ('query', parsed.query),
                     ('fragment', parsed.fragment)):
    print(label, value)
5.urlsplit函數(類似urlparse,但不拆分params屬性)
# urlsplit is like urlparse but returns only five parts — it does not
# split out the ';params' component, so SplitResult has no .params.
split_sample = 'http://www.baidu.com/s?wd=python&username=abc#1'
pieces = parse.urlsplit(split_sample)
print(pieces)
print('scheme:', pieces.scheme)
print('netloc:', pieces.netloc)
print('path', pieces.path)
#print('params', pieces.params)   # would raise: SplitResult has no params
print('query', pieces.query)
print('fragment', pieces.fragment)
6.request.Request類 ----增加請求頭
from urllib import request

# Some sites (lagou.com here) reject requests carrying urllib's default
# User-Agent; request.Request lets us attach browser-like headers.
url = 'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py'

# First, without headers — the server typically blocks/redirects this.
r = request.urlopen(url)
# BUG FIX: the original printed `r.read` (the bound method object itself,
# e.g. "<bound method ...>"); the method must be *called* to get the body.
print(r.read())

headers = {
'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# Now with a spoofed User-Agent the real page content comes back.
req = request.Request(url,headers = headers)
resp = request.urlopen(req)
print(resp.read(3000).decode('utf-8'))
實例: 爬取拉勾網
from urllib import request,parse

# Lagou loads its job list through an AJAX POST; mimic that request with a
# browser User-Agent plus the Referer the endpoint checks.
ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
ajax_headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py',
}
form_fields = {
    'first': 'true',
    'pn': 1,
    'kd': 'python'
}
# POST bodies must be bytes, so urlencode then encode to UTF-8.
post_body = parse.urlencode(form_fields).encode('utf-8')
req = request.Request(ajax_url, headers=ajax_headers, data=post_body, method='POST')
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
作業:爬取慶餘年的短評
from urllib import request

# Homework: fetch the short-comments page of "Joy of Life" from Douban,
# again spoofing a browser User-Agent so the request is not blocked.
comments_url = 'https://movie.douban.com/subject/25853071/comments?status=P'
browser_ua = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
req = request.Request(comments_url, headers=browser_ua)
r = request.urlopen(req)
print(r.read().decode('utf-8'))
7.ProxyHandler處理器(IP設置)
西刺免費代理IP:http://www.xicidaili.com/
快代理:http://www.kuaidaili.com/
代理雲:http://www.dailiyun.com/
from urllib import request

# Without a proxy: httpbin echoes back our real origin IP.
url = 'http://httpbin.org/ip'
r = request.urlopen(url)
print(r.read())

# With a proxy: route the same request through a third-party address.
url = 'http://httpbin.org/ip'
# 1. Build a handler from a {scheme: proxy} mapping.
# NOTE(review): proxy values are usually "host:port" — this one has no
# port, so it may not route as intended; confirm the proxy address.
proxy_handler = request.ProxyHandler({"http":"163.204.244.84"})
# 2. Chain the handler into an opener.
proxy_opener = request.build_opener(proxy_handler)
# 3. Send the request through the opener instead of urlopen.
r = proxy_opener.open(url)
print(r.read())
8.cookie存儲用戶ID
#cookie的格式:
Set-Cookie: NAME=VALUE; Expires/Max-Age=DATE; Path=PATH; Domain=DOMAIN_NAME; Secure
NAME:cookie的名字
VALUE:cookie的值
Expires:cookie的過期時間
Path:cookie作用的路徑
Domain:cookie作用的域名
#使用cookie模擬登錄
from urllib import request

#1. Request the zhihu "follow" page without logging in / without cookies.
zhihu_url = "https://www.zhihu.com/follow"
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
req = request.Request(url=zhihu_url,headers=headers)
resp = request.urlopen(req)
#print(resp.read().decode('utf-8'))
# BUG FIX: the original decoded the body with 'gbk' (errors ignored) and
# opened the file without an explicit encoding, silently mangling the page.
# Zhihu serves UTF-8 — decode and write as UTF-8, matching the other
# snippets in this file (e.g. the Session example).
with open('zhihu.html','w',encoding='utf-8') as fp:
    # write() needs str; resp.read() returns bytes, so decode first.
    fp.write(resp.read().decode('utf-8'))
實例:爬取知乎
#http.cookiejar模塊
from urllib import request,parse
from http.cookiejar import CookieJar
# Browser-like User-Agent shared by the login and visit requests below.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
def get_opener():
    """Build an opener whose CookieJar keeps session cookies across requests."""
    # The jar collects Set-Cookie values; the processor replays them
    # on every subsequent request made through the returned opener.
    jar = CookieJar()
    cookie_handler = request.HTTPCookieProcessor(jar)
    return request.build_opener(cookie_handler)
def login_zhihu(opener):
    """POST the login form through *opener* so its jar captures session cookies."""
    credentials = {
        "username":"19912456595",
        "password":"pythoncookie0"
    }
    login_url = "https://www.zhihu.com/signin?next=%2Fsettings%2Faccount"
    payload = parse.urlencode(credentials).encode('utf-8')
    # Uses the module-level `headers` dict for the browser User-Agent.
    req = request.Request(login_url, data=payload, headers=headers)
    opener.open(req)
def visit_zhihu(opener):
    """Fetch the follow page with the logged-in opener and save it locally."""
    #2. The opener replays the cookies captured during login.
    resp = opener.open("https://www.zhihu.com/follow")
    page = resp.read().decode("utf-8")
    with open('zhihu0.html','w',encoding='utf-8') as fp:
        fp.write(page)
if __name__ == '__main__':
    # Wire the steps together: build the cookie-aware opener, log in,
    # then fetch the page that requires the session cookie.
    zhihu_opener = get_opener()
    login_zhihu(zhihu_opener)
    visit_zhihu(zhihu_opener)
9.cookie信息的加載與保存
#MozillaCookieJar
from urllib import request
from http.cookiejar import MozillaCookieJar

# MozillaCookieJar can persist cookies to (and restore them from) a
# Netscape-format text file; cookie.txt must already exist for load().
jar = MozillaCookieJar('cookie.txt')
jar.load(ignore_discard=True)   # ignore_discard=True also loads session cookies
cookie_opener = request.build_opener(request.HTTPCookieProcessor(jar))
resp = cookie_opener.open('http://httpbin.org/cookies/set?freeform=spider')
for cookie in jar:
    print(cookie)
# Uncomment to write the (possibly updated) cookies back to cookie.txt:
#cookiejar.save(ignore_discard=True)
10.requests庫的使用
#發送GET請求
import requests

# A plain GET with the requests library:
# .text is the decoded str body, .content the raw bytes.
response = requests.get("https://www.baidu.com/")
#print(type(response.text))
#print(response.text)
#print(type(response.content))
#print(response.content.decode('utf-8'))
for attribute in (response.url, response.encoding, response.status_code):
    print(attribute)
import requests

# GET with query parameters: requests URL-encodes `params` for us.
search_terms = {
    'wd':'中國'
}
spoofed_ua = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
response = requests.get("https://www.baidu.com/s", params=search_terms, headers=spoofed_ua)
with open('baidu.html','w',encoding='utf-8') as fp:
    fp.write(response.content.decode('utf-8'))
# The final URL shows the encoded query string appended by requests.
print(response.url)
#發送POST請求
import requests

# POST the lagou AJAX form with requests; .json() parses the reply.
data = {
    'first':'true',
    'pn':'1',
    'kw':'python'
}
headers = {
    'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=p',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# BUG FIX: the original URL literal had the text "Request Method: POST"
# pasted into it ("...needAddtionalResult=falseRequest Method: POST"),
# corrupting the query string; use the clean endpoint URL.
response = requests.post('https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false', data=data, headers=headers)
print(type(response.json()))
print(response.text)
requests使用代理
import requests

# Route the request through an HTTP proxy via the `proxies` mapping.
# NOTE(review): proxy values are normally "host:port"; this one lacks a
# port — confirm the address before relying on it.
proxy_map = {
    'http':'123.149.38.52'
}
resp = requests.get('http://httpbin.org/ip', proxies=proxy_map)
print(resp.text)
requests處理cookie信息
#Session 爬取知乎
# A Session persists cookies between requests, so the login cookie set by
# the POST is automatically sent with the follow-up GET.
signin_url = 'https://www.zhihu.com/signin?next=%2Fsettings%2Faccount'
credentials = { "username":"19912456595","password":"pythoncookie0" }
browser_headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
session = requests.Session()
session.post(signin_url, data=credentials, headers=browser_headers)
response = session.get('https://www.zhihu.com/follow')
with open('zhihu1.html','w',encoding='utf-8') as f:
    f.write(response.text)