[B9]爬蟲課程01

url詳解

1. urlopen函數—打開一個網頁

from urllib import request

# Open the page and print the first 2000 bytes of the body, decoded as UTF-8.
response = request.urlopen('http://www.baidu.com')
first_chunk = response.read(2000)
print(first_chunk.decode("utf-8"))

2.urlretrieve函數—保存網頁文件

# urlretrieve downloads the URL straight into a local file in one call.
request.urlretrieve('http://www.baidu.com', 'baidu.html')
# Works for binary resources too, e.g. saving an image:
#request.urlretrieve('https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=3254061209,1473183314&fm=11&gp=0.jpg','biaoqing.jpg')

3. 參數編碼與解碼

# urlencode converts a dict into URL-encoded (percent-escaped) query data.
from urllib import parse

params = {'name':'張三','age':18,'grat':'hello world'}
result = parse.urlencode(params)
print(result)
url = 'http://www.baidu.com/s'

# Build a search URL by encoding the query dict and appending it after '?'.
params={"wd":"劉德華"}
qs = parse.urlencode(params)

# `request` was imported in an earlier snippet of these notes.
url =url + "?" + qs
r = request.urlopen(url)
print(r.read().decode("utf-8"))
# parse_qs is the inverse of urlencode: it decodes a query string back
# into a dict (each value becomes a list).
params = {'name':'張三','age':18,'grat':'hello world'}
qs = parse.urlencode(params)
print(qs)

result = parse.parse_qs(qs)
print(result)

4.urlparse函數

# urlparse splits a URL into six components; unlike urlsplit it also
# exposes the rarely-used ';params' path segment.
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlparse(url)
print(result)
for label, attr in zip(
        ['scheme:', 'netloc:', 'path', 'params', 'query', 'fragment'],
        ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']):
    print(label, getattr(result, attr))

5.urlsplit函數(同urlparse)

# urlsplit is like urlparse but returns five fields: SplitResult has no
# 'params' attribute, which is why that line is absent here.
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlsplit(url)
print(result)
for label, value in [('scheme:', result.scheme),
                     ('netloc:', result.netloc),
                     ('path', result.path),
                     ('query', result.query),
                     ('fragment', result.fragment)]:
    print(label, value)

6.request.Request類 ----增加請求頭

from urllib import request

# Demonstrate why a User-Agent header matters: Lagou serves an
# anti-crawler page to requests without a browser-like UA.
url = 'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py'
r = request.urlopen(url)
# FIX: the original did `print(r.read)`, which printed the bound method
# object instead of the response body — the call parentheses were missing.
print(r.read())
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}

# Same URL, but sent via a Request object carrying the UA header.
req = request.Request(url,headers = headers)
resp = request.urlopen(req)
print(resp.read(3000).decode('utf-8'))

實例: 爬取拉勾網

from urllib import request,parse

# Example: scrape Lagou's job-list Ajax endpoint. It requires both a
# Referer header and a URL-encoded POST body.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py',
}
form = {
    'first': 'true',
    'pn': 1,
    'kd': 'python'
}
# urlencode builds the form body as a str; Request needs bytes.
body = parse.urlencode(form).encode('utf-8')
req = request.Request(url, headers=headers, data=body, method='POST')
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))

作業:爬取慶餘年的短評

from urllib import request

# Exercise: fetch the short comments for "Joy of Life" on Douban,
# sending a browser User-Agent so the request is not rejected.
comment_url = 'https://movie.douban.com/subject/25853071/comments?status=P'
ua_headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
req = request.Request(comment_url, headers=ua_headers)
r = request.urlopen(req)
print(r.read().decode('utf-8'))

7.ProxyHandler處理器(IP設置)

西刺免費代理IP:http://www.xicidaili.com/
快代理:http://www.kuaidaili.com/
代理雲:http://www.dailiyun.com/
from urllib import request

# Without a proxy: httpbin echoes back our real IP.
ip_url = 'http://httpbin.org/ip'
print(request.urlopen(ip_url).read())

# With a proxy, the echoed IP should be the proxy's instead.
# 1. build a handler from a {scheme: proxy} mapping
#    NOTE(review): no port is given — proxies are usually "host:port"; confirm.
proxy_handler = request.ProxyHandler({"http":"163.204.244.84"})
# 2. build an opener around that handler
proxy_opener = request.build_opener(proxy_handler)
# 3. send the request through the opener
print(proxy_opener.open(ip_url).read())

8.cookie存儲用戶ID

#cookie的格式:
Set-Cookie:NAME=VALUE;Expires/Max-age=DATE;Path=PATH;Domain=DOMAIN_NAME;SECURE
NAME:cookie的名字
VALUE:cookie的值
Expires:cookie的過期時間
Path:cookie作用的路徑
Domain:cookie作用的域名

#使用cookie模擬登錄

from urllib import request

# 1. Request the Zhihu follow page WITHOUT a cookie — we get the
#    logged-out version of the page.
zhihu_url = "https://www.zhihu.com/follow"
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
req = request.Request(url=zhihu_url,headers=headers)
resp = request.urlopen(req)
#print(resp.read().decode('utf-8'))
# FIX: the page is UTF-8 (visit_zhihu further down decodes the very same
# URL as UTF-8); decoding it as GBK with errors ignored corrupted the text.
# Also open the file with an explicit encoding instead of relying on the
# platform locale default.
with open('zhihu.html','w',encoding='utf-8') as fp:
    # write() needs str; resp.read() returns bytes, so decode first.
    fp.write(resp.read().decode('utf-8','ignore'))

實例:爬取知乎

# The http.cookiejar module stores cookies received from the server.
from urllib import request,parse
from http.cookiejar import CookieJar

# Browser-like User-Agent shared by the login/visit functions below.
headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' 
}

def get_opener():
    """Build and return an opener whose cookie jar persists across requests."""
    # An in-memory jar to hold whatever cookies the server sets.
    jar = CookieJar()
    # Wrap the jar in a handler so responses feed cookies back into it.
    cookie_handler = request.HTTPCookieProcessor(jar)
    # The opener routes every request through that handler.
    return request.build_opener(cookie_handler)

def login_zhihu(opener):
    """POST the login form through *opener* so its cookie jar captures the session."""
    credentials = {
        "username":"19912456595",
        "password":"pythoncookie0"
    }
    login_url = "https://www.zhihu.com/signin?next=%2Fsettings%2Faccount"
    # Form data must be URL-encoded and sent as bytes.
    payload = parse.urlencode(credentials).encode('utf-8')
    req = request.Request(login_url, data=payload, headers=headers)
    opener.open(req)
    

def visit_zhihu(opener):
    """Fetch the follow page with the (logged-in) opener and save it to disk."""
    follow_url = "https://www.zhihu.com/follow"
    resp = opener.open(follow_url)
    page = resp.read().decode("utf-8")
    with open('zhihu0.html','w',encoding='utf-8') as fp:
        fp.write(page)

if __name__ == '__main__':
    # Build a cookie-aware opener, log in (cookies are captured by the
    # opener's jar), then fetch the follow page with the same opener.
    opener = get_opener()
    login_zhihu(opener)
    visit_zhihu(opener)

9.cookie信息的加載與保存

# MozillaCookieJar persists cookies in the Mozilla cookies.txt file format.
from urllib import request
from http.cookiejar import MozillaCookieJar

cookiejar = MozillaCookieJar('cookie.txt')
# Load previously saved cookies; 'cookie.txt' must already exist or load()
# raises. ignore_discard=True also loads session cookies that would
# otherwise be dropped.
cookiejar.load(ignore_discard=True)
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

# httpbin's /cookies/set endpoint sets the given cookie on the response.
resp = opener.open('http://httpbin.org/cookies/set?freeform=spider')
for cookie in cookiejar:
    print(cookie)
# Uncomment to write the jar back to cookie.txt (run once to create the file).
#cookiejar.save(ignore_discard=True)

10.requests庫的使用

#發送GET請求

import requests

# Plain GET: inspect the basic attributes of the Response object.
resp = requests.get("https://www.baidu.com/")
#print(type(resp.text))              # str, decoded using resp.encoding
#print(resp.text)
#print(type(resp.content))           # bytes, the raw body
#print(resp.content.decode('utf-8'))
print(resp.url)
print(resp.encoding)
print(resp.status_code)

# GET with query parameters (`params`) and a browser User-Agent.
query = {
    'wd':'中國'
}
ua_headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
resp = requests.get("https://www.baidu.com/s", params=query, headers=ua_headers)

with open('baidu.html','w',encoding='utf-8') as fp:
    fp.write(resp.content.decode('utf-8'))

# The final URL shows the encoded query string appended by `params`.
print(resp.url)

#發送POST請求

import requests

# POST form fields Lagou's Ajax endpoint expects.
data = {
    'first':'true',
    'pn':'1',
    'kw':'python'
}
headers = {
    'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=p',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# FIX: the original URL had the text "Request Method: POST" (copied from the
# browser dev-tools panel) pasted onto the end of the query string, so the
# request targeted a garbage resource.
response = requests.post('https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false', data=data,headers=headers)
print(type(response.json()))
print(response.text)

requests使用代理

import requests

# Route the request through an HTTP proxy via the `proxies` mapping;
# httpbin then echoes the proxy's IP instead of ours.
# NOTE(review): the address has no port — proxies usually need
# "host:port"; confirm before relying on this.
http_proxy = {
    'http':'123.149.38.52'
}
resp = requests.get('http://httpbin.org/ip', proxies=http_proxy)
print(resp.text)

requests處理cookie信息

# A Session keeps cookies between requests, so the login cookie set by the
# POST is sent automatically on the following GET (scraping Zhihu again).
login_url = 'https://www.zhihu.com/signin?next=%2Fsettings%2Faccount'
credentials = { "username":"19912456595","password":"pythoncookie0"  }
ua_headers = {
     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36' 
}
session = requests.Session()
session.post(login_url, data=credentials, headers=ua_headers)
follow_page = session.get('https://www.zhihu.com/follow')
with open('zhihu1.html','w',encoding='utf-8') as f:
    f.write(follow_page.text)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章