python 爬蟲學習第一課

get方式請求

# Simplest possible GET request: fetch the Baidu homepage and print the HTML.
import urllib.request

with urllib.request.urlopen('http://www.baidu.com') as response:
    body = response.read()
print(body.decode('utf-8'))

post方式請求

# POST request: form fields must be percent-encoded and passed to urlopen
# as bytes via the data= argument (presence of data makes it a POST).
import urllib.parse
import urllib.request

form = {'form_email': 'sun', 'form_password': '123456'}
data = bytes(urllib.parse.urlencode(form), encoding='utf8')
print(data)
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())

超時時間設置

# Timeout demo: an unrealistically small timeout (10 ms) makes the request
# fail fast; with no handler the resulting URLError propagates.
import urllib.request

target = 'https://www.douban.com/'
response = urllib.request.urlopen(target, timeout=0.01)
print(response.read())

超時時間設置,對錯誤進行處理

# Timeout with error handling: urlopen wraps the low-level socket.timeout
# inside urllib.error.URLError, so inspect e.reason to detect a timeout.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as err:
    if isinstance(err.reason, socket.timeout):
        print('TIME OUT')

響應:響應類型、狀態碼、響應頭

# Inspect the response object: its type, HTTP status code, and headers.
import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
for item in (type(response), type(response.status), response.status):
    print(item)
# getheaders() returns every header as a list of (name, value) tuples;
# getheader(name) looks up a single header's value.
print(type(response.getheaders()))
print(response.getheader('date'))

設置request請求頭,添加代理

# Build a Request with custom headers and route it through a proxy.
from urllib import request, parse

url = 'https://36kr.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com'
}
# BUG FIX: the original dict used the key 'http' twice, so the first proxy
# was silently discarded and the https proxy was registered under the
# 'http' scheme. ProxyHandler expects one entry per URL scheme.
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
opener = request.build_opener(proxy_handle)

req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))

cookie, HTTPCookieProcessor

# cookie, HTTPCookieProcessor: capture the cookies a server sets and
# iterate over them (each item is an http.cookiejar.Cookie).
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# BUG FIX: the URL was misspelled 'wwww.baidu.com' (four w's), which does
# not resolve; corrected so the request succeeds and cookies are received.
response = opener.open('http://www.baidu.com')
print(cookie)
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)

cookie保存到文件,MozillaCookieJar

# Persist cookies to a file in Mozilla/Netscape cookies.txt format.
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
print(os.path.exists(filename))

cookie = http.cookiejar.MozillaCookieJar(filename)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
# Keep session cookies (ignore_discard) and expired ones (ignore_expires)
# in the dump instead of dropping them.
cookie.save(ignore_discard=True, ignore_expires=True)

cookie保存到文件, LWPCookieJar

# Persist cookies in libwww-perl (LWP) format instead of Mozilla format;
# the jar remembers the filename given at construction, so save() needs none.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar('cookie1.txt')
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

讀取文件中的cookie

# Load previously saved LWP-format cookies and attach them to new requests.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

異常處理

# Exception handling: HTTPError is a subclass of URLError, so it must be
# caught first; it carries code and headers in addition to reason.
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    for attr in ('reason', 'code', 'headers'):
        print(getattr(e, attr))
except error.URLError as e:
    print(e.reason)

urlparse url拆分

# urlparse: split a URL into its six components
# (scheme, netloc, path, params, query, fragment).

from urllib.parse import urlparse

sample_url = 'https://www.cnblogs.com/zhaof/p/6910871.html'
result = urlparse(sample_url)
print(result)

urlunparse url合併

# urlunparse: assemble a URL back from a 6-item sequence
# (scheme, netloc, path, params, query, fragment).
from urllib.parse import urlunparse

data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html', '', '', '']
print(urlunparse(data))

urljoin url拼接

# urljoin: resolve a (possibly relative) URL against a base; an absolute
# second argument replaces the base entirely.
from urllib.parse import urljoin
for base, target in (('http://www.baidu.com', 'FAQ.html'),
                     ('http://www.baidu.com', 'http://www.qdaily.com/tags/29.html')):
    print(urljoin(base, target))

urlencode 字典url

# urlencode: turn a dict of parameters into a percent-encoded query string
# and append it to a base URL.
from urllib.parse import urlencode

param = {
    "name": "Lyli",
    "age": "23",
}
url = 'http://www.baidu.com?' + urlencode(param)
print(url)

注:本文學習材料來自於python爬蟲從入門到放棄(三)之 Urllib庫的基本使用

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章