python 爬蟲學習第一課

get方式請求

# Simplest possible GET request: fetch the Baidu homepage and print the HTML.
import urllib.request

with urllib.request.urlopen('http://www.baidu.com') as response:
    body = response.read()
print(body.decode('utf-8'))

post方式請求

# POST request: form fields must be percent-encoded and passed to urlopen
# as bytes via the data= argument (presence of data makes it a POST).
import urllib.parse
import urllib.request

form = {'form_email': 'sun', 'form_password': '123456'}
data = bytes(urllib.parse.urlencode(form), encoding='utf8')
print(data)
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())

超時時間設置

# Timeout demo: an unrealistically small timeout (10 ms) makes the request
# fail fast; with no handler the resulting URLError propagates.
import urllib.request

target = 'https://www.douban.com/'
response = urllib.request.urlopen(target, timeout=0.01)
print(response.read())

超時時間設置,對錯誤進行處理

# Timeout with error handling: urlopen wraps the low-level socket.timeout
# inside urllib.error.URLError, so inspect e.reason to detect a timeout.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as err:
    if isinstance(err.reason, socket.timeout):
        print('TIME OUT')

響應:響應類型、狀態碼、響應頭

# Inspect the response object: its type, HTTP status code, and headers.
import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
for item in (type(response), type(response.status), response.status):
    print(item)
# getheaders() returns every header as a list of (name, value) tuples;
# getheader(name) looks up a single header's value.
print(type(response.getheaders()))
print(response.getheader('date'))

設置request請求頭,添加代理

# Build a Request with custom headers and route it through a proxy.
from urllib import request, parse

url = 'https://36kr.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com'
}
# BUG FIX: the original dict used the key 'http' twice, so the first proxy
# was silently discarded and the https proxy was registered under the
# 'http' scheme. ProxyHandler expects one entry per URL scheme.
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
opener = request.build_opener(proxy_handle)

req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))

cookie, HTTPCookieProcessor

# cookie, HTTPCookieProcessor: capture the cookies a server sets and
# iterate over them (each item is an http.cookiejar.Cookie).
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# BUG FIX: the URL was misspelled 'wwww.baidu.com' (four w's), which does
# not resolve; corrected so the request succeeds and cookies are received.
response = opener.open('http://www.baidu.com')
print(cookie)
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)

cookie保存到文件,MozillaCookieJar

# Persist cookies to a file in Mozilla/Netscape cookies.txt format.
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
print(os.path.exists(filename))

cookie = http.cookiejar.MozillaCookieJar(filename)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
# Keep session cookies (ignore_discard) and expired ones (ignore_expires)
# in the dump instead of dropping them.
cookie.save(ignore_discard=True, ignore_expires=True)

cookie保存到文件, LWPCookieJar

# Persist cookies in libwww-perl (LWP) format instead of Mozilla format;
# the jar remembers the filename given at construction, so save() needs none.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar('cookie1.txt')
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

讀取文件中的cookie

# Load previously saved LWP-format cookies and attach them to new requests.
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

異常處理

# Exception handling: HTTPError is a subclass of URLError, so it must be
# caught first; it carries code and headers in addition to reason.
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    for attr in ('reason', 'code', 'headers'):
        print(getattr(e, attr))
except error.URLError as e:
    print(e.reason)

urlparse url拆分

# urlparse: split a URL into its six components
# (scheme, netloc, path, params, query, fragment).

from urllib.parse import urlparse

sample_url = 'https://www.cnblogs.com/zhaof/p/6910871.html'
result = urlparse(sample_url)
print(result)

urlunparse url合併

# urlunparse: assemble a URL back from a 6-item sequence
# (scheme, netloc, path, params, query, fragment).
from urllib.parse import urlunparse

data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html', '', '', '']
print(urlunparse(data))

urljoin url拼接

# urljoin: resolve a (possibly relative) URL against a base; an absolute
# second argument replaces the base entirely.
from urllib.parse import urljoin
for base, target in (('http://www.baidu.com', 'FAQ.html'),
                     ('http://www.baidu.com', 'http://www.qdaily.com/tags/29.html')):
    print(urljoin(base, target))

urlencode 字典url

# urlencode: turn a dict of parameters into a percent-encoded query string
# and append it to a base URL.
from urllib.parse import urlencode

param = {
    "name": "Lyli",
    "age": "23",
}
url = 'http://www.baidu.com?' + urlencode(param)
print(url)

注:本文學習材料來自於python爬蟲從入門到放棄(三)之 Urllib庫的基本使用

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章