Python Web Scraping: Lesson 1

GET request

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
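
read() returns raw bytes, so the decode step is always yours to do. A small variation (assuming the server declares its charset in the Content-Type header) that reads the charset instead of hard-coding it:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
# headers is an email.message.Message; get_content_charset() pulls the
# charset out of the Content-Type header, or returns None if it is absent
charset = response.headers.get_content_charset() or 'utf-8'
print(response.read().decode(charset))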

POST request

# POST request
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'form_email':'sun', 'form_password':'123456'}), encoding='utf8')
print(data)
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())
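
Passing data is what turns the request into a POST. The same call can be written with a Request object, where the method can also be set explicitly (a sketch; the fake credentials above will of course be rejected by Douban):

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'form_email': 'sun', 'form_password': '123456'}), encoding='utf8')
# method='POST' is redundant once data is supplied, but it makes the intent explicit
req = urllib.request.Request('https://www.douban.com/accounts/login?source=main', data=data, method='POST')
response = urllib.request.urlopen(req)
print(response.status)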

Setting a timeout

# Setting a timeout
import urllib.request

# 0.01 s is deliberately short, so this call will normally raise
# urllib.error.URLError with a socket.timeout as its reason
response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
print(response.read())

Setting a timeout and handling the error

# Setting a timeout and handling the error
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

The response: type, status code, and headers

# The response: type, status code, and headers
import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
print(type(response))
print(type(response.status))
print(response.status)
# getheaders() returns the full list of response headers; getheader(name) fetches a single header by name
print(type(response.getheaders()))
print(response.getheader('date'))
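
getheaders() returns a list of (name, value) tuples, so the full header set can be walked directly:

import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
for name, value in response.getheaders():
    print(name, '=', value)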

Setting request headers and adding a proxy

# Setting request headers and adding a proxy
from urllib import request, parse

url = 'https://36kr.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com'
}
# one proxy per scheme; duplicate keys in a dict literal silently overwrite each other
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
opener = request.build_opener(proxy_handle)

req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
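
When urlopen is called in many places, the opener can also be installed globally with request.install_opener, after which plain urlopen calls go through the proxy too. A sketch (the proxy addresses are the ones from above and may no longer be alive):

from urllib import request

proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
# install_opener makes this opener the default for urllib.request.urlopen
request.install_opener(request.build_opener(proxy_handle))
response = request.urlopen('https://36kr.com/')
print(response.status)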

Cookies with HTTPCookieProcessor

# Cookies with HTTPCookieProcessor
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(cookie)
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)
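
The point of wiring a CookieJar into the opener is persistence across requests: cookies the server sets on the first response are sent back automatically on every later request made through the same opener. A minimal sketch:

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')  # first response fills the jar
opener.open('http://www.baidu.com')  # stored cookies ride along automatically
print(len(cookie), 'cookie(s) held')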

Saving cookies to a file with MozillaCookieJar

# Saving cookies to a file with MozillaCookieJar
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
if_exist = os.path.exists(filename)   # check whether a cookie file is already present
print(if_exist)

cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# ignore_discard: also save session cookies marked to be discarded
# ignore_expires: also save cookies that have already expired
cookie.save(ignore_discard=True, ignore_expires=True)

Saving cookies to a file with LWPCookieJar

# Saving cookies to a file with LWPCookieJar
import http.cookiejar
import urllib.request

filename = 'cookie1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

Loading cookies from a file

# Loading cookies from a file
import http.cookiejar
import urllib.request

# the jar class must match the format the file was saved in (LWP here)
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

Exception handling

HTTPError is a subclass of URLError, so it must be caught first; it adds the code and headers attributes on top of the reason attribute that both share.

# Exception handling
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)

urlparse: splitting a URL

# urlparse: splitting a URL

from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result)
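
The ParseResult is a named tuple, so the six components can be read by attribute as well as by index. For the URL above:

from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result.scheme)   # 'https'
print(result.netloc)   # 'www.cnblogs.com'
print(result.path)     # '/zhaof/p/6910871.html'
print(result.query)    # '' (this URL carries no query string)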

urlunparse: assembling a URL

# urlunparse: assembling a URL
from urllib.parse import urlunparse

data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html','','','']
print(urlunparse(data))
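
urlunparse is the inverse of urlparse: feeding the six parsed components straight back reproduces the original URL. A quick round-trip check:

from urllib.parse import urlparse, urlunparse

url = 'https://www.cnblogs.com/zhaof/p/6910871.html'
# for a well-formed URL, parse followed by unparse is the identity
print(urlunparse(urlparse(url)) == url)  # True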

urljoin: joining URLs

# urljoin: joining URLs
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com','FAQ.html'))
print(urljoin('http://www.baidu.com','http://www.qdaily.com/tags/29.html'))
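
urljoin resolves its second argument against the first the way a browser resolves a link: a relative path is merged into the base, while a complete URL simply replaces it. One more case, with an upward-relative path:

from urllib.parse import urljoin

# '..' steps out of /about/, giving http://www.baidu.com/index.html
print(urljoin('http://www.baidu.com/about/FAQ.html', '../index.html'))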

urlencode: turning a dict into a query string

# urlencode: turning a dict into a query string
from urllib.parse import urlencode

param = {
    "name":"Lyli",
    "age":"23"
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(param)
print(url)

Note: the material in this post comes from "Python 爬虫从入门到放弃(三)之 Urllib库的基本使用" (Python scraping from beginner to giving up, part 3: basic usage of the urllib library).
