Python Web Scraping: Lesson 1

GET request

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
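
read() returns raw bytes, so the decode step is always yours to do. A small variation (assuming the server declares its charset in the Content-Type header) that reads the charset instead of hard-coding it:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
# headers is an email.message.Message; get_content_charset() pulls the
# charset out of the Content-Type header, or returns None if it is absent
charset = response.headers.get_content_charset() or 'utf-8'
print(response.read().decode(charset))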

POST request

# POST request
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'form_email':'sun', 'form_password':'123456'}), encoding='utf8')
print(data)
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())
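
Passing data is what turns the request into a POST. The same call can be written with a Request object, where the method can also be set explicitly (a sketch; the fake credentials above will of course be rejected by Douban):

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'form_email': 'sun', 'form_password': '123456'}), encoding='utf8')
# method='POST' is redundant once data is supplied, but it makes the intent explicit
req = urllib.request.Request('https://www.douban.com/accounts/login?source=main', data=data, method='POST')
response = urllib.request.urlopen(req)
print(response.status)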

Setting a timeout

# Setting a timeout
import urllib.request

# 0.01 s is deliberately short, so this call will normally raise
# urllib.error.URLError with a socket.timeout as its reason
response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
print(response.read())

Setting a timeout and handling the error

# Setting a timeout and handling the error
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

The response: type, status code, and headers

# The response: type, status code, and headers
import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
print(type(response))
print(type(response.status))
print(response.status)
# getheaders() returns the full list of response headers; getheader(name) fetches a single header by name
print(type(response.getheaders()))
print(response.getheader('date'))
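
getheaders() returns a list of (name, value) tuples, so the full header set can be walked directly:

import urllib.request

response = urllib.request.urlopen('https://www.douban.com/')
for name, value in response.getheaders():
    print(name, '=', value)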

Setting request headers and adding a proxy

# Setting request headers and adding a proxy
from urllib import request, parse

url = 'https://36kr.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com'
}
# one proxy per scheme; duplicate keys in a dict literal silently overwrite each other
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
opener = request.build_opener(proxy_handle)

req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
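
When urlopen is called in many places, the opener can also be installed globally with request.install_opener, after which plain urlopen calls go through the proxy too. A sketch (the proxy addresses are the ones from above and may no longer be alive):

from urllib import request

proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000'
})
# install_opener makes this opener the default for urllib.request.urlopen
request.install_opener(request.build_opener(proxy_handle))
response = request.urlopen('https://36kr.com/')
print(response.status)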

Cookies with HTTPCookieProcessor

# Cookies with HTTPCookieProcessor
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(cookie)
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)
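
The point of wiring a CookieJar into the opener is persistence across requests: cookies the server sets on the first response are sent back automatically on every later request made through the same opener. A minimal sketch:

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')  # first response fills the jar
opener.open('http://www.baidu.com')  # stored cookies ride along automatically
print(len(cookie), 'cookie(s) held')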

Saving cookies to a file with MozillaCookieJar

# Saving cookies to a file with MozillaCookieJar
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
if_exist = os.path.exists(filename)   # check whether a cookie file is already present
print(if_exist)

cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# ignore_discard: also save session cookies marked to be discarded
# ignore_expires: also save cookies that have already expired
cookie.save(ignore_discard=True, ignore_expires=True)

Saving cookies to a file with LWPCookieJar

# Saving cookies to a file with LWPCookieJar
import http.cookiejar
import urllib.request

filename = 'cookie1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

Loading cookies from a file

# Loading cookies from a file
import http.cookiejar
import urllib.request

# the jar class must match the format the file was saved in (LWP here)
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

Exception handling

HTTPError is a subclass of URLError, so it must be caught first; it adds the code and headers attributes on top of the reason attribute that both share.

# Exception handling
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)

urlparse: splitting a URL

# urlparse: splitting a URL

from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result)
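
The ParseResult is a named tuple, so the six components can be read by attribute as well as by index. For the URL above:

from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result.scheme)   # 'https'
print(result.netloc)   # 'www.cnblogs.com'
print(result.path)     # '/zhaof/p/6910871.html'
print(result.query)    # '' (this URL carries no query string)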

urlunparse: assembling a URL

# urlunparse: assembling a URL
from urllib.parse import urlunparse

data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html','','','']
print(urlunparse(data))
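
urlunparse is the inverse of urlparse: feeding the six parsed components straight back reproduces the original URL. A quick round-trip check:

from urllib.parse import urlparse, urlunparse

url = 'https://www.cnblogs.com/zhaof/p/6910871.html'
# for a well-formed URL, parse followed by unparse is the identity
print(urlunparse(urlparse(url)) == url)  # True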

urljoin: joining URLs

# urljoin: joining URLs
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com','FAQ.html'))
print(urljoin('http://www.baidu.com','http://www.qdaily.com/tags/29.html'))
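
urljoin resolves its second argument against the first the way a browser resolves a link: a relative path is merged into the base, while a complete URL simply replaces it. One more case, with an upward-relative path:

from urllib.parse import urljoin

# '..' steps out of /about/, giving http://www.baidu.com/index.html
print(urljoin('http://www.baidu.com/about/FAQ.html', '../index.html'))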

urlencode: turning a dict into a query string

# urlencode: turning a dict into a query string
from urllib.parse import urlencode

param = {
    "name":"Lyli",
    "age":"23"
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(param)
print(url)

Note: the material in this post comes from "Python 爬虫从入门到放弃(三)之 Urllib库的基本使用" (Python scraping from beginner to giving up, part 3: basic usage of the urllib library).
