Python3 urllib(網絡數據獲取模塊)

本文由 Luzhuo 編寫,轉發請保留該信息. 
原文: http://blog.csdn.net/Rozol/article/details/71307483


以下代碼以Python3.6.1爲例
Less is more!


#coding=utf-8
# urllibdemo.py urllib演示

# urllib 用於處理Url相關的工具,用於從網絡獲取數據(網頁源碼/下載資源)
from urllib import request # 請求url, 支持 HTTP(0.9/1.0) / FTP / 本地文件 / URL
from urllib import parse # 解析url, 支持 file / ftp / gopher / hdl / http / https / imap / mailto / mms / news / nntp / prospero / rsync / rtsp / rtspu / sftp / shttp / sip / sips / snews / svn / svn+ssh / telnet / wais
from urllib import robotparser # 分析 robots.txt 文件
from urllib import error # 異常
import re # 正則模塊
from bs4 import BeautifulSoup
import os

# Demo: download the images found on the douyu.com front page.
def demo():
    """Fetch the douyu.com homepage and save its images under ./images/.

    Shows two ways of extracting image URLs from HTML: a regex and
    Beautiful Soup. Performs live network I/O; images are written as
    ./images/<n>.<ext>.
    """
    # os.mkdir raised FileNotFoundError/FileExistsError on reruns;
    # makedirs(exist_ok=True) makes the demo re-runnable.
    os.makedirs("images", exist_ok=True)

    # -- Fetch the page source --
    # `with` guarantees the connection is closed; the original `f.close`
    # (without parentheses) never actually called close().
    with request.urlopen("https://www.douyu.com") as f:
        data = f.read().decode("utf-8")

    # -- Extract image URLs from the page source --
    # Approach 1: regular expression. findall with two groups yields
    # (url, extension) tuples.
    images = re.findall(r'src="(.*?\.(jpg|png))"', data)
    for tnum, (url, ext) in enumerate(images):
        # Download each resource to a numbered file.
        request.urlretrieve(url, "./images/%d.%s" % (tnum, ext))

    # Approach 2: Beautiful Soup (pip install beautifulsoup4) — extract
    # content from html/xml tags.
    soup = BeautifulSoup(data, "html.parser")
    images = soup.find_all("img")  # all <img> tags

    for tnum, tag in enumerate(images):
        imgurl = tag.get("src")
        # get("src") returns None when the attribute is missing; the
        # original called len(None) and crashed. Skip empty/tiny values.
        if imgurl and len(imgurl) > 3:
            request.urlretrieve(imgurl, "./images/%d.jpg" % tnum)



# Parameter walk-through for the main urllib APIs.
def fun():
    """Demonstrate urllib.parse / urllib.request / urllib.error usage.

    Purely demonstrational: builds URLs, constructs Request objects,
    downloads resources and reads responses against luzhuo.me (live
    network I/O).
    """
    neturl = "http://luzhuo.me/blog/Base1.html"
    imgurl = "http://luzhuo.me/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg"

    # --- urllib.parse --- URL parsing helpers
    # - Encoding -
    neturl = "%s?%s" %(neturl, parse.urlencode({"name":"luzhuo", "age": 21})) # build a GET URL with query parameters
    data = parse.urlencode({"name":"luzhuo", "啊age": 21}).encode('ascii') # build a POST body (bytes)

    # - Decoding -
    urls = parse.urlparse(imgurl) # => ParseResult(scheme='http', netloc='luzhuo.me', path='/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg', params='', query='', fragment='')
    urls = parse.urlparse("//luzhuo.me/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg?a=1")
    scheme = urls.scheme # read individual components from the result

    # - Joining / replacing -
    url = parse.urljoin('http://luzhuo.me/blog/Base1.html', 'Fame.html') # replaces the last path segment => http://luzhuo.me/blog/Fame.html
    url = parse.urljoin('http://luzhuo.me/blog/Base1.htm', '//xxx/blog') # => http://xxx/blog

    # --- urllib.request --- issuing requests
    try:
        # - Request - construction
        req = request.Request(neturl) # GET
        req = request.Request(neturl, headers = {"1":"2"}) # with extra request headers
        req = request.Request(neturl, data=b'This is some datas.') # POST with a literal body
        req = request.Request(neturl, data) # POST with the urlencoded body built above
        req = request.Request(neturl, data=b"This is some datas.", method="PUT") # other HTTP methods

        # accessors
        url = req.full_url # the full URL
        reqtype = req.type # scheme (e.g. http)
        host = req.host # host, possibly with port (e.g. luzhuo.me / luzhuo.me:8080)
        host = req.origin_req_host # original host without port (e.g. luzhuo.me)
        url = req.selector # path portion (e.g. /blog/Base1.html)
        data = req.data # request body; None when absent
        boolean = req.unverifiable # whether the request is "unverifiable" per RFC 2965
        method = req.get_method() # HTTP method (e.g. GET / POST)
        # mutators
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36") # add a header; duplicate keys are overwritten
        req.add_unredirected_header("Key", "value") # header that is not forwarded on redirects
        req.remove_header("Key") # delete a header
        req.get_header("Key") # read a header; returns None when missing
        req.get_header("Key", "None.") # read a header with an explicit default
        boolean = req.has_header("Key") # header presence test
        headers = req.header_items() # list of all (name, value) header pairs
        req.set_proxy("220.194.55.160:3128", "http") # route through a proxy (host, scheme)
        # downloads
        filename, headers = request.urlretrieve(imgurl) # download to a temp file; returns a (filename, headers) tuple
        filename, headers = request.urlretrieve(imgurl, filename="./xxx.jpg", reporthook=callback, data=None) # reporthook receives download progress
        request.urlcleanup() # remove temporary files left behind by urlretrieve

        # - response - request results
        res = request.urlopen(neturl) # GET; opens the URL and returns a response
        res = request.urlopen(neturl, data=b'This is some datas.') # POST with a body
        res = request.urlopen(req) # also accepts a Request instance
        # reading
        data = res.read().decode("utf-8") # read the whole body
        data = res.readline().decode("utf-8") # read one line
        url = res.geturl() # the (possibly redirected) URL
        info = res.info() # metadata, e.g. response headers
        code = res.getcode() # HTTP status code

        # release the connection — the original wrote `res.close` without
        # parentheses, which never actually called the method.
        res.close()

    # --- urllib.error ---
    # HTTPError and ContentTooShortError are SUBCLASSES of URLError, so they
    # must be caught before it; the original caught URLError first, making
    # the two handlers below unreachable.
    except error.HTTPError as e:
        # carries code / reason / headers
        print(e)
    except error.ContentTooShortError as e:
        # download ended before the announced Content-Length was received
        print(e)
    except error.URLError as e:
        print(e)



def robot():
    """Demonstrate urllib.robotparser against zhihu.com's robots.txt.

    Performs live network I/O: fetches the robots.txt, asks whether a URL
    may be crawled, and exercises the modification-time helpers.
    """
    # --- urllib.robotparser --- robots.txt handling
    parser = robotparser.RobotFileParser()
    # Point the parser at the robots.txt to analyse, then fetch & parse it.
    parser.set_url("https://www.zhihu.com/robots.txt")
    parser.read()
    # Is user-agent "*" allowed to fetch this URL under those rules?
    allowed = parser.can_fetch("*", "http://www.musi-cal.com/")
    # Timestamp of the last robots.txt fetch, then reset it to "now".
    fetched_at = parser.mtime()
    parser.modified()



# 下載進度回調
def callback(datanum, datasize, filesize): # (數據塊數量 數據塊大小 文件大小)
    down = 100 * datanum * datasize / filesize

    if down > 100:
        down = 100

    print ("%.2f%%"%down)



if __name__ == "__main__":
    demo()
    fun()
    robot()





發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章