Web Crawler Study Notes - day02


Crawler Workflow

Import the modules
Specify the url
Wrap the request parameters
Override the UA (UA spoofing)
Send the request
Handle the response
Process the data
Persist the results

Two broad kinds of crawler appear below: the general-purpose crawler, which fetches whole pages, and the focused crawler, which extracts specific data out of them.

Beginner Crawler Code Example

# The simplest possible crawler

# Import the module
import requests

# Specify the url
url = "https://www.sogou.com/web"

# Send the request and get the response
rep = requests.get(url=url)

# Get the page's declared encoding
repEncode = rep.encoding

# Persist the page under an arbitrary file name
filename = "sogou.html"
with open(filename, "w", encoding=repEncode) as f:
    f.write(rep.text)


About the Response Object

Attribute             Result
response.status_code  Status code; 200 means success
response.encoding     The page's declared character encoding
response.text         Response body as a string
response.content      Response body as bytes
response.url          The URL that was requested
response.headers      The response headers
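
A quick sketch that exercises each of these against the sogou URL from the example above (any reachable URL would do):

import requests

rep = requests.get("https://www.sogou.com/web")

print(rep.status_code)                    # e.g. 200
print(rep.encoding)                       # encoding declared by the server
print(rep.url)                            # final URL after any redirects
print(rep.headers.get("Content-Type"))    # one entry from the response headers
print(type(rep.text), type(rep.content))  # <class 'str'> <class 'bytes'>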

General Crawler - the requests Module

Code Examples

GET request

"""

帶參數發送get請求, 之後所有代碼都會進行ua僞裝

"""
# Import the module
import requests

# Override the UA (UA spoofing)
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# Specify the url
url = "https://www.sogou.com/web"

# Wrap the parameters
params = {
    "query": "美女圖片",
}

# Send the request and get the response
rep = requests.get(url=url, params=params, headers=headers)

# Get the page's declared encoding, then persist the page under an arbitrary file name
repEncode = rep.encoding
filename = "sogou_query.html"
with open(filename, "w", encoding=repEncode) as f:
    f.write(rep.text)


POST request

import requests

# Specify the url (placeholder)
url = "xxx"

# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# Set the form parameters
data = {
    "keyA": "valueA",
    "keyB": "valueB",
}

# Send the POST request
rep = requests.post(url=url, data=data, headers=headers)

# Get the data
repText = rep.text

# Stand-in for persistence
print(repText)


AJAX GET request

import requests

# Determine the url via packet capture; the query string is passed separately below
url = "http://image.so.com/zjl"

# Wrap the parameters
param = {
    "ch": "beauty",
    "sn": "60",
    "listtype": "new",
    "temp": "1",
}

# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# Send the request
rep = requests.get(url=url, params=param, headers=headers)

repEncode = rep.encoding
repText = rep.text

print(repText)

# Write the result to a file
with open("360pic美女.html", "w", encoding=repEncode) as f:
    f.write(repText)
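
This endpoint returns JSON rather than HTML, so the body can also be parsed with rep.json() instead of being saved as text. A minimal sketch; rather than guessing field names, pretty-print the structure first and read the image URL fields off it:

import json
import requests

url = "http://image.so.com/zjl"
param = {"ch": "beauty", "sn": "60", "listtype": "new", "temp": "1"}
headers = {"User-Agent": "Mozilla/5.0"}

rep = requests.get(url=url, params=param, headers=headers)

# Parse the JSON body into Python dicts/lists
data = rep.json()

# Pretty-print to see which fields hold the image URLs
print(json.dumps(data, ensure_ascii=False, indent=2))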
    

AJAX POST request

import requests

# Determine the url via packet capture
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"

# Form parameters
data = {
    "cname": "",
    "pid": "",
    "keyword": "北京",
    "pageindex": "3",
    "pageSize": "10",
}

# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

rep = requests.post(url=url, data=data, headers=headers)

print(rep.text)
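
Because the page number travels in the form data, the same request can be repeated over several pages. A sketch, assuming the endpoint keeps accepting consecutive pageindex values:

import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch pages 1 through 3 by varying only pageindex
for page in range(1, 4):
    data = {
        "cname": "",
        "pid": "",
        "keyword": "北京",
        "pageindex": str(page),
        "pageSize": "10",
    }
    rep = requests.post(url=url, data=data, headers=headers)
    print(rep.text)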


Exercise - Basic Crawler

# GET request: scrape the pages in a user-supplied page range
#
# Covers:
#     1. user interaction
#     2. proxy IPs

import requests

# Specify the url
url = "https://www.sogou.com/web"

# Read the keyword and page range from the user
keyword = input("input key word: ")
startPage = int(input("input start page: "))
endPage = int(input("input end page: "))

# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# Loop over the page range, build the parameters for each page, and request it,
# this time through a proxy IP
for i in range(startPage, endPage + 1):
    params = {
        "query": keyword,
        "page": str(i),
    }

    # Proxy IP setting; free proxies are listed at www.goubanjia.com
    # Pick an IP on that page and fill the scheme and ip:port into the proxies argument
    rep = requests.get(url=url, params=params, headers=headers, proxies={"http": "117.127.16.207:8080"})

    # Get the page's declared encoding
    repEncode = rep.encoding

    # File name, e.g. keyword_1.html
    fileName = "{}_{}.html".format(keyword, i)

    # Write the page out using that encoding
    with open(fileName, "w", encoding=repEncode) as f:
        f.write(rep.text)
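
One caveat about the proxies argument: requests selects a proxy by URL scheme, so a dict with only an "http" key is ignored for https:// URLs like the one above. A sketch covering both schemes, reusing the url, params, and headers from the loop above (the address is the sample one, not a live server):

proxies = {
    "http": "117.127.16.207:8080",   # used for http:// URLs
    "https": "117.127.16.207:8080",  # used for https:// URLs
}
rep = requests.get(url=url, params=params, headers=headers, proxies=proxies)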


Focused Crawler

Regular Expressions

Introduction

Regular expressions pull target substrings out of raw page text by pattern matching. The crawler below relies on non-greedy matching (.*?) together with the re.S flag, so the pattern can span line breaks in the HTML.
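
A tiny self-contained sketch of that pattern style, run against a made-up HTML fragment:

import re

html = '<div class="thumb"><img src="//pic.example.com/a.jpg" alt=""></div>'

# Non-greedy .*? stops at the first possible match; re.S lets . span newlines
rule = '<div class="thumb">.*?<img src="//(.*?)" .*?</div>'
print(re.findall(rule, html, re.S))  # ['pic.example.com/a.jpg']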


Code Example

import requests
import re
import os

# Specify the url
url = "https://www.qiushibaike.com/pic/"

# UA spoofing
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

# Send the request and get the data
rep = requests.get(url=url, headers=headers)

# Get the page text
repText = rep.text

# Define the regex rule (uses non-greedy matching with .*?)
re_rule = '<div class="thumb">.*?<img src="//(.*?)" .*?</div>'

# Extract the image addresses
# re.S (DOTALL) lets . match newlines; re.M (MULTILINE) anchors ^/$ per line; re.I ignores case
img_url = re.findall(re_rule, repText, re.S)

# Set up the image storage directory
if not os.path.exists("./qiutu"):
    os.mkdir("./qiutu")

# Loop over the regex matches
for i in img_url:
    # Complete the url (the src attribute is protocol-relative)
    url = "http://" + i

    # Request the image
    r = requests.get(url=url, headers=headers)

    # Get the binary data
    rCont = r.content

    # Build the file name from the last path segment
    filename = i.split("/")[-1]

    # Storage path
    file = "./qiutu/" + filename

    # Persist the image
    with open(file, "wb") as f:
        f.write(rCont)
        print("{} downloaded".format(file))


XPath

XPath Expressions

Attribute match:
    # Find div tags whose class attribute is "song"
    //div[@class="song"]

Hierarchy & index:
    # Find the a tag directly under the second li child of the ul directly under the div with class "tang"
    //div[@class="tang"]/ul/li[2]/a

Logical operators:
    # Find a tags whose href is empty and whose class is "du"
    //a[@href="" and @class="du"]

Fuzzy matching:
    //div[contains(@class, "ng")]
    //div[starts-with(@class, "ta")]

Text extraction:
    # /text() gets the text directly under a tag
    # //text() gets the tag's text plus the text of all its descendants

    //div[@class="song"]/p[1]/text()
    //div[@class="tang"]//text()

Attribute extraction:
    //div[@class="tang"]//li[2]/a/@href


Code Example

# Scrape the "upcoming tests" list from the 17173 mobile game site
import requests
from lxml import etree
import xlwt


def write_excel(data):
    # Create a workbook with utf-8 encoding
    workbook = xlwt.Workbook(encoding='utf-8')
    # Create a worksheet
    worksheet = workbook.add_sheet('My Worksheet')

    # Write to Excel
    # The arguments are row, column, value
    for i in range(len(data)):
        for j in range(len(data[i])):
            # xpath() returns lists; join multi-item cells into a single string
            if isinstance(data[i][j], list):
                data[i][j] = ", ".join(data[i][j])
            worksheet.write(i, j, data[i][j])

    # Save
    workbook.save('Excel_test.xls')


def spider():
    """
    爬取特定url的數據
    並將數據處理, 返回列表
    :return:
    """
    url = "http://newgame.17173.com/shouyou/ceshi"

    # UA spoofing
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }

    # Send the request
    rep = requests.get(url=url, headers=headers)
    rep_text = rep.text

    tree = etree.HTML(rep_text)  # tree is an lxml Element object

    # Get the list of li rows
    div_list = tree.xpath('//div[@class="g-box4 box"][1]//ul[2]/li')

    ret = [["Game", "Launch date", "Test type", "Genre", "Platform", "Studio"]]

    for i in div_list:
        name = i.xpath('.//h6[@class="c1"]/a/text()')
        time = i.xpath('.//p[@class="c2"]/text()')
        qa_type = i.xpath('.//p[@class="c3"]/text()')
        game_type = i.xpath('.//i[@class="c4"]/text()')
        plate = i.xpath('./p[@class="c5"]/span//text()')
        auth = i.xpath('.//span[@class="c7"]/text()')

        data = [name, time, qa_type, game_type, plate, auth]
        ret.append(data)

    return ret


if __name__ == "__main__":
    data = spider()
    # print(data)
    write_excel(data)
	