python爬蟲入門xpath

原創

2020-02-23 02:43

import requests
import csv
import os
from lxml import etree


def getHtml(name, page):
    """Fetch one page of search results from search.jd.com.

    Args:
        name: Search keyword, sent as the ``keyword`` query parameter.
        page: Page value, sent as the ``page`` query parameter.

    Returns:
        The decoded response body as a string, or ``None`` when the
        request could not be completed.
    """
    url = "https://search.jd.com/Search?"
    params = {"keyword": name, "enc": "utf-8", "page": page}
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(url=url, params=params, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(e)
        return None

    # Let requests guess the charset from the body instead of the HTTP header.
    res.encoding = res.apparent_encoding
    return res.text


def getdiv(html):
    """Return the product container nodes found in a search-result page.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            accepted and yields an empty list.

    Returns:
        A list of ``div`` elements whose ``class`` attribute is exactly
        ``gl-i-wrap``; empty when there is nothing to parse.
    """
    if not html:
        # A failed download gives us nothing to parse.
        return []
    xpl = etree.HTML(html)  # parse the page and build the XPath-capable tree
    if xpl is None:
        # The parser may produce no tree at all for a document without content.
        return []
    return xpl.xpath("//div[@class='gl-i-wrap']")


def _parse_item(item):
    """Build the CSV row dict for one product node; missing fields become ""."""
    fragments = item.xpath('div[@class="p-name p-name-type-2"]//em/text()')
    print(fragments)
    # The title may arrive as several text fragments; use all of them.
    title = "".join(fragments).strip()
    # partition() keeps the whole title as the name when there is no space,
    # instead of slicing with the -1 that str.find() returns.
    name, _, description = title.partition(" ")

    prices = item.xpath('div[@class="p-price"]//i/text()')
    shops = item.xpath('div[@class="p-shop"]//a/text()')
    pairs = list(zip(prices, shops))
    if pairs:
        price, shop = pairs[-1]
    else:
        # Keep whichever half is present rather than dropping both.
        price = prices[0] if prices else ""
        shop = shops[0] if shops else ""

    comments = item.xpath('div[@class="p-commit"]//a/text()')
    print("評論數:", "".join(comments).strip(), "條評價")

    imgs = item.xpath('div[@class="p-img"]//img[@source-data-lazy-img]')
    img = imgs[-1].get("source-data-lazy-img") if imgs else ""

    return {
        "商品名稱": name,
        "描述": description,
        "價格": price,
        "店鋪": shop,
        "圖片鏈接": img,
    }


def getTitle(divs):
    """Extract product fields from each node and append them to ``jd.csv``.

    Every node contributes exactly one row built only from its own data, so
    a product with a missing field gets an empty cell rather than a value
    left over from the previous product.

    Args:
        divs: Iterable of product nodes exposing ``xpath(expr)``, as
            returned by ``getdiv``.

    Returns:
        None. Rows are appended to ``jd.csv`` in the current directory; the
        header is written only when the file is empty. Nothing is written
        when ``divs`` is empty.
    """
    title = ["商品名稱", "描述", "價格", "店鋪", "圖片鏈接"]
    rows = []
    for item in divs:
        print("-------------------------------------------------------------")
        rows.append(_parse_item(item))

    if not rows:
        return

    # Open the file once per call instead of once per row.
    with open("jd.csv", "a", encoding="utf8", newline="") as f:
        dictwri = csv.DictWriter(f, title)
        # Checked before any write, so an empty file means a new file.
        if os.path.getsize("jd.csv") == 0:
            dictwri.writeheader()
        dictwri.writerows(rows)


if __name__ == "__main__":
    # Crawl result pages 1 through 99 for the keyword and append every
    # product found to the CSV file.
    for i in range(1, 100):
        html = getHtml("寵物", str(i))
        if not html:
            # getHtml yields None when the request failed; skip that page
            # instead of handing an unparsable value to the parser.
            continue
        divs = getdiv(html)
        getTitle(divs)

運算符	描述	實例	返回值
or	或	price=9.80 or price=9.70	如果 price 是 9.80，則返回 true；如果 price 是 9.50，則返回 false
and	與	price>9.00 and price<9.90	如果 price 是 9.80，則返回 true；如果 price 是 8.50，則返回 false
mod	計算除法的餘數	5 mod 2	1
|	計算兩個節點集	//book | //cd	返回所有擁有 book 和 cd 元素的節點集
+	加法	6 + 4	10
-	減法	6 - 4	2
*	乘法	6 * 4	24
div	除法	8 div 4	2

運算符	描述	實例	返回值
or	或	price=9.80 or price=9.70	如果 price 是 9.80，則返回 true；如果 price 是 9.50，則返回 false
and	與	price>9.00 and price<9.90	如果 price 是 9.80，則返回 true；如果 price 是 8.50，則返回 false
mod	計算除法的餘數	5 mod 2	1
|	計算兩個節點集	//book | //cd	返回所有擁有 book 和 cd 元素的節點集
+	加法	6 + 4	10
-	減法	6 - 4	2
*	乘法	6 * 4	24
div	除法	8 div 4	2

有時在選取節點時，某些屬性可能同時匹配了多個節點，但是只想要其中的某個節點，如第二個節點，或者最後一個節點，這時該怎麼辦呢？

可以利用中括號傳入索引的方法獲取特定次序的節點，示例如下：

from lxml import etree

# Parse the sample file with the HTML parser.
html = etree.parse('testData.html', etree.HTMLParser())

# XPath positions are 1-based: [1] selects the first <li>.
result = html.xpath('//li[1]/a/text()')
print(result)

# last() selects the final <li>.
result = html.xpath('//li[last()]/a/text()')
print(result)

# position()<3 selects the first two <li> elements.
result = html.xpath('//li[position()<3]/a/text()')
print(result)

# last()-2 selects the third <li> counting from the end.
result = html.xpath('//li[last()-2]/a/text()')
print(result)

表達式	描述
nodename	選取此節點的所有子節點
/	從當前節點選取直接子節點
//	從當前節點選取子孫節點
.	選取當前節點
..	選取當前節點的父節點
@	選取屬性

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

python爬蟲入門xpath

Python 爬蟲：Spring Boot 反爬蟲的成功案例

京東科技數字化營銷能力的演進與最佳實踐| 京東雲技術團隊

VirtualAlloc的使用

http頭部常見信息

Linux複習： semaphore.h信號量和生產者消費者

傳遞對象和傳遞指針（是否產生多態）

排序1 O（nlogn）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結