方法一:xpath
from lxml import etree
import requests
from requests import exceptions
def get_response(url, headers=None, timeout=None):
    """GET *url* and return the ``requests.Response``.

    :param url: target URL.
    :param headers: optional dict of request headers.
    :param timeout: optional timeout in seconds, forwarded to ``requests.get``.
    :return: the ``requests.Response`` object on success.
    :raises requests.exceptions.RequestException: on any request failure.

    BUG fix: the original wrapped the call in six ``except ...: raise e``
    branches and then did ``finally: return response``.  A ``return`` inside
    ``finally`` swallows any in-flight exception, so every ``raise e`` was
    dead code and the function silently returned ``None`` on failure,
    crashing callers later with ``AttributeError`` on ``.text``.
    """
    # Timeout/HTTPError/ConnectTimeout/ReadTimeout/ProxyError all derive
    # from RequestException; letting them propagate is what the original's
    # ``raise e`` branches clearly intended.
    return requests.get(url, headers=headers, timeout=timeout)
def get_content(etree_html, xpath):
    """Apply *xpath* to *etree_html* and return cleaned, non-empty matches.

    :param etree_html: parsed document (anything exposing an ``.xpath`` method).
    :param xpath: XPath expression expected to yield text strings.
    :return: list of matched strings with all newlines and spaces removed;
             matches that become empty after cleaning are dropped.

    BUG fix: the original also tested ``re_each == '\\n'``, which was dead
    code — newlines were already stripped by ``.replace('\\n', '')`` — so
    only the empty-string check can ever fire.
    """
    result = []
    for each in etree_html.xpath(xpath):  # .xpath() returns a list
        # Strip newlines and spaces embedded in the matched text.
        cleaned = each.replace('\n', '').replace(' ', '')
        if cleaned:
            result.append(cleaned)
    return result
# 然後,在爬蟲類的方法中調用上面兩個函數:
def get_spider_content_xpath(self):
    """Fetch ``self.url`` and extract text nodes via the configured XPath.

    Relies on the module-level ``param`` object (headers and xpath are
    presumably configured there — defined elsewhere, not visible in this
    file) and on the sibling helpers ``get_response``/``get_content``.
    """
    # Fetch the page with a 10-second timeout, parse the HTML source,
    # then run the configured XPath over the resulting tree.
    response = get_response(self.url, param.headers, 10)
    tree = etree.HTML(response.text)
    return get_content(tree, param.xpath)
result 就是通過 xpath 方式得到的數據內容;
其中,xpath 我寫了個簡單例子:
xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/text()"
方法二:正則匹配
import re

import requests
from lxml import etree
from requests import exceptions
def get_response(url, headers=None, timeout=None):
    """GET *url* and return the ``requests.Response``.

    :param url: target URL.
    :param headers: optional dict of request headers.
    :param timeout: optional timeout in seconds, forwarded to ``requests.get``.
    :return: the ``requests.Response`` object on success.
    :raises requests.exceptions.RequestException: on any request failure.

    BUG fix: the original wrapped the call in six ``except ...: raise e``
    branches and then did ``finally: return response``.  A ``return`` inside
    ``finally`` swallows any in-flight exception, so every ``raise e`` was
    dead code and the function silently returned ``None`` on failure,
    crashing callers later with ``AttributeError`` on ``.text``.
    """
    # Timeout/HTTPError/ConnectTimeout/ReadTimeout/ProxyError all derive
    # from RequestException; letting them propagate is what the original's
    # ``raise e`` branches clearly intended.
    return requests.get(url, headers=headers, timeout=timeout)
# 然後,在爬蟲類的方法中直接對頁面源碼做正則匹配:
def get_spider_content_re(self):
    """Fetch ``self.url`` and return the de-duplicated URLs found in its source.

    Requires ``self.url`` plus the module-level ``param.headers``
    (configured elsewhere — not visible in this file).

    :return: list of matched URL strings, first-seen order preserved.

    Fixes: ``re`` was used without being imported anywhere in this file
    (added to the import block), and the pattern is now a raw string —
    the original non-raw literal contained ``\\(``/``\\)`` escapes that
    raise DeprecationWarning on modern Python.
    """
    html = get_response(self.url, param.headers, 10)
    html_text = html.text
    # Match every http/https URL in the raw page source.  Note that
    # '[$-_@.&+]' is a character *range* from '$' to '_' — intentional in
    # this classic URL-matching recipe.
    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        html_text, re.S)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    urls = list(dict.fromkeys(urls))
    return urls
urls 就是要匹配的內容
其中,匹配 url 的正則表達式還算可用,試了很多版本,個人認爲這個表達式還算靠譜:
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
方法三:標籤 BeautifulSoup find find_all
import requests
from urllib.parse import urlparse
from urllib import request, parse
from bs4 import BeautifulSoup
word = '周杰倫'
# BUG fix: this section never binds the bare name ``urllib`` (it only does
# ``from urllib import request, parse``), so ``urllib.parse.quote`` raised
# NameError — use the already-imported ``parse`` module instead.
url = 'http://www.baidu.com.cn/s?wd=' + parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
tagh3 = soup.find_all('a')  # 返回 list of <a> tags
hrefs = []
for h3 in tagh3:
    # Tag.get never raises for a missing attribute — it returns None — so
    # the original bare try/except was dead weight and silently let None
    # entries into the result; keep only real href values.
    href = h3.get('href')
    if href is not None:
        hrefs.append(href)
hrefs 就是通過標籤獲取到的內容,這裏我獲取的是頁面所有 url
方法四:標籤 BeautifulSoup select
import urllib
import requests
from urllib.parse import urlparse
from urllib import request
from bs4 import BeautifulSoup
word = '周杰倫'
# Percent-encode the query term and build the Baidu search URL.
# (``urllib.parse`` resolves here because ``import urllib`` binds the name
# and the ``from urllib.parse import urlparse`` line loads the submodule.)
url = 'http://www.baidu.com/s?wd=' + urllib.parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# tagh3 = soup.select('h3 > a[href]')
# CSS selector: every <a> element carrying an href attribute.
tags = soup.select('a[href]')
# Pull out the href values, then drop duplicates keeping first-seen order.
hrefs = [tag.get('href') for tag in tags]
hrefs = list(dict.fromkeys(hrefs))
hrefs 就是通過標籤 select 方法獲取到的內容,這裏我獲取的是頁面上所有 url;
關於BeautifulSoup find_all 、select 的使用,詳細可見如下兩個網址的描述(轉):