六、學習分佈式爬蟲之正則表達式

正則表達式和re模塊

什麼是正則表達式：
通俗的理解，就是按照一定的規則，從某個字符串中匹配出想要的數據。這個規則就是正則表達式。
正則表達式語法

import re

# 匹配某個字符串
# text = 'abc'
# ret = re.match('a',text)  #從最開始的位置匹配第一個符合規則的字符或字符串
# print(ret)      #<re.Match object; span=(0, 1), match='a'>
# print(ret.group()) #a

# 點(.):匹配任意的字符（除了\n）
# text = 'abc'
# ret = re.match('.',text)
# print(ret.group()) #a

# \d:匹配任意的數字
# text = '5abc'
# ret = re.match('\d',text)
# print(ret.group()) #5

# \D:匹配任意的非數字
# text = '+abc'
# ret = re.match('\D',text)
# print(ret.group()) #+

# \s:匹配的是空白字符（包括\n,\t,\r和空格）
# text = ' abc'
# ret = re.match('\s',text)
# print(ret.group()) #

# \S:非空白字符
# text = 'abc'
# ret = re.match('\S',text)
# print(ret.group()) #a

# \w:匹配的是a-z和A-Z以及數字和下劃線
# text = '6abc'
# ret = re.match('\w',text)
# print(ret.group()) #6

# \W:匹配的是和\w相反的
# text = '@abc'
# ret = re.match('\W',text)
# print(ret.group()) #@

# []組合的方式，只要滿足中括號的某一項都算匹配成功
# text = '1bc'
# ret = re.match('[\d\w]',text) #匹配數字或字母（中括號內的 | 只是普通字符，不表示“或”，所以不需要寫）
# print(ret.group()) #1

#使用組合的方式實現\d:
# text = '1abc'
# ret = re.match('[0-9]',text)
# print(ret.group())

#使用組合的方式實現\D:
# text = 'abc'
# ret = re.match('[^0-9]',text)
# print(ret.group())

# Emulate \w with an explicit character class: digits, ASCII letters and underscore.
text = '_1abc'
ret = re.match(pattern='[0-9a-zA-Z_]', string=text)
print(ret.group(0))

import re

# *：匹配前一個字符0次或無限次
# text = 'abc'
# ret = re.match('\w*',text)
# print(ret.group())  #abc

# +：匹配前一個字符1次或無限次
# text = 'abc'
# ret = re.match('\w+',text)
# print(ret.group())  #abc

# ?：匹配前一個字符0次或1次
# text = 'abc'
# ret = re.match('\w?',text)
# print(ret.group())  #a

# {m}：匹配前一個字符m次
# text = 'abc'
# ret = re.match('\w{2}',text)
# print(ret.group())  #ab

# {m,n}: match the preceding token between m and n times (greedy).
# The pattern is a raw string so that \w reaches the regex engine untouched
# instead of being treated as an (invalid) Python string escape.
text = '1+abc'
ret = re.match(r'\w{1,3}', text)
print(ret.group())  # 1  ('+' is not a word character, so the run stops after '1')

正則表達式小案例

import re
#1.驗證手機號：手機號的規則是以1開頭，第二位可以是34587，後面9位可以隨意
# text = '157*****414'
# res = re.match('1[34587]\d{9}',text)
# print(res.group())

#2.驗證郵箱：郵箱的規則是郵箱名稱使用數字、英文字符、下劃線組成的，然後是@符號，後面就是域名了
# text = '********@163.com'
# res = re.match('\w+@(qq.com|163.com)',text)
# print(res.group())

#3.驗證URL：URL的規則是前面是http、https或者是ftp，加上一個冒號，再加上兩個斜槓，再後面就是可以出現任意非空白字符了
# text = 'https://mpnew.csdn.net/console/article'
# res = re.match('(http|https|ftp)://\S+',text)
# print(res.group())

# 4. Validate an ID-card number: 18 characters in total, the first 17 are
# digits and the last one is a digit or the letter x / X.
# The sample value below is masked with '*', so it does not satisfy the
# pattern and re.match() returns None. Check the result before calling
# .group() to avoid an AttributeError on None.
text = '460033********361*'
res = re.match(r'\d{17}(\d|x|X)', text)
print(res.group() if res is not None else 'no match')

開始、結束、貪婪、非貪婪

import re

# ^:以....開頭
# text = 'hello world'
# res = re.search('^hello',text) #re.search()匹配字符串中符合規則的字符或字符串，不在乎起始位置
# print(res.group())

# $:以....結尾
# text = 'hello world'
# res = re.search('world$',text)
# print(res.group())

# |:匹配多個字符串或表達式
# text = 'hello world'
# res = re.search('hello|world',text)
# print(res.group())

# 貪婪和非貪婪
# text = '12345'
# res = re.search('\d+',text) #貪婪模式
# res1 = re.search('\d+?',text) #非貪婪模式
# print(res.group())

# 案例一：提取html標籤名稱
# text = '<h1>這是標題</h1>'
# res = re.search('<.+?>',text) #<h1> 非貪婪模式匹配
# resq1 = re.search('<.+>',text) #<h1>這是標題</h1> 貪婪模式匹配
# print(res.group())

# Case 2: check whether a string is an integer between 0 and 100.
# Accepted forms: "0", "1".."99" (no leading zero) and "100".
# Each alternative carries its own '$' so that trailing characters are rejected.
text = '100'
res = re.match(r'0$|[1-9]\d?$|100$', text)
print(res.group())

原生字符串和轉義字符串

import re

#python中的轉義字符
# text = 'hello\nword'   #\n換行符
# text1 = r'hello\nword' #\原生字符串
# text2 = 'hello\\nword' #\將後面具有特殊意義的字符轉義成普通字符
# print(text)

#正則表達式中的轉義字符
# text = 'apple price is $99,orange price is $88'
# res = re.findall('\$\d+',text) #返回一個列表
# print(res)

# Raw strings and regular expressions.
# The subject starts with a literal backslash followed by 'c'.
# To match one literal backslash the regex engine needs two backslashes:
#   - in a normal string literal that means writing four ('\\\\c'),
#   - in a raw string literal two are enough (r'\\c').
# Every backslash is written out explicitly so no literal relies on the
# invalid escape sequence '\c'.
text = '\\cba c'
res = re.match('\\\\c', text)
res1 = re.match(r'\\c', text)
print(res.group(), res1.group())

分組

import re
# Groups: each pair of parentheses captures the part of the match it encloses.
# The pattern is a raw string so that \$ and \d are passed to the regex engine
# as written instead of being treated as (invalid) Python string escapes.
text = 'apple price is $99,orange price is $88'
res = re.search(r'.+(\$\d+).+(\$\d+)', text)
print(res.group())   # apple price is $99,orange price is $88
print(res.group(0))  # apple price is $99,orange price is $88
print(res.group(1))  # $99
print(res.group(2))  # $88
print(res.groups())  # ('$99', '$88')

# group() / group(0): the substring matched by the whole pattern
# group(1): the text captured by the first group
# group(2): the text captured by the second group
# groups(): a tuple with the text of every group

正則表達式常用函數

import re
# re.match():   matches only at the very beginning of the string.
# re.search():  finds the first match anywhere in the string.
# re.findall(): collects every non-overlapping match into a list.
text = 'apple price is $99,orange price is $88'
res = re.findall(pattern=r'\$\d+', string=text)
print(res)

# sub: replace every match of the pattern with the replacement text.
# The result is a new string (here each space becomes a newline).
text = 'hello world hello china'
res = re.sub(pattern=r' ', repl='\n', string=text)
print(res)
html = """
<div class="job-detail">
    <p>1. 3年以上相關開發經驗 ，全日制統招本科以上學歷</p>
    <p>2. 精通一門或多門開發語言(Python,C,Java等)，其中至少有一門有3年以上使用經驗</p>
    <p>3. 熟練使用ES/mysql/mongodb/redis等數據庫；</p>
    <p>4. 熟練使用django、tornado等web框架，具備獨立開發 Python/Java 後端開發經驗；</p>
    <p>5. 熟悉 Linux / Unix 操作系統&nbsp;</p>
    <p>6. 熟悉 TCP/IP，http等網絡協議</p>
    <p>福利：</p>
    <p>1、入職購買六險一金（一檔醫療+公司全額購買商業險）+開門紅+全額年終獎（1年13薪，一般會比一個月高）</p>
    <p>2、入職滿一年有2次調薪調級機會</p>
    <p>3、項目穩定、團隊穩定性高，團隊氛圍非常好（匯合員工佔招行總員工比例接近50%）；</p>
    <p>4、有機會轉爲招商銀行內部員工；</p>
    <p>5、團隊每月有自己的活動經費，法定節假日放假安排；</p>
    <p>6、辦公環境優良，加班有加班費（全額工資爲計算基數，加班不超過晚上10點，平日加班爲時薪1.5倍，週末加班爲日薪2倍，週末加班也可優先選擇調休，管理人性化）。</p>
</div>
"""
new_html = re.sub(r'<.+?>','',html)
print(new_html)

# split: cut the string at every match of the pattern and return the pieces.
text = 'hello world hello china'
res = re.split(pattern=r' ', string=text)
print(res)  # ['hello', 'world', 'hello', 'china']

# compile: build a reusable pattern object once, then call its own methods.
# Calling r.search() directly is the idiomatic way to use a compiled pattern;
# there is no need to pass it back through the module-level re.search().
text = 'apple price is 55.46'
r = re.compile(r'\d+\.?\d*')
res = r.search(text)
print(res.group())

# Commenting a regular expression: with the VERBOSE flag (re.X is its short
# alias) whitespace in the pattern is ignored and '#' starts a comment, so the
# pattern can be spread over several annotated lines.
text = 'apple price is 55.46'
res = re.search(
    pattern=r"""
\d+ #整數部分
\.? #小數點
\d* #小數部分
""",
    string=text,
    flags=re.X,
)
print(res.group())

爬取趕集網租房信息

import requests
import re

# HTTP request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.100 Safari/537.36'
    ),
}

def parse_page(page_url):
    """Download one listing page and print the fields captured for each listing.

    Args:
        page_url: URL of the listing page to fetch.

    Prints one tuple per match. Following the pattern's own inline comments,
    the four captured groups are: title, layout, area and price.
    """
    # requests applies no timeout by default, so an unresponsive server would
    # block the script forever; fail after 10 seconds instead.
    response = requests.get(page_url, headers=headers, timeout=10)
    # findall() returns what the capture groups matched: with several groups
    # each result is a tuple holding one value per group; with no group at all
    # it returns the text matched by the whole pattern. Keep this in mind when
    # extracting data.
    listings = re.findall(r"""
        <div.+?ershoufang-list.+?<a.+?js-title.+?>(.+?)</a> #獲取房源標題
        .+?<dd.+?dd-item.+?<span>(.+?)</span> #獲取戶型
        .+?<span.+?<span>(.+?)</span>       #獲取面積
        .+?<div.+?price.+?<span.+?>(.+?)</span> #獲取價格
    """, response.text, re.VERBOSE | re.DOTALL)  # DOTALL lets '.' match newlines as well
    for listing in listings:
        print(listing)

def main():
    """Walk the paginated rental listing URLs and parse each page.

    The loop is set up for pages 1-10, but it breaks unconditionally after the
    first iteration, so only page 1 is actually fetched.
    """
    url_template = 'http://hn.ganji.com/zufang/pn{}/'
    for page_number in range(1, 11):
        parse_page(url_template.format(page_number))
        # Stop after the first page.
        break


if __name__ == '__main__':
    main()
    
#總結
#1.如果想要讓點（.）代表所有字符，那麼需要在函數後面加re.DOTALL來表示，否則不會代表\n
#2.獲取數據的時候，儘量都用非貪婪模式
#3.findall()返回的是括號所匹配到的結果，多個括號就會返回多個括號分別匹配到的結果；如果沒有括號，就返回整條語句所匹配到的結果

六、學習分佈式爬蟲之正則表達式

正則表達式和re模塊

五、分佈式爬蟲學習之BeautSoup4

十三、學習分佈式爬蟲之字體反爬

八、學習分佈式爬蟲之多線程

十四、學習分佈式爬蟲之Scrapy

數據分析學習之基礎知識

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結