網頁爬蟲1--正則表達式

這裏寫圖片描述

教程來源: 莫煩Python

學習爬蟲前先了解一下正則表達式吧~

導入模塊

import re #正則表達式模塊

簡單匹配

# Plain substring matching with the `in` operator; no regular expression is involved.
pattern1="cat"
pattern2="bird"
string="dog runs to cat"
# "cat" occurs in the string, "bird" does not.
print(pattern1 in string)
print(pattern2 in string)
True
False

用正則尋找配對

# The same test done with re.search, which returns a match object on success and None otherwise.
pattern1="cat"
pattern2="bird"
string="dog runs to cat"
print(re.search(pattern1,string)) # prints the match object itself
print(re.search(pattern1,string).group()) # group() returns the matched substring
print(re.search(pattern1,string).span()) # span() returns the (start, end) indices of the match in the original string
# "bird" is not in the string, so search returns None.
print(re.search(pattern2,string))
<_sre.SRE_Match object at 0x7fde38270b28>
cat
(12, 15)
None

匹配多種可能使用[]

# [au] is a character set: it matches either "a" or "u", so the pattern accepts "ran" or "run".
print(re.search(r'r[au]n',"dog runs to cat").group())
run

匹配更多種可能

# Ranges inside [] widen the set of accepted characters.
# [A-Z]: upper-case letters only, so "run" does not match.
print(re.search(r'r[A-Z]n','dog runs to cat'))
# [a-z]: lower-case letters.
print(re.search(r'r[a-z]n','dog runs to cat'))
# [0-9]: digits (note the subject here contains "r2n").
print(re.search(r'r[0-9]n','dog r2ns to cat'))
# [0-9a-z]: digits or lower-case letters.
print(re.search(r'r[0-9a-z]n','dog runs to cat'))
None
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>

特殊種類匹配

數字

# \d : any decimal digit
print(re.search(r'r\dn','run r4n').group())
# \D : any character that is not a decimal digit
print(re.search(r'r\Dn','run r4n').group())
r4n
run

空白

# \s : any whitespace character [ \t\n\r\f\v]
# The subject is a non-raw string, so its "\n" is a real newline that \s can match.
print(re.search(r'r\sn','r\nn r4n').group())
# \S : opposite of \s, any non-whitespace character
print(re.search(r'r\Sn','r\nn r4n').group())
r
n
r4n

所有字母數字和"_"

# \w : any word character [a-zA-Z0-9_] - upper/lower-case letters, digits and the underscore
print(re.search(r'r\wn','r\nn r4n').group())
# \W : opposite of \w; here it matches the real newline inside 'r\nn'
print(re.search(r'r\Wn','r\nn r4n').group())
r4n
r
n

單詞邊界(空字符串)

# \b : matches the empty string, but only at the start or end of a word (a word boundary)
print(re.search(r'\bruns\b','dog runs to cat').group())
# \B : matches the empty string, but NOT at the start or end of a word;
# the subject uses double spaces so that each \B sits between two spaces.
print(re.search(r'\B runs \B','dog  runs  to cat').group())
runs
 runs 

特殊字符 任意字符

# \\ : matches one literal backslash.
# The subject is written as a raw string so its backslash is an ordinary
# character; the non-raw form 'runs\ to me' relies on the invalid escape
# sequence "\ ", which newer Python versions warn about.
print(re.search(r'runs\\',r'runs\ to me').group())
# . : matches any single character except a newline (\n).
print(re.search(r'r.n','r[ns to me]').group())
runs\
r[n

句首句尾

# ^ : matches at the beginning of the string (at the beginning of every line only with re.M)
print(re.search(r'^dog','dog runs to cat').group())
# $ : matches at the end of the string (at the end of every line only with re.M)
print(re.search(r'cat$','dog runs to cat').group())
dog
cat

是否

# ? : the item before it may or may not occur; here that item is the whole group "(day)"
print(re.search(r'Mon(day)?','Monday').group())
print(re.search(r'Mon(day)?','Mon').group())
# "Mond" lacks the full "day", so only "Mon" is matched.
print(re.search(r'Mon(day)?','Mond').group())
Monday
Mon
Mon

多行匹配

# Multi-line subject: without flags, ^ only matches at the very start of the string,
# so the "I" that begins the second text line is not found.
string="""
dog runs to cat.
I run to dog.
"""
print(re.search(r'^I',string))
print(re.search(r'^I',string,flags=re.M).group()) # flags=re.M makes ^ match at the start of every line
print(re.search(r'^I',string,flags=re.MULTILINE).group()) # re.MULTILINE is the long name for re.M
None
I
I

0或多次

# * : the preceding item occurs 0 or more times, so "a" alone already matches
print(re.search(r'ab*','a').group())
print(re.search(r'ab*','abbb').group())
a
abbb

1或多次

# + : the preceding item occurs 1 or more times
# "a" has no "b" after it, so search returns None.
print(re.search(r'ab+','a'))
print(re.search(r'ab+','abbb').group())
None
abbb

可選次數

# {n,m} : the preceding item occurs between n and m times (here 2 to 10 "b"s)
# "a" has fewer than two "b"s after it, so search returns None.
print(re.search(r'ab{2,10}','a'))
print(re.search(r'ab{2,10}','abbbb').group())
None
abbbb

group組

# Parentheses create numbered capture groups.
match=re.search(r'(\d+), Data: (.+)','ID: 20180317, Data: Mar/17/2018')
# group() with no argument returns the whole match.
print(match.group())
# group(1) / group(2) return the text captured by the first / second parentheses.
print(match.group(1))
print(match.group(2))
20180317, Data: Mar/17/2018
20180317
Mar/17/2018
# (?P<name>...) gives a capture group a name, which group() then accepts instead of a number.
match=re.search(r'(?P<id>\d+), Data: (?P<date>.+)','ID: 20180317, Data: Mar/17/2018')
print(match.group('id'))
print(match.group('date'))
20180317
Mar/17/2018

尋找所有匹配

# re.findall returns a list of every match in the string instead of only the first one.
print(re.findall(r'r[ua]n','run ran ren'))
['run', 'ran']
# | : alternation - matches either the expression on its left or the one on its right
print(re.findall(r'run|ran','run ran ren'))
['run', 'ran']

替換

# re.sub(pattern, replacement, string) returns the string with the matches of pattern replaced.
print(re.sub(r'r[au]ns','catches','dog runs to cat'))
print(re.sub(r'I','You','I like apple'))
dog catches to cat
You like apple

分裂

# re.split splits the string wherever the pattern matches; here on a comma, semicolon or literal dot.
print(re.split(r'[,;\.]','a;b,c.d;e.f'))
['a', 'b', 'c', 'd', 'e', 'f']

compile

# re.compile builds a reusable pattern object whose search() method takes only the subject string.
compiled_re=re.compile(r'r[au]n')
print(compiled_re.search('dog ran to cat').group())
ran
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章