python正則表達式複習2

大小寫不區分匹配, 使用\b

import re
# Demo: the same pattern compiled with and without re.IGNORECASE.
text = 'This is some text -- with punctuation.'
# Match whole words that start with "T": \b is a word boundary, followed by
# a literal T and one or more word characters.
pattern = r'\bT\w+'
with_case = re.compile(pattern)
# Same pattern, but letter case is ignored while matching.
without_case = re.compile(pattern, re.IGNORECASE)
# print(...) with a single pre-formatted argument produces the same output as
# the Python 2 print statement and is also valid on Python 3.
print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)

print('Case-sensitive:')
for match in with_case.findall(text):
    print(' %r' % match)

print('Case-insensitive:')
for match in without_case.findall(text):
    print(' %r' % match)

結果
Text:
 'This is some text -- with punctuation.'
Pattern:
 \bT\w+
Case-sensitive:
 'This'
Case-insensitive:
 'This'
 'text'

多行匹配與單行匹配

# Demo: how re.MULTILINE changes where ^ and $ are allowed to match.
text = 'This is some text -- with punctuation.\nA second line.'
# First alternative: a word at the start (^).  Second alternative: a word plus
# any trailing non-whitespace characters at the end ($).
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
# With re.MULTILINE, ^ and $ also match at the start/end of every line, i.e.
# around each '\n', not only at the ends of the whole string.
multiline = re.compile(pattern, re.MULTILINE)
# print(...) with a single pre-formatted argument produces the same output as
# the Python 2 print statement and is also valid on Python 3.
print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Single Line :')
for match in single_line.findall(text):
    # findall() yields one tuple per match (one item per group), so wrap it
    # in a 1-tuple to format the tuple itself with %r.
    print(' %r' % (match,))
print('Multiline :')
for match in multiline.findall(text):
    print(' %r' % (match,))

結果
Text:
 'This is some text -- with punctuation.\nA second line.'
Pattern:
 (^\w+)|(\w+\S*$)
Single Line :
 ('This', '')
 ('', 'line.')
Multiline :
 ('This', '')
 ('', 'punctuation.')
 ('A', '')
 ('', 'line.')
注:多行考慮時,\n後面的字符串被當作另外一行考慮

dotall,點包括了換行符

# Demo: how re.DOTALL changes what '.' matches.
text = 'This is some text -- with punctuation.\nA second line.'
pattern = r'.+'
no_newlines = re.compile(pattern)
# By default '.' does not match a newline; with re.DOTALL it does, so a
# single match can span several lines.
dotall = re.compile(pattern, re.DOTALL)
# print(...) with a single pre-formatted argument produces the same output as
# the Python 2 print statement and is also valid on Python 3.
print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)

# Label the first group of matches, mirroring the 'Dotall :' label below
# (the published output shows this heading but the code never printed it).
print('No newlines :')
for match in no_newlines.findall(text):
    print(' %r' % match)

print('Dotall :')
for match in dotall.findall(text):
    print(' %r' % match)

結果
Text:
 'This is some text -- with punctuation.\nA second line.'
Pattern:
 .+
No newlines :
 'This is some text -- with punctuation.'
 'A second line.'
Dotall :
 'This is some text -- with punctuation.\nA second line.'

unicode匹配,re.UNICODE

import codecs
import sys
# Demo: \w with and without re.UNICODE.
# NOTE: this snippet is Python 2 code (ur'' literals and print statements).
# Replace standard output with a writer that encodes output as UTF-8.
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
text = u'Français złoty Österreich'
pattern = ur'\w+'
# Compiled without flags: in the output below the non-ASCII letters are not
# matched by \w, so the words are split around them.
ascii_pattern = re.compile(pattern)
# Compiled with re.UNICODE: \w also matches the non-ASCII letters, so whole
# words are returned.
unicode_pattern = re.compile(pattern, re.UNICODE)
print 'Text :', text
print 'Pattern :', pattern
print 'ASCII :', u', '.join(ascii_pattern.findall(text))
print 'Unicode :', u', '.join(unicode_pattern.findall(text))

結果
Text : Français złoty Österreich
Pattern : \w+
ASCII : Fran, ais, z, oty, sterreich
Unicode : Français, złoty, Österreich
注:在沒有加上unicode匹配時,不是ascii碼的字符,無法匹配

正則表達式註釋(re.VERBOSE),並以字典形式輸出

# Match an e-mail address, optionally preceded by a display name with the
# address wrapped in angle brackets (e.g. "First Last <user@example.com>").
# The pattern is a raw string so \w, \s, \d and \. reach the regex engine
# untouched.  Under re.VERBOSE, whitespace and the '#' comments inside the
# pattern are ignored by the engine.
address = re.compile(
    r'''
    ((?P<name>
    ([\w.,]+\s+)*[\w.,]+) # 名字中可能包含點字符
    \s*
    < # 當有名字的時候,郵箱是放在尖括號裏面的
    )? # 郵箱前面的名字可有可無
    (?P<email>
    [\w\d.+-]+ # 郵箱符號的前面是一個名稱
    @
    ([\w\d.]+\.)+ # 域名的前綴
    (com|org|edu) # 限制哪些域名是在考慮範圍的
    )
    >? # 尖括號是根據前面有沒有名字而可有可無的
    ''',
    re.UNICODE | re.VERBOSE)

# Sample inputs.  The addresses had been replaced by a web e-mail-obfuscation
# placeholder, which made every candidate unmatchable; they are restored here
# so the results agree with the output shown for this example.
candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',  # top-level domain not in (com|org|edu)
    u'First Last <first.last@example.com>',
    u'No Brackets first.last@example.com',
    u'First Last',  # no address at all
    u'First Middle Last <first.last@example.com>',
    u'First M. Last <first.last@example.com>',
    u'<first.last@example.com>',
    ]

# print(...) with a single pre-formatted argument produces the same output as
# the Python 2 print statement and is also valid on Python 3.
for candidate in candidates:
    print('Candidate: %s' % candidate)
    # search() rather than match(): the address may start anywhere in the
    # candidate when the optional name part does not apply.
    match = address.search(candidate)
    if match:
        # groupdict() maps each named group to its text, or None when the
        # optional group did not take part in the match.
        print(match.groupdict())
    else:
        print(' No match')

結果
Candidate: first.last@example.com
{'name': None, 'email': u'first.last@example.com'}
Candidate: first.last+category@gmail.com
{'name': None, 'email': u'first.last+category@gmail.com'}
Candidate: valid-address@mail.example.com
{'name': None, 'email': u'valid-address@mail.example.com'}
Candidate: not-valid@example.foo
 No match
Candidate: First Last <first.last@example.com>
{'name': u'First Last', 'email': u'first.last@example.com'}
Candidate: No Brackets first.last@example.com
{'name': None, 'email': u'first.last@example.com'}
Candidate: First Last
 No match
Candidate: First Middle Last <first.last@example.com>
{'name': u'First Middle Last', 'email': u'first.last@example.com'}
Candidate: First M. Last <first.last@example.com>
{'name': u'First M. Last', 'email': u'first.last@example.com'}
Candidate: <first.last@example.com>
{'name': None, 'email': u'first.last@example.com'}

注:名字可有可無,因此會出現name爲none的情況

通過(?i)來標記忽略大小寫

# An inline (?i) flag inside the pattern turns on case-insensitive matching
# without passing re.IGNORECASE to compile().  Inline flag letters:
# IGNORECASE: i, MULTILINE: m, DOTALL: s, UNICODE: u, VERBOSE: x.
# Several flags can be combined in one group, e.g. (?imu).
text = 'This is some text -- with punctuation.'
pattern = r'(?i)\bT\w+'
regex = re.compile(pattern)
# print(...) with a single pre-formatted argument produces the same output as
# the Python 2 print statement and is also valid on Python 3.
print('Text : %s' % text)
print('Pattern : %s' % pattern)
# Wrap the list in a 1-tuple so % formats the list as a single value.
print('Matches : %s' % (regex.findall(text),))

結果
Text : This is some text -- with punctuation.
Pattern : (?i)\bT\w+
Matches : ['This', 'text']

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章