英文文本預處理
單詞原型
# Lookup table that expands lower-case English contractions and a few
# informal spellings into their full-word form.
word_map = {
    # "'ll" -> "will"
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    # "'d" -> "would"
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    # "'m" / "'s" / "'re" -> forms of "to be"
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    # "'ve" -> "have"
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    # Informal and negated forms
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
    # Slash-joined conjunction
    "and/or": "and or",
}
實例
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# WordNet lemmatizer instance; none of the functions visible in this section reference it.
wordnet_lemmatizer = WordNetLemmatizer()
# NLTK's English stop words; handleText drops every token found in this collection.
stop = stopwords.words('english')
# 20 Newsgroups corpus, loaded at import time; the __main__ block reads its 'data' entry.
# NOTE(review): fetch_20newsgroups presumably downloads and caches the dataset on first use — confirm.
newsgroups = fetch_20newsgroups()
# Mapping from lower-case contractions / informal spellings to their
# expanded form; handleWord looks tokens up here.
word_map = {
    # "'ll" -> "will"
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    # "'d" -> "would"
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    # "'m" / "'s" / "'re" -> forms of "to be"
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    # "'ve" -> "have"
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    # Informal and negated forms
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
}
def isValidChar(ch: str) -> bool:
    """Return True if *ch* is a letter or an apostrophe, False otherwise.

    Letters are recognised with ``str.isalpha``, so alphabetic characters
    outside ``a-z``/``A-Z`` are accepted too.
    """
    # Evaluate to a real bool in every case; falling off the end of the
    # function would hand callers None for invalid characters.
    return ch.isalpha() or ch == "'"
def isValidWord(word: str):
    """Tell whether *word* is short enough and made only of valid characters.

    A word is accepted when it has at most 20 characters and every one of
    them passes ``isValidChar``. The empty string is accepted because it
    contains no invalid character.
    """
    too_long = len(word) > 20
    return not too_long and all(isValidChar(letter) for letter in word)
def handleWord(word: str):
    """Normalise a single token.

    A token listed in ``word_map`` is replaced by its expansion, which may
    consist of several space-separated words. Otherwise a trailing ``'s``
    is removed. Any other token is returned unchanged.
    """
    expansion = word_map.get(word)
    if expansion is not None:
        return expansion
    # Possessive or unlisted "'s" form: keep only what precedes the suffix.
    return word[:-2] if word.endswith("'s") else word
def handleText(text: str):
    """Turn raw text into a list of cleaned, lower-cased tokens.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. drop one trailing character from a token when that character is
         not valid (for example a trailing comma or full stop);
      3. discard tokens that still fail ``isValidWord``;
      4. normalise each token with ``handleWord``; a multi-word expansion
         such as "i will" contributes one output token per word;
      5. remove stop words.

    Returns:
        The remaining tokens, in their original order, as a list of str.
    """
    # Build the set once per call so every stop-word test is a hash lookup
    # instead of a scan over the whole `stop` collection.
    stop_words = set(stop)
    tokens = []
    for word in text.lower().split():
        # Only a single trailing character is stripped; tokens carrying more
        # punctuation than that are rejected by isValidWord below.
        if not isValidChar(word[-1]):
            word = word[:-1]
        if not isValidWord(word):
            continue
        # split() flattens multi-word expansions and drops empty results.
        tokens.extend(handleWord(word).split())
    return [token for token in tokens if token not in stop_words]
def handleTextList(texts: list):
    """Clean every text in *texts* with ``handleText``.

    A progress line is rewritten in place on stdout after each block of
    1000 texts is reached, and a newline is printed once the loop ends.

    Returns:
        A list holding one token list per input text, in input order.
    """
    total = len(texts)
    processed = []
    for position, text in enumerate(texts, start=1):
        if position % 1000 == 0:
            # "\r" returns to the start of the line so the message overwrites itself.
            print("\r\t正在進行過濾:{:.2f}% 共{}條".format(position * 100 / total, total), end="", flush=True)
        processed.append(handleText(text))
    print()
    return processed
if __name__ == "__main__":
    # Run the cleaning pipeline over every document in the corpus.
    texts = handleTextList(newsgroups["data"])
    # Uncomment to inspect the tokens of the first document:
    # print(texts[0])