Python 版本-文本分類流程-英文文本預處理

英文文本預處理

單詞原型

# Lookup table for normalising lowercase English tokens: each key is a
# contraction or informal/compound form, and each value is the expanded
# phrase (one or more space-separated words) to substitute for it.
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
    "and/or": "and or",
}

實例

from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Lemmatizer instance; none of the helper functions visible in this file
# reference it.
wordnet_lemmatizer = WordNetLemmatizer()
# English stop words, stored as a set: handleText tests every token for
# membership, and a set makes that lookup constant-time instead of a
# linear scan over the whole stop-word list.
stop = set(stopwords.words('english'))
# Load the 20 newsgroups dataset with the library's default arguments.
# NOTE(review): this runs at import time and may download data on first use.
newsgroups = fetch_20newsgroups()


# Contraction-expansion mapping: each lowercase contraction (or informal
# form) maps to the full phrase that replaces it. Values may contain more
# than one space-separated word.
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
}


# Check whether a character is valid (a letter or the apostrophe ['])
def isValidChar(ch: str) -> bool:
    """Return True if *ch* is alphabetic or an apostrophe, otherwise False.

    Always returns a real ``bool``; an empty string yields ``False``.

    Args:
        ch: The string to test (normally a single character).

    Returns:
        ``True`` when ``ch.isalpha()`` holds or ``ch`` equals ``"'"``,
        ``False`` in every other case.
    """
    return ch.isalpha() or ch == "'"

# Check whether a word is valid (must consist of letters and the apostrophe ['])
def isValidWord(word: str):
    """Return True if *word* is short enough and made only of valid characters.

    A word longer than 20 characters is rejected outright. Otherwise every
    character must be accepted by ``isValidChar``. An empty string has no
    characters to reject and is therefore considered valid.

    Args:
        word: The candidate token.

    Returns:
        ``True`` if the word passes both checks, ``False`` otherwise.
    """
    return len(word) <= 20 and all(isValidChar(letter) for letter in word)

# Normalise a single word
def handleWord(word: str):
    """Expand a known contraction or strip a trailing possessive ``'s``.

    Args:
        word: The token to normalise.

    Returns:
        The ``word_map`` replacement if *word* is a key there; otherwise the
        word without its final ``'s`` if it ends with one; otherwise the word
        unchanged.
    """
    expansion = word_map.get(word)
    if expansion is not None:
        return expansion
    # Possessive form: drop the two-character "'s" suffix.
    return word[:-2] if word.endswith("'s") else word

# Process one text
def handleText(text: str):
    """Tokenise *text* into a list of cleaned, lowercase, non-stop-word tokens.

    The text is lowercased and split on whitespace. For each raw token:
    at most one trailing invalid character is removed, tokens that still
    fail ``isValidWord`` are skipped, and the remainder is normalised with
    ``handleWord``. Because ``handleWord`` may return a multi-word phrase or
    an empty string, its result is split again before being collected.
    Finally, tokens present in ``stop`` are filtered out.

    Args:
        text: The raw document text.

    Returns:
        A list of token strings.
    """
    tokens = []
    for raw in text.lower().split():
        # Only the last character is checked, so a single trailing
        # punctuation mark is dropped; anything else invalid rejects the token.
        candidate = raw if isValidChar(raw[-1]) else raw[:-1]
        if isValidWord(candidate):
            # split() flattens multi-word expansions and discards empty results.
            tokens.extend(handleWord(candidate).split())
    return [token for token in tokens if token not in stop]

# Process a collection of texts
def handleTextList(texts: list):
    """Run ``handleText`` over every text, printing progress as it goes.

    After every 1000th text, a progress line (percentage done and total
    count) is written to stdout, overwriting the previous one via a leading
    carriage return. A newline is printed once the loop finishes.

    Args:
        texts: The texts to process.

    Returns:
        A list with one ``handleText`` result per input text, in input order.
    """
    processed = []
    for count, text in enumerate(texts, start=1):
        if count % 1000 == 0:
            total = len(texts)
            percent = count * 100 / total
            print("\r\t正在進行過濾:{:.2f}% 共{}條".format(percent, total), end="", flush=True)
        processed.append(handleText(text))
    print()
    return processed


if __name__ == '__main__':
    # Preprocess every document under the 'data' key of the loaded dataset.
    texts = handleTextList(newsgroups['data'])
    # Uncomment the print below to inspect the first processed document
    # print(texts[0])

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章