Part 1: Word-Level Text Preprocessing
Word-level preprocessing steps:
1. Tokenize the text
2. Count word frequencies
3. Remove low-frequency words and stopwords
4. Select an appropriate number of high-frequency words as the vocabulary
5. Filter the tokenized text with the vocabulary and map words to indices
6. Inspect the distribution of sequence lengths to choose a padding length
7. Pad the sequences
Full code:
from collections import Counter
import jieba_fast as jieba
import joblib
import pandas as pd
import os
from seaborn import distplot
from matplotlib import pyplot
# Directory where the results are saved
root = r"E:\result"
# Load the texts and labels
# Example data
# return:
# texts = ["體育畫報:李娜奪得法網冠軍", "世界衛生組織宣佈:人類戰勝了新冠狀病毒"]
# labels = ["體育", "國際"]
def loadData():
data = pd.read_csv(r"E:\cnews.train.txt", header=None, index_col=None, delimiter="\t").values
labels = [l for l, t in data]
texts = [t for l, t in data]
return texts, labels
# Load the stopwords
# Define your own stopword list, and be sure to convert it to a set; membership tests on a list are painfully slow
def loadStopwords():
return set(joblib.load(r"E:\stopwords.pkl"))
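# (Aside: a minimal sketch of how the stopwords.pkl above could be built from
# a plain-text list with one stopword per line; the E:\stopwords.txt source
# path is hypothetical.)
# with open(r"E:\stopwords.txt", encoding="utf-8") as f:
#     stopwords = {line.strip() for line in f if line.strip()}
# joblib.dump(stopwords, r"E:\stopwords.pkl")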
# Two-way mapping between labels and indices
def getLabelMap(labels: list):
label_set = list(set(labels))
index2label = {i: l for i, l in enumerate(label_set)}
label2index = {l: i for i, l in enumerate(label_set)}
return label2index, index2label
# Two-way mapping between words and indices
def getWordMap(word_set):
word2index = {w: i + 1 for i, w in enumerate(word_set)}
index2word = {i + 1: w for i, w in enumerate(word_set)}
return word2index, index2word
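# Note that indices start at 1: 0 is deliberately left free as the padding
# value used by padding() further down. A quick check with a made-up
# three-word vocabulary:
# getWordMap(["蘋果", "香蕉", "橘子"])[0]  -> {'蘋果': 1, '香蕉': 2, '橘子': 3}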
# Tokenizer
# jieba_fast is used here; plain jieba or any other tokenizer works just as well
# input: "我是一個中國人"
# return: ["我", "是", "一個", "中國人"]
def splitText(text: str):
return jieba.lcut(text)
# Tokenize a list of texts
def splitTextList(texts):
res = []
for i, text in enumerate(texts):
if (i + 1) % 1000 == 0:
print("func[splitTextList]: {}/{}".format(i + 1, len(texts)))
res.append(splitText(text))
return res
# Check whether a word consists entirely of Chinese characters
def isChinese(word):
for ch in word:
if not '\u4e00' <= ch <= '\u9fff':
return False
return True
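# A few quick examples of the check:
# isChinese("中國")  -> True
# isChinese("中國a") -> False (contains a non-Chinese character)
# isChinese("nba")   -> False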
# Filter out stopwords, non-Chinese words, and low-frequency words
# The 50,000 highest-frequency words that survive filtering form the vocabulary
def filterWordSet(word2count, stopwords: set):
    wc_list = sorted(list(word2count.items()), key=lambda x: x[1], reverse=True)
    if len(wc_list) > 60000: wc_list = wc_list[:60000]
    wc_list = [[w, c] for w, c in wc_list if w not in stopwords and isChinese(w)]
    if len(wc_list) > 50000: wc_list = wc_list[:50000]
    return {w for w, c in wc_list}
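# For example, with a tiny frequency table and "的" as the only stopword:
# filterWordSet(Counter({"的": 20, "中國": 9, "我們": 7, "nba": 5}), {"的"})
# -> {'中國', '我們'}  ("的" is a stopword, "nba" is not Chinese)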
# Plot the distribution of sequence lengths
# (distplot is deprecated in newer seaborn releases; histplot is its replacement)
def showLenDist(seqs, max_len):
    lens = [len(e) for e in seqs]
lens = sorted([e for e in lens if e < max_len], reverse=True)
distplot(lens)
pyplot.show()
# Truncate or zero-pad a sequence to exactly pad_len
def padding(Input, pad_len):
if len(Input) > pad_len: return Input[:pad_len]
return Input + [0] * (pad_len - len(Input))
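# padding() both truncates and pads, e.g. with pad_len = 5:
# padding([5, 3, 8], 5)          -> [5, 3, 8, 0, 0]  (padded with the reserved 0)
# padding([5, 3, 8, 2, 9, 4], 5) -> [5, 3, 8, 2, 9]  (truncated)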
if __name__ == '__main__':
texts, labels = loadData()
stopwords = loadStopwords()
    # Tokenize the texts
    words_list = splitTextList(texts)
    # Count word frequencies
    word2count = Counter([w for words in words_list for w in words])
    # Build the vocabulary
    word_set = filterWordSet(word2count, stopwords)
    # Two-way mappings for the labels and for the vocabulary
    label2index, index2label = getLabelMap(labels)
    word2index, index2word = getWordMap(word_set)  # check the vocabulary size here
##################################################################
    # max_len: the padding length
    # inspect the length distribution first to choose a suitable max_len
inputs = [[word2index[w] for w in words if w in word_set] for words in words_list]
max_len = 480
showLenDist(inputs, max_len)
#################################################
    # Build the model inputs and outputs
inputs = [padding(Input, max_len) for Input in inputs]
outputs = [label2index[l] for l in labels]
#################################################
    # Save the artifacts we need
joblib.dump(label2index, os.path.join(root, "label2index.pkl"))
joblib.dump(index2label, os.path.join(root, "index2label.pkl"))
joblib.dump(word2index, os.path.join(root, "word2index.pkl"))
joblib.dump(index2word, os.path.join(root, "index2word.pkl"))
joblib.dump(word_set, os.path.join(root, "word_set.pkl"))
joblib.dump(word2count, os.path.join(root, "word2count.pkl"))
joblib.dump(outputs, os.path.join(root, "train.outputs.pkl"))
pd.DataFrame(inputs).to_csv(os.path.join(root, "train.inputs.csv"), header=None, index=None)
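For what it's worth, here is a minimal sketch of how a downstream training script might reload these artifacts (assuming the same root directory and the file names written above):

import os
import joblib
import pandas as pd

root = r"E:\result"
word2index = joblib.load(os.path.join(root, "word2index.pkl"))
label2index = joblib.load(os.path.join(root, "label2index.pkl"))
inputs = pd.read_csv(os.path.join(root, "train.inputs.csv"), header=None, index_col=None).values
outputs = joblib.load(os.path.join(root, "train.outputs.pkl"))
# inputs has shape (num_samples, max_len); an embedding layer needs a
# vocabulary size of len(word2index) + 1, because index 0 is the padding value
print(inputs.shape, len(word2index) + 1, len(label2index))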
Part 2: Character-Level Text Preprocessing
Vocabulary definition:
All Chinese characters, common punctuation marks, English letters, and digits: that is, a-z || 0-9 || Chinese characters || punctuation || the space character.
Watch out for the full-width/half-width issue, though: full-width characters are normally converted to half-width.
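The script below handles the conversion with a hand-built mapping table. For reference, a generic full-width to half-width conversion can also be written with a fixed Unicode offset, since the full-width forms U+FF01 through U+FF5E sit exactly 0xFEE0 above their ASCII counterparts; the ideographic space U+3000 is the one special case. A minimal sketch:

def toHalfWidth(text: str) -> str:
    res = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:               # ideographic space becomes a plain space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width form: subtract the fixed offset
            code -= 0xFEE0
        res.append(chr(code))
    return "".join(res)

print(toHalfWidth("Abc123!"))  # Abc123!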
Character-level preprocessing steps:
1. Convert full-width characters to half-width
2. Convert uppercase letters to lowercase
3. Filter the text with the vocabulary
4. Collapse each run of consecutive spaces into a single space
Full code:
# Reuse loadData from the word-level preprocessing script in Part 1
from preprocessor.word_based import loadData
from collections import Counter
from seaborn import distplot
from matplotlib import pyplot
import joblib
import pandas as pd
import os
# Full-width to half-width mapping; the first string holds the full-width
# forms (both strings must be the same length, since zip truncates silently)
full2half = dict(zip("·!¥%…()—《》?:“”,。;‘【】、$1234567890abcdefghijklmnopqrstuvwxyz",
                     "`!$%^()_<>?:\"\",.;'[]\\$1234567890abcdefghijklmnopqrstuvwxyz"))
# Base character set (symbols + digits + letters)
ch_base_set = set("!@#$%^&*()~_+`-=,./;'[]\\<>?:\"{}|1234567890abcdefghijklmnopqrstuvwxyz")
root = r"E:\Projects\PycharmProjects\test\val"
# Check whether a character is Chinese
def isChinese(ch: str):
    return '\u4e00' <= ch <= '\u9fff'
# Check whether a character is in the vocabulary
def isValid(ch: str):
    return ch in ch_base_set or isChinese(ch)
# The whole filter: full-width to half-width + lowercase + vocabulary filter + collapse consecutive spaces
def filterText(text: str):
    res, pre = "", ""
    for ch in text:
        ch = ch.lower()
        if ch in full2half: ch = full2half[ch]
        if not (isChinese(ch) or ch in ch_base_set): ch = " "  # out-of-vocabulary characters become spaces
        temp = ch
        if ch == " " and pre == " ": ch = ""  # drop a space that directly follows another space
        pre = temp
        res += ch
    return res.strip()
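# A quick example of the whole filter (full-width punctuation mapped,
# letters lowercased, consecutive spaces collapsed):
# filterText("【體育】NBA總決賽!!  比分:98:95")
# -> "[體育]nba總決賽!! 比分:98:95"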
def filterTextList(texts):
res = []
for i, text in enumerate(texts):
if (i + 1) % 1000 == 0:
print("func[filterTextList]: {}/{}".format(i + 1, len(texts)))
res.append(filterText(text))
return res
# Keep the 5,000 most frequent valid characters as the vocabulary
def filterCharSet(ch2count):
    wc_list = sorted(list(ch2count.items()), key=lambda x: x[1], reverse=True)
    wc_list = [[w, c] for w, c in wc_list if isValid(w)]
    if len(wc_list) > 5000: wc_list = wc_list[:5000]
    return {w for w, c in wc_list}
# labels: one label per sample (single-label classification)
def getLabelMap(labels: list):
label_set = list(set(labels))
index2label = {i: l for i, l in enumerate(label_set)}
label2index = {l: i for i, l in enumerate(label_set)}
return label2index, index2label
# Two-way mapping between characters and indices; 0 is reserved for padding
def getCharMap(ch_set):
ch2index = {w: i + 1 for i, w in enumerate(ch_set)}
index2ch = {i + 1: w for i, w in enumerate(ch_set)}
return ch2index, index2ch
def showLenDist(seqs, max_len):
lens = [len(e) for e in seqs]
lens = sorted([e for e in lens if e < max_len], reverse=True)
distplot(lens)
pyplot.show()
def padding(Input, pad_len):
if len(Input) > pad_len: return Input[:pad_len]
return Input + [0] * (pad_len - len(Input))
if __name__ == '__main__':
texts, labels = loadData()
    # Filter the texts
texts = filterTextList(texts)
char2count = Counter("".join(texts))
char_set = filterCharSet(char2count)
label2index, index2label = getLabelMap(labels)
char2index, index2char = getCharMap(char_set)
############################
inputs = [[char2index[char] for char in text if char in char_set] for text in texts]
max_len = 1800
showLenDist(inputs, max_len)
###########################
inputs = [padding(Input, max_len) for Input in inputs]
outputs = [label2index[l] for l in labels]
    #################################################
joblib.dump(label2index, os.path.join(root, "label2index.pkl"))
joblib.dump(index2label, os.path.join(root, "index2label.pkl"))
joblib.dump(char2index, os.path.join(root, "char2index.pkl"))
joblib.dump(index2char, os.path.join(root, "index2char.pkl"))
joblib.dump(char_set, os.path.join(root, "char_set.pkl"))
joblib.dump(char2count, os.path.join(root, "char2count.pkl"))
joblib.dump(outputs, os.path.join(root, "train.outputs.pkl"))
pd.DataFrame(inputs).to_csv(os.path.join(root, "train.inputs.csv"), header=None, index=None)