Part 1: Word-Level Text Preprocessing
Word-level preprocessing steps:
1. Tokenize the text
2. Count word frequencies
3. Remove low-frequency words and stopwords
4. Select an appropriate number of high-frequency words as the vocabulary
5. Filter the tokenized text with the vocabulary and map words to indices
6. Inspect the distribution of sequence lengths to choose a padding length
7. Pad the sequences
Full code:
from collections import Counter
import jieba_fast as jieba
import joblib
import pandas as pd
import os
from seaborn import distplot
from matplotlib import pyplot
# Directory where the results are saved
root = r"E:\result"
# Load the texts and labels
# Example data
# return:
# texts = ["體育畫報:李娜奪得法網冠軍", "世界衛生組織宣佈:人類戰勝了新冠狀病毒"]
# labels = ["體育", "國際"]
def loadData():
data = pd.read_csv(r"E:\cnews.train.txt", header=None, index_col=None, delimiter="\t").values
labels = [l for l, t in data]
texts = [t for l, t in data]
return texts, labels
# Load the stopwords
# Define your own stopword list, and be sure to convert it to a set; membership tests on a list are painfully slow
def loadStopwords():
return set(joblib.load(r"E:\stopwords.pkl"))
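# (Aside: a minimal sketch of how the stopwords.pkl above could be built from
# a plain-text list with one stopword per line; the E:\stopwords.txt source
# path is hypothetical.)
# with open(r"E:\stopwords.txt", encoding="utf-8") as f:
#     stopwords = {line.strip() for line in f if line.strip()}
# joblib.dump(stopwords, r"E:\stopwords.pkl")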
# Two-way mapping between labels and indices
def getLabelMap(labels: list):
label_set = list(set(labels))
index2label = {i: l for i, l in enumerate(label_set)}
label2index = {l: i for i, l in enumerate(label_set)}
return label2index, index2label
# Two-way mapping between words and indices
def getWordMap(word_set):
word2index = {w: i + 1 for i, w in enumerate(word_set)}
index2word = {i + 1: w for i, w in enumerate(word_set)}
return word2index, index2word
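# Note that indices start at 1: 0 is deliberately left free as the padding
# value used by padding() further down. A quick check with a made-up
# three-word vocabulary:
# getWordMap(["蘋果", "香蕉", "橘子"])[0]  -> {'蘋果': 1, '香蕉': 2, '橘子': 3}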
# Tokenizer
# jieba_fast is used here; plain jieba or any other tokenizer works just as well
# input: "我是一個中國人"
# return: ["我", "是", "一個", "中國人"]
def splitText(text: str):
return jieba.lcut(text)
# Tokenize a list of texts
def splitTextList(texts):
res = []
for i, text in enumerate(texts):
if (i + 1) % 1000 == 0:
print("func[splitTextList]: {}/{}".format(i + 1, len(texts)))
res.append(splitText(text))
return res
# Check whether a word consists entirely of Chinese characters
def isChinese(word):
for ch in word:
if not '\u4e00' <= ch <= '\u9fff':
return False
return True
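# A few quick examples of the check:
# isChinese("中國")  -> True
# isChinese("中國a") -> False (contains a non-Chinese character)
# isChinese("nba")   -> False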
# Filter out stopwords, non-Chinese words, and low-frequency words
# The 50,000 highest-frequency words that survive filtering form the vocabulary
def filterWordSet(word2count, stopwords: set):
    wc_list = sorted(list(word2count.items()), key=lambda x: x[1], reverse=True)
    if len(wc_list) > 60000: wc_list = wc_list[:60000]
    wc_list = [[w, c] for w, c in wc_list if w not in stopwords and isChinese(w)]
    if len(wc_list) > 50000: wc_list = wc_list[:50000]
    return {w for w, c in wc_list}
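# For example, with a tiny frequency table and "的" as the only stopword:
# filterWordSet(Counter({"的": 20, "中國": 9, "我們": 7, "nba": 5}), {"的"})
# -> {'中國', '我們'}  ("的" is a stopword, "nba" is not Chinese)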
# Plot the distribution of sequence lengths
# (distplot is deprecated in newer seaborn releases; histplot is its replacement)
def showLenDist(seqs, max_len):
    lens = [len(e) for e in seqs]
lens = sorted([e for e in lens if e < max_len], reverse=True)
distplot(lens)
pyplot.show()
# Truncate or zero-pad a sequence to exactly pad_len
def padding(Input, pad_len):
if len(Input) > pad_len: return Input[:pad_len]
return Input + [0] * (pad_len - len(Input))
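# padding() both truncates and pads, e.g. with pad_len = 5:
# padding([5, 3, 8], 5)          -> [5, 3, 8, 0, 0]  (padded with the reserved 0)
# padding([5, 3, 8, 2, 9, 4], 5) -> [5, 3, 8, 2, 9]  (truncated)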
if __name__ == '__main__':
texts, labels = loadData()
stopwords = loadStopwords()
    # Tokenize the texts
    words_list = splitTextList(texts)
    # Count word frequencies
    word2count = Counter([w for words in words_list for w in words])
    # Build the vocabulary
    word_set = filterWordSet(word2count, stopwords)
    # Two-way mappings for the labels and for the vocabulary
    label2index, index2label = getLabelMap(labels)
    word2index, index2word = getWordMap(word_set)  # check the vocabulary size here
##################################################################
    # max_len: the padding length
    # inspect the length distribution first to choose a suitable max_len
inputs = [[word2index[w] for w in words if w in word_set] for words in words_list]
max_len = 480
showLenDist(inputs, max_len)
#################################################
    # Build the model inputs and outputs
inputs = [padding(Input, max_len) for Input in inputs]
outputs = [label2index[l] for l in labels]
#################################################
    # Save the artifacts we need
joblib.dump(label2index, os.path.join(root, "label2index.pkl"))
joblib.dump(index2label, os.path.join(root, "index2label.pkl"))
joblib.dump(word2index, os.path.join(root, "word2index.pkl"))
joblib.dump(index2word, os.path.join(root, "index2word.pkl"))
joblib.dump(word_set, os.path.join(root, "word_set.pkl"))
joblib.dump(word2count, os.path.join(root, "word2count.pkl"))
joblib.dump(outputs, os.path.join(root, "train.outputs.pkl"))
pd.DataFrame(inputs).to_csv(os.path.join(root, "train.inputs.csv"), header=None, index=None)
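For what it's worth, here is a minimal sketch of how a downstream training script might reload these artifacts (assuming the same root directory and the file names written above):

import os
import joblib
import pandas as pd

root = r"E:\result"
word2index = joblib.load(os.path.join(root, "word2index.pkl"))
label2index = joblib.load(os.path.join(root, "label2index.pkl"))
inputs = pd.read_csv(os.path.join(root, "train.inputs.csv"), header=None, index_col=None).values
outputs = joblib.load(os.path.join(root, "train.outputs.pkl"))
# inputs has shape (num_samples, max_len); an embedding layer needs a
# vocabulary size of len(word2index) + 1, because index 0 is the padding value
print(inputs.shape, len(word2index) + 1, len(label2index))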
Part 2: Character-Level Text Preprocessing
Vocabulary definition:
All Chinese characters, common punctuation marks, English letters, and digits: that is, a-z || 0-9 || Chinese characters || punctuation || the space character.
Watch out for the full-width/half-width issue, though: full-width characters are normally converted to half-width.
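The script below handles the conversion with a hand-built mapping table. For reference, a generic full-width to half-width conversion can also be written with a fixed Unicode offset, since the full-width forms U+FF01 through U+FF5E sit exactly 0xFEE0 above their ASCII counterparts; the ideographic space U+3000 is the one special case. A minimal sketch:

def toHalfWidth(text: str) -> str:
    res = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:               # ideographic space becomes a plain space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width form: subtract the fixed offset
            code -= 0xFEE0
        res.append(chr(code))
    return "".join(res)

print(toHalfWidth("Abc123!"))  # Abc123!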
Character-level preprocessing steps:
1. Convert full-width characters to half-width
2. Convert uppercase letters to lowercase
3. Filter the text with the vocabulary
4. Collapse each run of consecutive spaces into a single space
Full code:
# Reuse loadData from the word-level preprocessing script in Part 1
from preprocessor.word_based import loadData
from collections import Counter
from seaborn import distplot
from matplotlib import pyplot
import joblib
import pandas as pd
import os
# Full-width to half-width mapping; the first string holds the full-width
# forms (both strings must be the same length, since zip truncates silently)
full2half = dict(zip("·!¥%…()—《》?:“”,。;‘【】、$1234567890abcdefghijklmnopqrstuvwxyz",
                     "`!$%^()_<>?:\"\",.;'[]\\$1234567890abcdefghijklmnopqrstuvwxyz"))
# Base character set (symbols + digits + letters)
ch_base_set = set("!@#$%^&*()~_+`-=,./;'[]\\<>?:\"{}|1234567890abcdefghijklmnopqrstuvwxyz")
root = r"E:\Projects\PycharmProjects\test\val"
# Check whether a character is Chinese
def isChinese(ch: str):
    return '\u4e00' <= ch <= '\u9fff'
# Check whether a character is in the vocabulary
def isValid(ch: str):
    return ch in ch_base_set or isChinese(ch)
# The whole filter: full-width to half-width + lowercase + vocabulary filter + collapse consecutive spaces
def filterText(text: str):
    res, pre = "", ""
    for ch in text:
        ch = ch.lower()
        if ch in full2half: ch = full2half[ch]
        if not (isChinese(ch) or ch in ch_base_set): ch = " "  # out-of-vocabulary characters become spaces
        temp = ch
        if ch == " " and pre == " ": ch = ""  # drop a space that directly follows another space
        pre = temp
        res += ch
    return res.strip()
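# A quick example of the whole filter (full-width punctuation mapped,
# letters lowercased, consecutive spaces collapsed):
# filterText("【體育】NBA總決賽!!  比分:98:95")
# -> "[體育]nba總決賽!! 比分:98:95"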
def filterTextList(texts):
res = []
for i, text in enumerate(texts):
if (i + 1) % 1000 == 0:
print("func[filterTextList]: {}/{}".format(i + 1, len(texts)))
res.append(filterText(text))
return res
# Keep the 5,000 most frequent valid characters as the vocabulary
def filterCharSet(ch2count):
    wc_list = sorted(list(ch2count.items()), key=lambda x: x[1], reverse=True)
    wc_list = [[w, c] for w, c in wc_list if isValid(w)]
    if len(wc_list) > 5000: wc_list = wc_list[:5000]
    return {w for w, c in wc_list}
# labels: one label per sample (single-label classification)
def getLabelMap(labels: list):
label_set = list(set(labels))
index2label = {i: l for i, l in enumerate(label_set)}
label2index = {l: i for i, l in enumerate(label_set)}
return label2index, index2label
# Two-way mapping between characters and indices; 0 is reserved for padding
def getCharMap(ch_set):
ch2index = {w: i + 1 for i, w in enumerate(ch_set)}
index2ch = {i + 1: w for i, w in enumerate(ch_set)}
return ch2index, index2ch
def showLenDist(seqs, max_len):
lens = [len(e) for e in seqs]
lens = sorted([e for e in lens if e < max_len], reverse=True)
distplot(lens)
pyplot.show()
def padding(Input, pad_len):
if len(Input) > pad_len: return Input[:pad_len]
return Input + [0] * (pad_len - len(Input))
if __name__ == '__main__':
texts, labels = loadData()
    # Filter the texts
texts = filterTextList(texts)
char2count = Counter("".join(texts))
char_set = filterCharSet(char2count)
label2index, index2label = getLabelMap(labels)
char2index, index2char = getCharMap(char_set)
############################
inputs = [[char2index[char] for char in text if char in char_set] for text in texts]
max_len = 1800
showLenDist(inputs, max_len)
###########################
inputs = [padding(Input, max_len) for Input in inputs]
outputs = [label2index[l] for l in labels]
    #################################################
joblib.dump(label2index, os.path.join(root, "label2index.pkl"))
joblib.dump(index2label, os.path.join(root, "index2label.pkl"))
joblib.dump(char2index, os.path.join(root, "char2index.pkl"))
joblib.dump(index2char, os.path.join(root, "index2char.pkl"))
joblib.dump(char_set, os.path.join(root, "char_set.pkl"))
joblib.dump(char2count, os.path.join(root, "char2count.pkl"))
joblib.dump(outputs, os.path.join(root, "train.outputs.pkl"))
pd.DataFrame(inputs).to_csv(os.path.join(root, "train.inputs.csv"), header=None, index=None)