本次NLP作业需要每个人在小组选定领域下进行子领域词典制作,我们小组选定的领域为动物。我个人选定的子领域为昆虫,原始语料库来自《昆虫记》这本书。通过爬虫或者复制粘贴可以在本地得到关于《昆虫记》的文本文件。
数据的处理
- 读取文本,将句号替换成换行,跳过空行
- 通过自建筛选字典和清华动物字典,对文本进行处理,保留每行含有动物词汇的行
- 按照7:3的比例,划分训练集和测试集
- 读取训练集,生成昆虫领域词典。(most_common可以指定返回数目,因为有些动物名只出现一次,并且生成词典较小。所以这里并未指定,而是全部返回,然后人工筛选)
- 用jieba对训练集进行分词处理
评估
中文分词系统的评价指标一般有切分准确率(Precision)、召回率(Recall)、分词精度(F1-Measure)等。本实验将F1作为评测指标来评价HMM模型对昆虫领域的分词效果,F1的计算公式如下所示:

$$F_1 = \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$
python程序
import os
import sys
import random
import json
import jieba
import jieba.posseg
from collections import Counter
class ProcessData(object):
    """Corpus preprocessing helpers for building an insect-domain lexicon.

    Loads one or more animal dictionaries, keeps corpus lines that mention
    a dictionary word, splits train/test sets, and runs jieba-based
    segmentation, POS tagging and word-frequency statistics.
    """

    def __init__(self, *dic_path):
        # Known dictionary words, plus the length of the longest one
        # (upper bound for the substring scan in return_data).
        self.dictionary = set()
        self.maximum = 0
        for path in dic_path:
            self.load_dic(path)

    def load_dic(self, dic_path):
        """Load a dictionary file; the word is the first token of each line."""
        with open(dic_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                # Skip blank lines up front: the original indexed
                # `split()[0]` first, which raises IndexError on ''.
                parts = line.strip().split()
                if not parts:
                    continue
                word = parts[0]
                self.dictionary.add(word)
                self.maximum = max(self.maximum, len(word))

    def return_data(self, text):
        """Return True if any dictionary word occurs as a substring of text."""
        for index in range(len(text)):
            # Try the longest candidate first (maximum-matching style);
            # the dead `match` flag from the original is removed.
            longest = min(self.maximum, len(text) - index)
            for size in range(longest, 0, -1):
                if text[index:index + size] in self.dictionary:
                    return True
        return False

    def train_test_split(self, data_path, train_path, test_path):
        """Shuffle data_path lines; write 70% to train_path, 30% to test_path."""
        try:
            with open(data_path, 'r', encoding='utf-8') as fp, \
                    open(train_path, 'w', encoding='utf-8') as train, \
                    open(test_path, 'w', encoding='utf-8') as test:
                lines = fp.readlines()
                threshold = int(len(lines) * 0.7)
                random.shuffle(lines)
                for i, line in enumerate(lines):
                    if i < threshold:
                        train.write(line)
                    else:
                        test.write(line)
        except OSError:
            # Original did `print(sys.stderr, ...)` (printing the stream
            # object) and had an unreachable sys.exit after the raise;
            # report to stderr and re-raise with the traceback intact.
            print("文件读写出现错误", file=sys.stderr)
            raise

    def postag_txt(self, inputFile, outputFile):
        """POS-tag inputFile with jieba; dump {flag: count} JSON to outputFile."""
        with open(inputFile, 'r', encoding='utf-8') as fin, \
                open(outputFile, 'w+', encoding='utf-8', newline='') as fout:
            d = {}
            for eachLine in fin:
                # Skip empty lines.
                if not eachLine.strip():
                    continue
                line = eachLine.strip().replace('。', os.linesep)
                # Count POS flags produced by jieba's tagger.
                for key in jieba.posseg.cut(line):
                    d[key.flag] = d.get(key.flag, 0) + 1
            fout.write(json.dumps(d))
        return True

    def count_words(self, text):
        """Segment text and return 'word count' lines, most frequent first."""
        stop_path = os.path.join(sys.path[0], r'.\data\stop_words.utf8')
        # `with` closes the file (the original leaked the handle) and a set
        # gives O(1) stopword membership tests.
        with open(stop_path, 'r', encoding='utf-8') as fp:
            stopwords = {line.strip() for line in fp}
        counter = Counter()
        for word in jieba.cut(text):
            if word not in stopwords and len(word) > 1 and word != os.linesep:
                counter[word] += 1
        # most_common() without an argument keeps every word: rare domain
        # terms (often frequency 1) are retained for manual screening.
        return "".join(word + " " + str(num) + os.linesep
                       for (word, num) in counter.most_common())

    def seq_word(self, input_path, output_path):
        """Segment input_path with jieba (precise mode) into output_path."""
        with open(input_path, 'r', encoding='utf-8') as fin, \
                open(output_path, 'w', encoding='utf-8') as fout:
            for line in fin:
                fout.write(' '.join(jieba.cut(line, cut_all=False)))
        # Typo fix: the original message said "jeiba".
        print("jieba分词完成")
def Process():
    """End-to-end pipeline: filter corpus, split sets, build lexicon, segment, tag."""
    dict1_path = os.path.join(sys.path[0],
                              r'.\data\THUOCL_animal.txt')  # THU animal dictionary
    dict2_path = os.path.join(sys.path[0],
                              r'.\data\my_animal.txt')  # self-built dictionary
    input_path = os.path.join(sys.path[0], r'.\data\insect_origin.txt')
    output_path = os.path.join(sys.path[0], r'.\data\insect.txt')
    train_path = os.path.join(sys.path[0],
                              r'.\data\train_insect.txt')  # training-set path
    test_path = os.path.join(sys.path[0],
                             r'.\data\test_insect.txt')  # test-set path
    pro = ProcessData(dict1_path, dict2_path)  # load the dictionaries

    # Keep only corpus lines that mention at least one dictionary word.
    try:
        with open(input_path, 'r', encoding='utf-8') as input_text, \
                open(output_path, 'w', encoding='utf-8', newline='') as output:
            for line in input_text:
                if pro.return_data(line.strip()):
                    print("line:", line)
                    output.write(line)
    except OSError:
        # Original did `print(sys.stderr, ...)` (printing the stream object)
        # followed by an unreachable sys.exit; report properly and re-raise.
        print("文件打开错误", file=sys.stderr)
        raise
    print("数据处理完成")

    pro.train_test_split(output_path, train_path, test_path)
    print("训练集和测试集生成")

    # Build the domain lexicon from the training set.
    my_dict = os.path.join(sys.path[0], r'.\data\my_dict.txt')
    with open(train_path, 'r', encoding='utf-8') as fin, \
            open(my_dict, 'w', encoding='utf-8', newline="") as fout:
        fout.write(pro.count_words(fin.read()))

    # Segment train and test sets with jieba. Fix: the original reused the
    # `jieba_train` variable for the test output path.
    jieba_train = os.path.join(sys.path[0], r'.\data\jieba_train.txt')
    pro.seq_word(train_path, jieba_train)
    jieba_test = os.path.join(sys.path[0], r'.\data\jieba_test.txt')
    pro.seq_word(test_path, jieba_test)

    # POS-tag the training set.
    output_tag = os.path.join(sys.path[0], r'.\data\train_tag.txt')
    pro.postag_txt(train_path, output_tag)
# F1 computation
def estimate_F1():
    """Compare HMM segmentation against jieba's output and print P/R/F1.

    Reads the two segmented files line by line in lockstep; jieba's output
    is treated as the reference. NOTE(review): each jieba line is
    deduplicated with set(), so repeated words in a line count once on the
    reference side — confirm this is the intended metric.
    """
    hmm_path = r'.\data\hmm_output.txt'
    jieba_path = r'.\data\jieba_test.txt'
    with open(hmm_path, 'r', encoding='utf-8') as f_hmm, \
            open(jieba_path, 'r', encoding='utf-8') as f_jieba:
        all_words_answer = 0   # total HMM tokens
        all_words_sample = 0   # total distinct jieba tokens per line
        correct = 0            # HMM tokens that appear in the jieba line
        # zip pairs the lines and stops at the shorter file, replacing the
        # original manual readline loop (which read past the end of the
        # jieba file as empty strings).
        for hmm_sentence, jieba_sentence in zip(f_hmm, f_jieba):
            hmm_words = hmm_sentence.split()
            jieba_words = set(jieba_sentence.split())
            correct += sum(1 for w in hmm_words if w in jieba_words)
            all_words_answer += len(hmm_words)
            all_words_sample += len(jieba_words)
    # Guard against empty files / zero matches: the original divided by
    # zero in both cases.
    if not all_words_answer or not all_words_sample or not correct:
        print("词数:", all_words_answer)
        print("Precision:", 0.0, "Recall", 0.0, "F-mesure", 0.0)
        return
    recall = correct / all_words_answer
    precision = correct / all_words_sample
    f_measure = (2 * precision * recall) / (precision + recall)
    print("词数:", all_words_answer)
    print("Precision:", round(precision, 4), "Recall", round(recall, 4),
          "F-mesure", round(f_measure, 4))
# Script entry point: run the full data-processing pipeline.
if __name__ == "__main__":
    Process()
清华动物词典
自建筛选词典
stop_words文件
insect.txt文件
生成词典my_dict