本次NLP作业需要每个人在小组选定领域下进行子领域词典制作,我们小组选定的领域为动物。我个人选定的子领域为昆虫,原始语料库来自《昆虫记》这本书。通过爬虫或者复制粘贴可以在本地得到关于《昆虫记》的文本文件。
数据的处理
- 读取文本,将句号替换成换行,跳过空行
- 通过自建筛选字典和清华动物字典,对文本进行处理,保留每行含有动物词汇的行
- 按照7:3的比例,划分训练集和测试集
- 读取训练集,生成昆虫领域词典。(most_common可以指定返回数目,因为有些动物名只出现一次,并且生成词典较小。所以这里并未指定,而是全部返回,然后人工筛选)
- 用jieba对训练集进行分词处理
评估
中文分词系统的评价指标一般有切分准确率(Precision)、召回率(Recall)、分词精度(F1-Measure)等。本实验将F1作为评测指标来评价HMM模型对昆虫领域的分词效果,F1的计算公式如下所示:

$$F_1 = \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$
python程序
import os
import sys
import random
import json
import jieba
import jieba.posseg
from collections import Counter
class ProcessData(object):
    """Corpus preprocessing helpers for building an insect-domain lexicon.

    Loads one or more animal dictionaries, keeps corpus lines that mention
    a dictionary word, splits train/test sets, and runs jieba-based
    segmentation, POS tagging and word-frequency statistics.
    """

    def __init__(self, *dic_path):
        # Known dictionary words, plus the length of the longest one
        # (upper bound for the substring scan in return_data).
        self.dictionary = set()
        self.maximum = 0
        for path in dic_path:
            self.load_dic(path)

    def load_dic(self, dic_path):
        """Load a dictionary file; the word is the first token of each line."""
        with open(dic_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                # Skip blank lines up front: the original indexed
                # `split()[0]` first, which raises IndexError on ''.
                parts = line.strip().split()
                if not parts:
                    continue
                word = parts[0]
                self.dictionary.add(word)
                self.maximum = max(self.maximum, len(word))

    def return_data(self, text):
        """Return True if any dictionary word occurs as a substring of text."""
        for index in range(len(text)):
            # Try the longest candidate first (maximum-matching style);
            # the dead `match` flag from the original is removed.
            longest = min(self.maximum, len(text) - index)
            for size in range(longest, 0, -1):
                if text[index:index + size] in self.dictionary:
                    return True
        return False

    def train_test_split(self, data_path, train_path, test_path):
        """Shuffle data_path lines; write 70% to train_path, 30% to test_path."""
        try:
            with open(data_path, 'r', encoding='utf-8') as fp, \
                    open(train_path, 'w', encoding='utf-8') as train, \
                    open(test_path, 'w', encoding='utf-8') as test:
                lines = fp.readlines()
                threshold = int(len(lines) * 0.7)
                random.shuffle(lines)
                for i, line in enumerate(lines):
                    if i < threshold:
                        train.write(line)
                    else:
                        test.write(line)
        except OSError:
            # Original did `print(sys.stderr, ...)` (printing the stream
            # object) and had an unreachable sys.exit after the raise;
            # report to stderr and re-raise with the traceback intact.
            print("文件读写出现错误", file=sys.stderr)
            raise

    def postag_txt(self, inputFile, outputFile):
        """POS-tag inputFile with jieba; dump {flag: count} JSON to outputFile."""
        with open(inputFile, 'r', encoding='utf-8') as fin, \
                open(outputFile, 'w+', encoding='utf-8', newline='') as fout:
            d = {}
            for eachLine in fin:
                # Skip empty lines.
                if not eachLine.strip():
                    continue
                line = eachLine.strip().replace('。', os.linesep)
                # Count POS flags produced by jieba's tagger.
                for key in jieba.posseg.cut(line):
                    d[key.flag] = d.get(key.flag, 0) + 1
            fout.write(json.dumps(d))
        return True

    def count_words(self, text):
        """Segment text and return 'word count' lines, most frequent first."""
        stop_path = os.path.join(sys.path[0], r'.\data\stop_words.utf8')
        # `with` closes the file (the original leaked the handle) and a set
        # gives O(1) stopword membership tests.
        with open(stop_path, 'r', encoding='utf-8') as fp:
            stopwords = {line.strip() for line in fp}
        counter = Counter()
        for word in jieba.cut(text):
            if word not in stopwords and len(word) > 1 and word != os.linesep:
                counter[word] += 1
        # most_common() without an argument keeps every word: rare domain
        # terms (often frequency 1) are retained for manual screening.
        return "".join(word + " " + str(num) + os.linesep
                       for (word, num) in counter.most_common())

    def seq_word(self, input_path, output_path):
        """Segment input_path with jieba (precise mode) into output_path."""
        with open(input_path, 'r', encoding='utf-8') as fin, \
                open(output_path, 'w', encoding='utf-8') as fout:
            for line in fin:
                fout.write(' '.join(jieba.cut(line, cut_all=False)))
        # Typo fix: the original message said "jeiba".
        print("jieba分词完成")
def Process():
    """End-to-end pipeline: filter corpus, split sets, build lexicon, segment, tag."""
    dict1_path = os.path.join(sys.path[0],
                              r'.\data\THUOCL_animal.txt')  # THU animal dictionary
    dict2_path = os.path.join(sys.path[0],
                              r'.\data\my_animal.txt')  # self-built dictionary
    input_path = os.path.join(sys.path[0], r'.\data\insect_origin.txt')
    output_path = os.path.join(sys.path[0], r'.\data\insect.txt')
    train_path = os.path.join(sys.path[0],
                              r'.\data\train_insect.txt')  # training-set path
    test_path = os.path.join(sys.path[0],
                             r'.\data\test_insect.txt')  # test-set path
    pro = ProcessData(dict1_path, dict2_path)  # load the dictionaries

    # Keep only corpus lines that mention at least one dictionary word.
    try:
        with open(input_path, 'r', encoding='utf-8') as input_text, \
                open(output_path, 'w', encoding='utf-8', newline='') as output:
            for line in input_text:
                if pro.return_data(line.strip()):
                    print("line:", line)
                    output.write(line)
    except OSError:
        # Original did `print(sys.stderr, ...)` (printing the stream object)
        # followed by an unreachable sys.exit; report properly and re-raise.
        print("文件打开错误", file=sys.stderr)
        raise
    print("数据处理完成")

    pro.train_test_split(output_path, train_path, test_path)
    print("训练集和测试集生成")

    # Build the domain lexicon from the training set.
    my_dict = os.path.join(sys.path[0], r'.\data\my_dict.txt')
    with open(train_path, 'r', encoding='utf-8') as fin, \
            open(my_dict, 'w', encoding='utf-8', newline="") as fout:
        fout.write(pro.count_words(fin.read()))

    # Segment train and test sets with jieba. Fix: the original reused the
    # `jieba_train` variable for the test output path.
    jieba_train = os.path.join(sys.path[0], r'.\data\jieba_train.txt')
    pro.seq_word(train_path, jieba_train)
    jieba_test = os.path.join(sys.path[0], r'.\data\jieba_test.txt')
    pro.seq_word(test_path, jieba_test)

    # POS-tag the training set.
    output_tag = os.path.join(sys.path[0], r'.\data\train_tag.txt')
    pro.postag_txt(train_path, output_tag)
# F1 computation
def estimate_F1():
    """Compare HMM segmentation against jieba's output and print P/R/F1.

    Reads the two segmented files line by line in lockstep; jieba's output
    is treated as the reference. NOTE(review): each jieba line is
    deduplicated with set(), so repeated words in a line count once on the
    reference side — confirm this is the intended metric.
    """
    hmm_path = r'.\data\hmm_output.txt'
    jieba_path = r'.\data\jieba_test.txt'
    with open(hmm_path, 'r', encoding='utf-8') as f_hmm, \
            open(jieba_path, 'r', encoding='utf-8') as f_jieba:
        all_words_answer = 0   # total HMM tokens
        all_words_sample = 0   # total distinct jieba tokens per line
        correct = 0            # HMM tokens that appear in the jieba line
        # zip pairs the lines and stops at the shorter file, replacing the
        # original manual readline loop (which read past the end of the
        # jieba file as empty strings).
        for hmm_sentence, jieba_sentence in zip(f_hmm, f_jieba):
            hmm_words = hmm_sentence.split()
            jieba_words = set(jieba_sentence.split())
            correct += sum(1 for w in hmm_words if w in jieba_words)
            all_words_answer += len(hmm_words)
            all_words_sample += len(jieba_words)
    # Guard against empty files / zero matches: the original divided by
    # zero in both cases.
    if not all_words_answer or not all_words_sample or not correct:
        print("词数:", all_words_answer)
        print("Precision:", 0.0, "Recall", 0.0, "F-mesure", 0.0)
        return
    recall = correct / all_words_answer
    precision = correct / all_words_sample
    f_measure = (2 * precision * recall) / (precision + recall)
    print("词数:", all_words_answer)
    print("Precision:", round(precision, 4), "Recall", round(recall, 4),
          "F-mesure", round(f_measure, 4))
# Script entry point: run the full data-processing pipeline.
if __name__ == "__main__":
    Process()
清华动物词典
自建筛选词典
stop_words文件
insect.txt文件
生成词典my_dict