【GuidedLDA】Preprocessing a Custom Dataset

Installation

pip install guidedlda

Usage

I'll skip the basics here; the example in the official documentation runs as-is. The real question is how to swap in your own dataset.
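For completeness, the official example looks roughly like the following (a sketch reproduced from memory of the README, so treat the seed words and parameters as illustrative rather than canonical):

import guidedlda

# Demo NYT data bundled with the package
X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
word2id = dict((v, idx) for idx, v in enumerate(vocab))

# A few seed words per topic nudge the model toward those topics
seed_topic_list = [['game', 'team', 'win', 'player', 'season'],
                   ['percent', 'company', 'market', 'price', 'sell'],
                   ['play', 'film', 'movie', 'theater', 'production'],
                   ['official', 'state', 'government', 'political', 'leader']]
seed_topics = {}
for t_id, seeds in enumerate(seed_topic_list):
    for word in seeds:
        if word in word2id:          # guard in case a seed word is not in the vocabulary
            seed_topics[word2id[word]] = t_id

model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)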

Dataset preprocessing

First, let's look at what the bundled dataset looks like:

nyt.ldac and nyt.tokens

nyt.tokens stores every word that appears in any document (after tokenization), with duplicates removed; later I'll show which tool handles the deduplication.

nyt.ldac stores, for every document, each word's position in the tokens file together with its occurrence count (index:count). Each document is only tokenized and stopword-filtered; duplicates are not removed, they simply become counts.
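To make the format concrete, here is a tiny made-up example (not taken from the real NYT files). Suppose the tokens file contains three lines:

game
team
season

Then a document whose words after tokenization and stopword removal are "game game team season" becomes a single line in the .ldac file:

3 0:2 1:1 2:1

The leading 3 is the number of distinct terms in that document, and each index:count pair refers to a 0-based line number in the tokens file.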

from gensim.corpora import Dictionary
import jieba
import pandas as pd

# Read every column except the id column
news = pd.read_csv('../news.csv', encoding='utf-8').iloc[:, 1:]

# Load the stopword list once (one word per line)
STOP_WORDS = set()
with open('../stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f:
        STOP_WORDS.add(line.strip())


# Tokenize a piece of Chinese text with jieba and drop stopwords
def seg_depart(sentence):
    print("tokenizing...")
    sentence_depart = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_depart:
        if word not in STOP_WORDS and word != '\t':
            outstr += word + " "
    return outstr


# Write the segmented documents to out.txt
outfilename = "out.txt"
outputs = open(outfilename, 'w', encoding='UTF-8')

documents = []
for row in news.itertuples():
    documents.append(row.content)

words = []
for line in documents:
    line_seg = seg_depart(line)
    print(line_seg)
    words.append(line_seg.split())
    outputs.write(line_seg + '\n')
    print("------------------- tokenizing and removing stopwords -----------")
outputs.close()
print("Tokenization and stopword removal finished!")

# words is a list of token lists, one per document: [[], [], ..., []]
print(len(words))
dic = Dictionary(words)
print(dic.dfs)        # token id -> number of documents containing the token
print(len(dic.dfs))
tokens = "news.tokens"

# Write one vocabulary word per line, in token-id order
write_tokens = open(tokens, 'w', encoding='UTF-8')
for tokenid in range(len(dic)):
    write_tokens.write(dic[tokenid])
    write_tokens.write('\n')
write_tokens.close()

This step tokenizes every document, removes stopwords, and builds the vocabulary; gensim's Dictionary handles the deduplication, so news.tokens ends up with one unique word per line.
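As a quick, self-contained illustration of what gensim's Dictionary provides (made-up tokens; the exact ids depend on gensim's internal ordering):

from gensim.corpora import Dictionary

docs = [['经济', '增长', '政策'], ['政策', '改革']]
dic = Dictionary(docs)

print(dic.token2id)   # word -> integer id, one entry per unique word
print(dic.dfs)        # id -> number of documents the word appears in
print(dic[0])         # id -> word lookup (this is what the tokens file is built from)
print(len(dic))       # vocabulary size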

Next, we tokenize and stopword-filter every document again to get a two-dimensional list holding each document's tokens, and from that list we generate the .ldac file, as shown below.

from gensim.corpora import Dictionary
import jieba
import pandas as pd

# Read every column except the id column
news = pd.read_csv('../news.csv', encoding='utf-8').iloc[:, 1:]

# Load the stopword list once (one word per line)
STOP_WORDS = set()
with open('../stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f:
        STOP_WORDS.add(line.strip())


# Tokenize a piece of Chinese text with jieba and drop stopwords
def seg_depart(sentence):
    print("tokenizing...")
    sentence_depart = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_depart:
        if word not in STOP_WORDS and word != '\t':
            outstr += word + " "
    return outstr


# Write the segmented documents to out.txt
outfilename = "out.txt"
outputs = open(outfilename, 'w', encoding='UTF-8')

documents = []
for row in news.itertuples():
    documents.append(row.content)

all_words = []
for line in documents:
    line_seg = seg_depart(line)
    print(line_seg)
    all_words.append(line_seg.split())
    outputs.write(line_seg + '\n')
    print("------------------- tokenizing and removing stopwords -----------")
outputs.close()
print("Tokenization and stopword removal finished!")

# all_words is a list of token lists, one per document: [[], [], ..., []]
print(len(all_words))
dic = Dictionary(all_words)   # diagnostics only; the vocabulary is read back from news.tokens below
print(dic.dfs)
print(len(dic.dfs))
ldac = "news.ldac"

# Vocabulary written by the previous script: one word per line
with open('news.tokens', 'r', encoding='UTF-8') as vocab_file:
    vocabs = [line.strip() for line in vocab_file]
# Map each word to its line number in news.tokens (much faster than list.index)
word2id = {w: idx for idx, w in enumerate(vocabs)}

write_ldac = open(ldac, 'w', encoding='UTF-8')
for words in all_words:
    # words is the token list of the current document
    idac_dic = {}             # word -> count within this document
    for word in words:
        if word in word2id:
            idac_dic[word] = idac_dic.get(word, 0) + 1
        # words that are not in the vocabulary are simply skipped

    # word -> count becomes vocabulary index -> count
    idac_count_dic = {word2id[k]: v for k, v in idac_dic.items()}

    # LDA-C line: <number of distinct terms> <index>:<count> ...
    write_ldac.write(str(len(idac_count_dic)) + " ")
    write_ldac.write(" ".join(str(key) + ":" + str(value)
                              for key, value in idac_count_dic.items()))
    write_ldac.write('\n')
write_ldac.close()

This part counts how often each word occurs in every document, looks each word up in the vocabulary (the tokens file) to get its index, and writes the .ldac file. The number at the start of each line is how many distinct terms that document contains after tokenization.
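Once news.tokens and news.ldac exist, they can be read back into the document-term matrix that GuidedLDA's fit() expects. Below is a minimal sketch that parses the .ldac lines by hand; the seed words are placeholders, not taken from this post's data:

import numpy as np
import guidedlda

# Vocabulary written by the first script: one word per line
with open('news.tokens', 'r', encoding='UTF-8') as f:
    vocab = [line.strip() for line in f if line.strip()]
word2id = {w: i for i, w in enumerate(vocab)}

# Parse news.ldac into a dense document-term matrix
with open('news.ldac', 'r', encoding='UTF-8') as f:
    lines = [line.split() for line in f if line.strip()]

X = np.zeros((len(lines), len(vocab)), dtype=np.int64)
for doc_id, parts in enumerate(lines):
    # parts[0] is the number of distinct terms; the rest are index:count pairs
    for pair in parts[1:]:
        idx, cnt = pair.split(':')
        X[doc_id, int(idx)] = int(cnt)

# Placeholder seed words -- replace with terms from your own corpus
seed_topic_list = [['经济', '增长'], ['体育', '比赛']]
seed_topics = {}
for t_id, seeds in enumerate(seed_topic_list):
    for word in seeds:
        if word in word2id:
            seed_topics[word2id[word]] = t_id

model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)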

That's all for now; I'll add more tomorrow.
