[GuidedLDA] Preprocessing Your Own Dataset

Installation

pip install guidedlda

Usage

I'll skip this part: just follow the official documentation and run it as-is. The real question is how to swap in your own dataset.
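For reference, the official quickstart boils down to roughly the sketch below. This is condensed from my reading of the guidedlda README; the topic count, iteration settings, and seed words are just the example values used there, so treat them as placeholders rather than recommendations.

import guidedlda

# Load the bundled NYT example: X is a document-term count matrix,
# vocab is the token list that maps column indices back to words
X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
word2id = dict((word, idx) for idx, word in enumerate(vocab))

# Seed two topics with a few anchor words each
seed_topic_list = [['game', 'team', 'win'], ['percent', 'company', 'market']]
seed_topics = {}
for topic_id, seed_words in enumerate(seed_topic_list):
    for word in seed_words:
        seed_topics[word2id[word]] = topic_id

model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)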

Dataset preprocessing

First, let's look at what the library's bundled dataset looks like. It consists of two files:

nyt.ldac and nyt.tokens

nyt.tokens stores all the words from all documents (after segmentation), with no duplicates; later I'll explain which tool to use for the deduplication.

nyt.ldac stores, for each document, each word's position in the tokens file together with its count (position:count). Within a document the words are only segmented and stripped of stopwords; they are not deduplicated, so the counts are preserved.
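To make the two formats concrete, here is a made-up miniature example (not the real NYT data). Suppose the tokens file had three lines:

market
company
economy

Then a document whose segmented text is "market market economy economy economy company" would appear in the .ldac file as the single line

3 0:2 1:1 2:3

where the leading 3 is the number of distinct terms in the document, and each index:count pair refers to a (0-based) line position in the tokens file.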

from gensim.corpora import Dictionary
import jieba
import pandas as pd

# Read every column except the id column
news = pd.read_csv('../news.csv', encoding='utf-8').iloc[:, 1:]

# Load the stopword list once, as a set for fast membership tests
STOP_WORDS = set()
with open('../stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f:
        STOP_WORDS.add(line.strip())


# Segment a Chinese sentence with jieba and drop stopwords
def seg_depart(sentence):
    print("Segmenting...")
    sentence_depart = jieba.cut(sentence.strip())
    # Join the remaining words into a space-separated string
    outstr = ''
    for word in sentence_depart:
        if word not in STOP_WORDS and word != '\t':
            outstr += word
            outstr += " "
    return outstr


# Output file for the segmented text
outfilename = "out.txt"
outputs = open(outfilename, 'w', encoding='UTF-8')

documents = []

for row in news.itertuples():
    documents.append(row.content)

words = []
# Segment every document, keep the token lists, and write the results to out.txt
for line in documents:
    line_seg = seg_depart(line)
    print(line_seg)
    words.append(line_seg.split())
    outputs.write(line_seg + '\n')
    print("------------------- segmenting and removing stopwords -----------")
outputs.close()
print("Stopword removal and segmentation finished!")


def print_dict(dic):
    for key in dic:
        print(dic[key])

# words is a list of token lists, one list per document: [[], [], ..., []]
print(len(words))
dic = Dictionary(words)
print(dic.dfs)       # document frequency of every token id
print(len(dic.dfs))
tokens = "news.tokens"

# One token per line; the (0-based) line number is the token's index
write_tokens = open(tokens, 'w', encoding='UTF-8')
for idx in range(len(dic)):
    write_tokens.write(dic[idx])
    write_tokens.write('\n')
write_tokens.close()

This step segments every document, removes the stopwords, and then builds the vocabulary with gensim's Dictionary; that vocabulary is what gets written to news.tokens.
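As a quick illustration of what the Dictionary gives you (a toy example of mine, not part of the original script): token2id holds the word-to-id mapping and dfs holds document frequencies, which is what the print statements above are showing.

from gensim.corpora import Dictionary

toy_docs = [['market', 'growth'], ['market', 'policy', 'market']]
toy_dic = Dictionary(toy_docs)
print(toy_dic.token2id)  # token -> integer id mapping
print(toy_dic.dfs)       # id -> number of documents containing that token
print(toy_dic[0])        # look a token up by its id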

Next, we segment each document and remove its stopwords again, which gives us a two-dimensional list holding the tokens of every document; from that we generate the ldac file.

from gensim.corpora import Dictionary
import jieba
import pandas as pd

# Read every column except the id column
news = pd.read_csv('../news.csv', encoding='utf-8').iloc[:, 1:]

# Load the stopword list once, as a set for fast membership tests
STOP_WORDS = set()
with open('../stopwords.txt', 'r', encoding='utf-8') as f:
    for line in f:
        STOP_WORDS.add(line.strip())


# Segment a Chinese sentence with jieba and drop stopwords
def seg_depart(sentence):
    print("Segmenting...")
    sentence_depart = jieba.cut(sentence.strip())
    # Join the remaining words into a space-separated string
    outstr = ''
    for word in sentence_depart:
        if word not in STOP_WORDS and word != '\t':
            outstr += word
            outstr += " "
    return outstr


# Output file for the segmented text
outfilename = "out.txt"
outputs = open(outfilename, 'w', encoding='UTF-8')

documents = []

for row in news.itertuples():
    documents.append(row.content)

all_words = []
# Segment every document, keep the token lists, and write the results to out.txt
for line in documents:
    line_seg = seg_depart(line)
    print(line_seg)
    all_words.append(line_seg.split())
    outputs.write(line_seg + '\n')
    print("------------------- segmenting and removing stopwords -----------")
outputs.close()
print("Stopword removal and segmentation finished!")


def print_dict(dic):
    for key in dic:
        print(dic[key])

# all_words is a list of token lists, one list per document: [[], [], ..., []]
print(len(all_words))
dic = Dictionary(all_words)
print(dic.dfs)       # document frequency of every token id
print(len(dic.dfs))
ldac = "news.ldac"

# Vocabulary: read news.tokens back in; the line position is the token index
vocab = open('news.tokens', 'r', encoding='UTF-8')
vocabs = []
for i in vocab.readlines():
    vocabs.append(i.strip())
vocab.close()

# Map each token to its line position in news.tokens for fast lookups
vocab_index = {word: idx for idx, word in enumerate(vocabs)}

write_ldac = open(ldac, 'w', encoding='UTF-8')

for words in all_words:
    # words is the token list of the current document
    idac_count_dic = {}
    for word in words:
        if word not in vocab_index:
            # skip tokens that are not in the vocabulary
            continue
        token_id = vocab_index[word]
        idac_count_dic[token_id] = idac_count_dic.get(token_id, 0) + 1

    # LDA-C format: <number of distinct terms> <token_index>:<count> ...
    write_ldac.write(str(len(idac_count_dic)) + " ")
    for key, value in idac_count_dic.items():
        write_ldac.write(str(key) + ":" + str(value) + " ")
    write_ldac.write('\n')

write_ldac.close()

This step counts how many times each word occurs in every document, looks each word up in the vocabulary (the tokens file) to get its index, and writes the ldac file. Each line starts with the number of distinct terms in that document, followed by the index:count pairs.
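As a sanity check, you can also skip the intermediate files and build the document-term matrix that guidedlda expects straight from the gensim dictionary. This is my own sketch, not part of the original post: it assumes the all_words list and dic from the script above, and reuses the GuidedLDA constructor arguments from the README.

import numpy as np
import guidedlda

# Dense document-term count matrix: one row per document, one column per token id
X = np.zeros((len(all_words), len(dic)), dtype=np.int64)
for doc_idx, doc in enumerate(all_words):
    for token_id, count in dic.doc2bow(doc):
        X[doc_idx, token_id] = count

model = guidedlda.GuidedLDA(n_topics=10, n_iter=100, random_state=7, refresh=20)
model.fit(X)  # seed_topics/seed_confidence can be passed here as in the quickstart above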

That's all for now; I'll add more tomorrow.
