文檔分詞
在分詞的同時需要設置停用詞和自定義詞典
import jieba
jieba.load_userdict('userdict.txt')  # custom dictionary, local file

# Read the stop-word list inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('stop.txt', 'r', encoding='utf-8') as stop_file:  # local file
    words_stop = [line.strip() for line in stop_file]
# Empty / whitespace-only tokens are treated as stop words as well.
words_stop = ['', ' ', '\n', '\r\n', '\t'] + words_stop
# Frozen copy used for O(1) membership tests; `words_stop` itself stays a list.
_words_stop_set = frozenset(words_stop)


def cut_txt(x):
    """Segment ``str(x)`` with jieba and drop every token that is a stop word.

    Args:
        x: value to tokenise; it is converted with ``str`` first.

    Returns:
        list of the remaining tokens, in the order jieba produced them.
    """
    return [token for token in jieba.cut(str(x)) if token not in _words_stop_set]


# NOTE(review): `progress_apply` is not a plain pandas method; presumably
# tqdm's pandas integration was registered elsewhere, and `data_comment`
# is defined outside this snippet -- confirm.
data_comment['words'] = data_comment['content'].progress_apply(cut_txt)
gensim--CBOW,skip-gram模型
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
vector_size = 100
# Word2Vec has many more hyper-parameters that are worth exploring.
# sg ({0, 1}, optional) - training algorithm: 1 for skip-gram; otherwise CBOW.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 names it `vector_size`.
model = Word2Vec(common_texts, size=vector_size, window=5, min_count=1, workers=4, seed=123)
# Inspect one word vector. Go through `model.wv`: indexing the model object
# directly is deprecated in gensim 3.x and no longer supported in 4.0.
model.wv['human']
# Persist the trained model
model.save("word2vec.model")
# Load the model back from disk
model = Word2Vec.load("word2vec.model")
def w2v_to_d2v(model, common_texts, vector_size):
    """Pool word vectors into one sum vector and one mean vector per document.

    Args:
        model: object whose ``wv`` attribute maps a word to its vector and
            supports ``word in model.wv``.
        common_texts: sized sequence of documents, each an iterable of tokens.
        vector_size: dimensionality of the word vectors.

    Returns:
        Tuple ``(w2v_feat, w2v_feat_avg)`` of float arrays shaped
        ``(len(common_texts), vector_size)``. Row ``i`` of the first array is
        the sum of the word vectors of document ``i``; row ``i`` of the second
        is their mean. Words without a vector are skipped, and a document with
        no usable word keeps an all-zero row in both arrays.
    """
    w2v_feat = np.zeros((len(common_texts), vector_size))
    w2v_feat_avg = np.zeros((len(common_texts), vector_size))
    word_vectors = model.wv
    for i, line in enumerate(common_texts):
        num = 0
        for word in line:
            # Out-of-vocabulary words have no vector to contribute.
            if word not in word_vectors:
                continue
            num += 1
            w2v_feat[i, :] += word_vectors[word]
        # Guard the division: an empty document would otherwise yield NaN.
        if num:
            w2v_feat_avg[i, :] = w2v_feat[i, :] / num
    return w2v_feat, w2v_feat_avg
# Sum-pooled and mean-pooled sentence vectors for the demo corpus.
(w2v_feat, w2v_feat_avg) = w2v_to_d2v(model, common_texts, vector_size)
TfidfVectorizer--TF-IDF權重
from gensim.test.utils import common_texts
from sklearn.feature_extraction.text import TfidfVectorizer
# Each tokenised document is joined back into one space-separated string.
common_texts_list = list(map(' '.join, common_texts))
cv = TfidfVectorizer(
    binary=False,
    decode_error='ignore',
    stop_words='english',
)
# Fit on the list of sentences and get their TF-IDF matrix in one step.
vec = cv.fit_transform(common_texts_list)
生成新的tfidf值
# Weight a new sentence with the already-fitted vectorizer.
vec_sample = cv.transform(['human interface computer'])
# Dense array holding the weights of every sentence
arr = vec.toarray()
# Vocabulary terms. `get_feature_names()` was deprecated in scikit-learn 1.0
# and removed in 1.2, so prefer `get_feature_names_out()` when it exists and
# fall back to the old method on older releases. `word` stays a list.
if hasattr(cv, 'get_feature_names_out'):
    word = list(cv.get_feature_names_out())
else:
    word = cv.get_feature_names()
gensim--TF-IDF
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel
dictionary = Dictionary(common_texts)
# Show the integer id assigned to every token
print(dictionary.token2id)
# Bag-of-words representation of every document
corpus = [dictionary.doc2bow(text) for text in common_texts]
model = TfidfModel(corpus)
model.save("tfidf.model")
# `load` returns the model; bind the result so the reload actually takes
# effect instead of being discarded.
model = TfidfModel.load("tfidf.model")
corpus_tfidf = model[corpus]
# TF-IDF weights of the first sentence
tf_sample = corpus_tfidf[0]
gensim--DBOW_D2V模型
# 主要處理模型維數、增量訓練次數
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
# dm=0 爲 DBOW 模型,dm=1 爲 DM 模型
# Tag every document with its position in the corpus.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
# Build the vocabulary and train as separate steps.
model = Doc2Vec(dm=0, vector_size=10, window=5, min_count=2, workers=4)
model.build_vocab(documents)
# Use `model.epochs`; `model.iter` is a deprecated alias that gensim 4.0 removed.
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
# Save the model
model.save('d2v_cbow.model')
# Load the model
model = Doc2Vec.load('d2v_cbow.model')
# Inspect the vector of a single document
model.docvecs[1]
# Stack the document vectors (one row per document, looked up by tag)
X_d2v = np.array([model.docvecs[i] for i in range(len(common_texts))])
gensim--DM_D2V模型
# 主要處理模型維數、增量訓練次數
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
# dm=0 爲 DBOW 模型,dm=1 爲 DM 模型
# Wrap each token list in a TaggedDocument tagged with its corpus position.
documents = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(common_texts)
]
# Create the model first, then build the vocabulary and train explicitly.
model = Doc2Vec(
    dm=1,
    vector_size=10,
    window=5,
    min_count=2,
    workers=4,
)
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
# Write the model to disk
model.save('d2v_dw.model')
# Read it back
model = Doc2Vec.load('d2v_dw.model')
# Vector of a single document
model.docvecs[0]
# One row per document, looked up by integer tag
X_d2v = np.array([model.docvecs[tag] for tag in range(len(common_texts))])
gensim--LDA模型
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
# Map tokens to ids, then turn every text into its bag-of-words form.
common_dictionary = Dictionary(common_texts)
common_corpus = list(map(common_dictionary.doc2bow, common_texts))
# Fit a 10-topic LDA model on that corpus.
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)
# Topic distribution of every document
doc_topic = list(lda[common_corpus])
doc_topic[1]
# Print the description of each topic
for i in lda.print_topics(num_topics=10):
    print(i)
# Reuse the TF-IDF weights produced earlier as the LDA input.
from gensim.models import LdaModel
# Variant 1: train on the TF-IDF corpus and transform the same corpus.
lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)
doc_topic = list(lda[corpus_tfidf])
# Variant 2: train on the bag-of-words corpus, transform the TF-IDF corpus.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
doc_topic = list(lda[corpus_tfidf])
def doc_topic_df(doc_topic, num_topics=None):
    """Convert per-document ``(topic_id, weight)`` lists into a DataFrame.

    Columns are aligned by topic id, so column ``k`` always holds the weight
    of topic ``k``. This matters because the input may be sparse (gensim can
    omit low-probability topics from a document); appending weights
    positionally would shift values into the wrong columns.

    Args:
        doc_topic: iterable of documents, each an iterable of
            ``(topic_id, weight)`` pairs.
        num_topics: optional total number of topics. When given, the columns
            are ``0 .. num_topics - 1``; otherwise they are the sorted topic
            ids that actually occur in ``doc_topic``.

    Returns:
        pandas.DataFrame with one row per document and one column per topic
        id; topics missing from a document are filled with ``0.0``.
    """
    # Imported locally: this function is defined before the file's
    # `import pandas as pd` statement.
    import pandas as pd

    weights_per_doc = [dict(doc) for doc in doc_topic]
    if num_topics is None:
        topic_ids = sorted({tid for weights in weights_per_doc for tid in weights})
    else:
        topic_ids = list(range(num_topics))
    df_list = [
        [weights.get(tid, 0.0) for tid in topic_ids]
        for weights in weights_per_doc
    ]
    df_top = pd.DataFrame(df_list, columns=topic_ids)
    return df_top
# Turn the per-document topic weights into a data frame.
df_topic = doc_topic_df(doc_topic=doc_topic)
gensim-- LSI模型
import pandas as pd
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel
# Token-to-id mapping and bag-of-words corpus for the sample texts.
common_dictionary = Dictionary(common_texts)
common_corpus = list(map(common_dictionary.doc2bow, common_texts))
# The TF-IDF weighted corpus can be used as the training corpus instead.
model = LsiModel(common_corpus, id2word=common_dictionary)
#model = LsiModel(corpus_tfidf, id2word=common_dictionary)
# Vectorize the input corpus, which is in BoW format.
vectorized_corpus = model[common_corpus]
# LSI vector of the first sentence
vectorized_corpus[0]
# Materialise the LSI vector of every document.
doc_lsi = list(vectorized_corpus)
def doc_lsic_df(doc_lsi, num_topics=None):
    """Convert per-document ``(topic_id, value)`` lists into a DataFrame.

    Columns are aligned by topic id, so column ``k`` always holds the value
    of LSI dimension ``k`` even when a document's list is missing some ids
    (for example an empty document). Appending values positionally would
    shift them into the wrong columns.

    Args:
        doc_lsi: iterable of documents, each an iterable of
            ``(topic_id, value)`` pairs.
        num_topics: optional total number of dimensions. When given, the
            columns are ``0 .. num_topics - 1``; otherwise they are the
            sorted ids that actually occur in ``doc_lsi``.

    Returns:
        pandas.DataFrame with one row per document and one column per topic
        id; ids missing from a document are filled with ``0.0``.
    """
    values_per_doc = [dict(doc) for doc in doc_lsi]
    if num_topics is None:
        topic_ids = sorted({tid for values in values_per_doc for tid in values})
    else:
        topic_ids = list(range(num_topics))
    df_list = [
        [values.get(tid, 0.0) for tid in topic_ids]
        for values in values_per_doc
    ]
    df_lsi = pd.DataFrame(df_list, columns=topic_ids)
    return df_lsi
# Build the data frame of LSI vectors.
df_doc_lsi = doc_lsic_df(doc_lsi=doc_lsi)
特徵統計
這裏還可以做的事情有:
- 文本的詞的數目
- 文本中的一些關鍵詞做特徵,例如:酒駕、毒品等
- 文本中出現的日期
- 文本中出現的地點