NLP Text Data Processing: A Comprehensive Summary -- Worth Keeping

Document segmentation

While segmenting, you also need to set up a stop-word list and a custom user dictionary.

import jieba
from tqdm import tqdm

tqdm.pandas()  # enables .progress_apply on pandas objects

jieba.load_userdict('userdict.txt')  # local file: custom user dictionary
# local file: one stop word per line
words_stop = [line.strip() for line in open('stop.txt', 'r', encoding='utf-8').readlines()]
words_stop = ['', ' ', '\n', '\r\n', '\t'] + words_stop
cut_txt = lambda x: [i for i in jieba.cut(str(x)) if i not in words_stop]
data_comment['words'] = data_comment['content'].progress_apply(cut_txt)
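For reference, a minimal sketch of the two local files the snippet assumes (the file names come from the code above; the contents are placeholders). jieba's user dictionary takes one entry per line as "word [freq] [pos]":

# placeholder contents for the two local files assumed above
with open('userdict.txt', 'w', encoding='utf-8') as f:
    f.write('酒駕 5 n\n')  # jieba format: word [freq] [pos]
with open('stop.txt', 'w', encoding='utf-8') as f:
    f.write('的\n了\n是\n')  # one stop word per line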

gensim--CBOW and skip-gram models (Word2Vec)

import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

vector_size = 100
# Word2Vec has many parameters worth digging into, for example:
# sg ({0, 1}, optional) -- Training algorithm: 1 for skip-gram; otherwise CBOW.
# note: the dimensionality argument is vector_size in gensim 4 (size in gensim 3)
model = Word2Vec(common_texts, vector_size=vector_size, window=5, min_count=1,
                 workers=4, seed=123)
# inspect a word vector
model.wv['human']
# save the model
model.save("word2vec.model")

# load the model back
model = Word2Vec.load("word2vec.model")
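After training (or loading), the KeyedVectors interface also supports similarity queries, e.g.:

# three nearest neighbours of 'human' in the embedding space
print(model.wv.most_similar('human', topn=3))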

# turn word vectors into sentence vectors (sum and average)
def w2v_to_d2v(model, common_texts, vector_size):
    w2v_feat = np.zeros((len(common_texts), vector_size))
    w2v_feat_avg = np.zeros((len(common_texts), vector_size))
    for i, line in enumerate(common_texts):
        num = 0
        for word in line:
            num += 1
            w2v_feat[i, :] += model.wv[word]
        w2v_feat_avg[i, :] = w2v_feat[i, :] / num
    return w2v_feat, w2v_feat_avg

w2v_feat,w2v_feat_avg = w2v_to_d2v(model,common_texts,vector_size)
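The summed or averaged vectors can then feed any downstream classifier. A minimal sketch with scikit-learn, using alternating placeholder labels purely for illustration:

from sklearn.linear_model import LogisticRegression

y_dummy = np.arange(len(common_texts)) % 2  # placeholder labels, illustration only
clf = LogisticRegression().fit(w2v_feat_avg, y_dummy)
print(clf.predict(w2v_feat_avg[:2]))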

TfidfVectorizer--TF-IDF weights

from gensim.test.utils import common_texts
from sklearn.feature_extraction.text import TfidfVectorizer

common_texts_list = [' '.join(x) for x in common_texts]
cv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')
vec = cv.fit_transform(common_texts_list)  # takes a list of space-joined sentences
# compute tf-idf values for new documents
vec_sample = cv.transform(['human interface computer'])
# get the weight matrix, one row per sentence
arr = vec.toarray()
# get each feature word (get_feature_names() in older scikit-learn)
word = cv.get_feature_names_out()
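To see which terms dominate a document, pair each feature name with its weight; e.g. the five highest-weighted terms of the first sentence:

import numpy as np
top_idx = np.argsort(arr[0])[::-1][:5]  # indices of the 5 largest weights in document 0
print([(word[i], round(arr[0][i], 3)) for i in top_idx])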

gensim--TF-IDF

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel

dictionary = Dictionary(common_texts)
# mapping from each token to its integer id
print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in common_texts]
model = TfidfModel(corpus)
model.save("tfidf.model")
model = TfidfModel.load("tfidf.model")
corpus_tfidf = model[corpus]
# tf-idf weights of the first sentence
tf_sample = corpus_tfidf[0]
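Each entry is a list of (token_id, weight) pairs; mapping the ids back through the dictionary makes it readable:

# map token ids back to the words they stand for
print([(dictionary[token_id], round(weight, 3)) for token_id, weight in tf_sample])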

gensim--DBOW Doc2Vec model

# the main knobs: vector dimensionality and number of training epochs
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
# dm=0 gives the DBOW model, dm=1 the DM (distributed memory) model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
# build the vocabulary, then train for several epochs
model = Doc2Vec(dm=0, vector_size=10, window=5, min_count=2, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
# save the model
model.save('d2v_dbow.model')
# load the model
model = Doc2Vec.load('d2v_dbow.model')
# inspect the vector of one sentence (model.dv is model.docvecs in older gensim)
model.dv[1]

# collect the document vectors into an array
X_d2v = np.array([model.dv[i] for i in range(len(common_texts))])
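For documents outside the training set, Doc2Vec infers a vector on the fly from the tokenised text:

# infer a vector for an unseen, tokenised document
new_vec = model.infer_vector(['human', 'interface', 'computer'])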

gensim--DM Doc2Vec model

# the main knobs: vector dimensionality and number of training epochs
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
# dm=0 gives the DBOW model, dm=1 the DM (distributed memory) model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
# build the vocabulary, then train for several epochs
model = Doc2Vec(dm=1, vector_size=10, window=5, min_count=2, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
# save the model
model.save('d2v_dm.model')
# load the model
model = Doc2Vec.load('d2v_dm.model')
# inspect the vector of one sentence
model.dv[0]
# collect the document vectors into an array
X_d2v = np.array([model.dv[i] for i in range(len(common_texts))])
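The trained document vectors support similarity queries too; with the integer tags used above, the documents nearest to document 0 are:

# documents most similar to the one tagged 0
print(model.dv.most_similar(0, topn=3))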

gensim--LDA model

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)

# per-document topic distribution
doc_topic = [a for a in lda[common_corpus]]
doc_topic[1]

# top words of each topic
for i in lda.print_topics(num_topics=10):
    print(i)
    
# reuse the tf-idf weights generated above as the corpus
# option 1: train LDA directly on the tf-idf corpus
from gensim.models import LdaModel
lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)
doc_topic = [a for a in lda[corpus_tfidf]]
# option 2: train on the raw bag-of-words corpus, infer on the tf-idf corpus
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
doc_topic = [a for a in lda[corpus_tfidf]]

import pandas as pd

def doc_topic_df(doc_topic, num_topics=10):
    # lda[...] omits topics below a probability threshold, so fill a
    # fixed-width row per document to keep the columns aligned by topic id
    df_list = []
    for doc in doc_topic:
        row = [0.0] * num_topics
        for topic_id, prob in doc:
            row[topic_id] = prob
        df_list.append(row)
    return pd.DataFrame(df_list)

# convert to a DataFrame
df_topic = doc_topic_df(doc_topic)
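Note that lda[...] silently drops low-probability topics (hence the padding above); get_document_topics with minimum_probability=0.0 returns the full distribution directly:

# full, unthresholded topic distribution of the first document
print(lda.get_document_topics(corpus_tfidf[0], minimum_probability=0.0))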
 

gensim--LSI model

import pandas as pd
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
# the tf-idf weighted corpus also works as the training input
model = LsiModel(common_corpus, id2word=common_dictionary)
# model = LsiModel(corpus_tfidf, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]  # vectorize the input corpus in BoW format

# LSI representation of the first sentence
vectorized_corpus[0]
doc_lsi = [a for a in vectorized_corpus]

def doc_lsi_df(doc_lsi):
    # LSI output is dense, so simply keep the weight of every dimension
    df_list = []
    for doc in doc_lsi:
        df_list.append([weight for _, weight in doc])
    return pd.DataFrame(df_list)

# convert to a DataFrame
df_doc_lsi = doc_lsi_df(doc_lsi)
    

Feature statistics

Other hand-crafted features worth extracting here include (a sketch follows the list):

  1. the number of words in the text
  2. indicator features for domain keywords, e.g. drunk driving (酒駕), drugs (毒品)
  3. dates appearing in the text
  4. locations appearing in the text
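
A minimal sketch of such features; the keyword list, the date pattern, and the column names are illustrative placeholders, with data_comment carried over from the segmentation step above:

import re
import pandas as pd

keywords = ['酒駕', '毒品']  # illustrative keyword list
date_pat = re.compile(r'\d{4}年\d{1,2}月\d{1,2}日')  # e.g. 2020年1月1日

def text_features(words):
    feats = {'n_words': len(words)}  # 1. number of words
    for kw in keywords:
        feats['kw_' + kw] = int(kw in words)  # 2. keyword indicators
    feats['n_dates'] = sum(bool(date_pat.search(w)) for w in words)  # 3. dates
    # 4. locations usually require an NER model and are omitted here
    return feats

# df_feat = pd.DataFrame(data_comment['words'].apply(text_features).tolist())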

 
