# LDA: Latent Dirichlet Allocation, commonly used for topic modeling of text
# (topic classification / clustering). Note that "LDA" is also the abbreviation
# for Linear Discriminant Analysis — don't confuse the two.
# Reference article: https://zhuanlan.zhihu.com/p/31470216
from pyspark.ml.clustering import LDA
# $example off$
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession for this example.
spark = SparkSession \
    .builder \
    .appName("LDAExample") \
    .getOrCreate()

# Load the corpus: libsvm format, each row is (label, sparse term-frequency vector).
dataset = spark.read.format("libsvm").load("sample_lda_libsvm_data.txt")

# Relevant LDA hyper-parameters (defaults apply unless set in the constructor):
#   optimizer:       'online' or 'em'
#   k:               number of topics
#   learningOffset:  down-weights early iterations; the larger the value,
#                    the less the early iterations count
#   learningDecay:   learning-rate decay; set between 0.5 and 1 to guarantee
#                    asymptotic convergence
#   subsamplingRate: mini-batch sampling fraction
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)  # log-likelihood (lower bound)
lp = model.logPerplexity(dataset)  # log-perplexity (upper bound)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Vocabulary size. NOTE(review): the return value was discarded in the original
# script as well; kept for parity, but consider printing it if it matters.
model.vocabSize()

topics = model.describeTopics(maxTermsPerTopic=3)  # top-3 weighted terms per topic
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# The transformed column holds each document's weight over the 10 topics
# (the weights sum to 1).
transformed = model.transform(dataset)
transformed.show(truncate=False)

# Fix: the original script never released the session; stop it so the example
# cleans up its JVM/driver resources when run standalone.
spark.stop()