TextRank是在Google的PageRank算法啓發下,針對文本裡的句子設計的權重算法,目標是自動摘要;同樣的思路也可以用在單詞上,用來抽取關鍵詞,本文的代碼即屬此類。它利用投票的原理,讓每一個單詞給它的鄰居(即落在同一個窗口內的單詞,術語稱窗口)投贊成票,票的權重取決於自己的票數。這是一個“先有雞還是先有蛋”的悖論,PageRank採用矩陣迭代收斂的方式解決了這個悖論,TextRank也不例外。更多關於TextRank的介紹,博主推薦碼農場的文章:http://www.hankcs.com/nlp/textrank-algorithm-to-extract-the-keywords-java-implementation.html
TextRank原理簡單,本文不再講解,直接上代碼,對代碼裏關鍵部分會進行詳細註釋:
/**
 * Builds the undirected word co-occurrence graph used as input for TextRank.
 *
 * Two distinct words are linked whenever they occur together inside a sliding
 * window of `windowSize` consecutive terms.
 */
class TextRankWordSet extends Serializable {

  /**
   * Number of consecutive terms that are considered to co-occur.
   * Values below 2 never produce a link, because the window then holds at most one term.
   */
  var windowSize: Int = 5

  /**
   * Builds the co-occurrence graph of a single document.
   *
   * @param document the terms of the document in reading order; each element is
   *                 converted with `toString`
   * @return a map from every distinct term to the set of distinct terms that shared
   *         a window with it; a term never lists itself, and a term that never
   *         co-occurred with a different term maps to an empty set
   */
  def transform(document: Iterable[_]): mutable.HashMap[String, mutable.HashSet[String]] = {
    val graph = mutable.HashMap.empty[String, mutable.HashSet[String]]
    val window = mutable.Queue.empty[String]

    document.foreach { term =>
      val word = term.toString
      // Every term gets a node, even if it ends up without neighbours.
      val neighbours = graph.getOrElseUpdate(word, mutable.HashSet.empty[String])

      window.enqueue(word)
      if (window.size > windowSize) {
        window.dequeue()
      }

      // Only pairs that involve the newly added word can be new: every other pair
      // in the window was already linked when its later member was enqueued.
      for (other <- window if other != word) {
        neighbours.add(other)
        graph(other).add(word)
      }
    }
    graph
  }

  /**
   * Builds one co-occurrence graph per document of the dataset.
   *
   * @param dataset documents, each a sequence of terms
   * @return the graphs produced by the single-document `transform`, one per input element
   */
  def transform[D <: Iterable[_]](dataset: RDD[D]): RDD[mutable.HashMap[String, mutable.HashSet[String]]] = {
    dataset.map(doc => transform(doc))
  }
}
/**
 * Scores the words of a co-occurrence graph with the TextRank iteration
 *
 *   WS(v) = (1 - d) + d * sum over neighbours u of v of WS(u) / degree(u)
 *
 * where `degree(u)` is the number of neighbours of `u`.
 */
class TextRankKeyword extends Serializable {

  /** Number of keywords wanted. Not read by any method of this class. */
  var numKeyword: Int = 10

  /** Damping factor `d` of the TextRank formula. */
  var d: Double = 0.85

  /** Upper bound on the number of iterations performed by `rank`. */
  var max_iter: Int = 200

  /** Iteration stops once no score changed by more than this amount in one pass. */
  var min_diff: Double = 0.001

  /**
   * Sorts the words of every document by score, highest first.
   *
   * @param dataset per-document maps from word to score
   * @return per-document sequences of (word, score) pairs in descending score order
   */
  def sortByValue(dataset: RDD[mutable.HashMap[String, Double]]): RDD[Seq[(String, Double)]] = {
    dataset.map(doc => doc.toSeq.sortWith(_._2 > _._2))
  }

  /**
   * Runs the TextRank iteration on one co-occurrence graph.
   *
   * All scores start at 0. Each pass computes the new scores from the scores of the
   * previous pass only, and the loop ends after `max_iter` passes or as soon as the
   * largest per-word change in a pass is at most `min_diff`.
   *
   * @param document map from each word to the set of its neighbours
   * @return map from each word of `document` to its score; empty if `document`
   *         is empty or `max_iter` is less than 1
   */
  def rank(document: mutable.HashMap[String, mutable.HashSet[String]]): mutable.HashMap[String, Double] = {
    var score = mutable.HashMap.empty[String, Double]
    var iteration = 0
    var converged = false

    while (!converged && iteration < max_iter) {
      iteration += 1
      val next = mutable.HashMap.empty[String, Double]
      var maxDiff = 0.0

      for ((word, neighbours) <- document) {
        // Each neighbour splits its own previous score evenly among its neighbours.
        val votes = neighbours.iterator
          .filter(_ != word) // a word does not vote for itself
          .map { neighbour =>
            // A neighbour that is not a node of the graph, or has no neighbours
            // of its own, has nothing to distribute.
            val degree = document.get(neighbour).fold(0)(_.size)
            if (degree == 0) 0.0 else d / degree * score.getOrElse(neighbour, 0.0)
          }
          .sum

        val updated = (1 - d) + votes
        next.put(word, updated)
        // Track the largest change of this pass for the convergence test below.
        maxDiff = math.max(maxDiff, math.abs(updated - score.getOrElse(word, 0.0)))
      }

      score = next
      converged = maxDiff <= min_diff
    }
    score
  }

  /**
   * Runs the TextRank iteration on every graph of the dataset.
   *
   * @param dataset per-document co-occurrence graphs
   * @return per-document maps from word to score
   */
  def rank(dataset: RDD[mutable.HashMap[String, mutable.HashSet[String]]]): RDD[mutable.HashMap[String, Double]] = {
    dataset.map(doc => rank(doc))
  }
}