package com.erongda.bigdata.spark.mllib.rmd
import com.erongda.bigdata.spark.ContantUtils
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Collaborative-filtering recommendation over the MovieLens (ml-100k) ratings
 * dataset using Spark MLlib's ALS algorithm:
 *  -a. predict the rating a given user would assign to a given movie
 *  -b. recommend 10 movies for a given user
 *  -c. recommend 10 users for a given movie
 *
 * Uses the RDD-based Spark MLlib API (org.apache.spark.mllib).
 */
object MovieALSRmd {

  def main(args: Array[String]): Unit = {
    // 1. Build the SparkContext (local mode, 3 threads).
    val sparkConf = new SparkConf()
      .setAppName("MovieALSRmd")
      .setMaster("local[3]")
    // .set("spark.driver.extraJavaOptions", "-Xss10m")
    val sc = SparkContext.getOrCreate(sparkConf)
    sc.setLogLevel("WARN")

    // 2. Read the raw ratings file. Each line is tab-separated:
    //    userId \t movieId \t rating \t timestamp
    val rawRatingsRDD: RDD[String] = sc.textFile(
      ContantUtils.LOCAL_DATA_DIC + "/als/movielens/ml-100k/u.data")
    println(s"Count = ${rawRatingsRDD.count()}")
    println(s"First: \n ${rawRatingsRDD.first()}")

    // 3. Parse lines into RDD[Rating], dropping malformed records.
    val ratingsRDD: RDD[Rating] = rawRatingsRDD
      .filter(line => line.length > 0 && line.split("\t").length == 4)
      .map { line =>
        // split the line; the trailing timestamp field is discarded
        val Array(userId, movieId, rating, _) = line.split("\t")
        Rating(userId.toInt, movieId.toInt, rating.toDouble)
      }

    // 4. Train an explicit-feedback ALS model: rank = 10 latent features,
    //    20 iterations.
    import org.apache.spark.mllib.recommendation.ALS
    val alsModel: MatrixFactorizationModel = ALS.train(ratings = ratingsRDD, rank = 10, iterations = 20)

    // Model evaluation: compute RMSE/MSE of predictions against the
    // (training) ratings themselves.
    import org.apache.spark.mllib.evaluation.RegressionMetrics
    // ((user, product)) -> actual rating
    val uprsRDD: RDD[((Int, Int), Double)] =
      ratingsRDD.map(r => ((r.user, r.product), r.rating))
    // def predict(usersProducts: RDD[(Int, Int)]): RDD[Rating]
    val predictUprs: RDD[((Int, Int), Double)] = alsModel
      .predict(uprsRDD.map(_._1))
      .map(r => ((r.user, r.product), r.rating))
    // Join gives (prediction, observation) pairs — the order expected
    // by RegressionMetrics.
    val predictAndActual: RDD[((Int, Int), (Double, Double))] = predictUprs.join(uprsRDD)
    val metrics = new RegressionMetrics(predictAndActual.map(_._2))
    println(s"RMSE = ${metrics.rootMeanSquaredError}")
    println(s"MSE = ${metrics.meanSquaredError}")

    /**
     * MatrixFactorizationModel holds the two factor matrices:
     * -a. user factor matrix:    alsModel.userFeatures
     * -b. product factor matrix: alsModel.productFeatures
     */
    // userId -> feature vector
    val userFeatures: RDD[(Int, Array[Double])] = alsModel.userFeatures
    // userFeatures.take(10).foreach(tuple => println(tuple._1 + " -> \n\t" + tuple._2.mkString(",")))
    // productId -> feature vector
    val productFeatures: RDD[(Int, Array[Double])] = alsModel.productFeatures
    // productFeatures.take(10).foreach(tuple => println(tuple._1 + " -> \n\t" + tuple._2.mkString(",")))

    // 5. Recommendation and rating prediction.
    // a. Predict one user's rating of one product:
    //    def predict(user: Int, product: Int): Double
    val predictRating: Double = alsModel.predict(196, 242)
    println(s"預測用戶196對電影242的評分:$predictRating")
    println("----------------------------------------")
    // b. Recommend 10 movies for a user:
    //    def recommendProducts(user: Int, num: Int): Array[Rating]
    val rmdMovies: Array[Rating] = alsModel.recommendProducts(196, 10)
    rmdMovies.foreach(println)
    println("----------------------------------------")
    // c. Recommend 10 users for a movie:
    //    def recommendUsers(product: Int, num: Int): Array[Rating]
    val rmdUsers = alsModel.recommendUsers(242, 10)
    rmdUsers.foreach(println)

    // 6. Persist the trained model so it can be reloaded later.
    //    FIX: the original code commented out save() yet still called load(),
    //    which fails on a fresh run ("path does not exist"). Saving
    //    unconditionally would instead fail on repeated runs ("path already
    //    exists"), so save only when the target path is absent.
    val modelPath = ContantUtils.LOCAL_DATA_DIC + "/als/ml-als-model"
    if (!new java.io.File(modelPath).exists()) {
      // override def save(sc: SparkContext, path: String): Unit
      alsModel.save(sc, modelPath)
    }

    // 7. Reload the persisted model and use it for prediction.
    //    override def load(sc: SparkContext, path: String): MatrixFactorizationModel
    val loadAlsModel: MatrixFactorizationModel = MatrixFactorizationModel.load(sc, modelPath)
    val loadedPredictRating: Double = loadAlsModel.predict(196, 242)
    println(s"加載模型預測用戶196對電影242的評分:$loadedPredictRating")

    // Keep the driver alive so the Spark Web UI stays reachable for inspection.
    Thread.sleep(10000000)

    // Release resources.
    sc.stop()
  }
}