Spark 構建協同過濾 ALS 推薦模型

package com.erongda.bigdata.spark.mllib.rmd

import com.erongda.bigdata.spark.ContantUtils
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Builds a collaborative-filtering recommendation model from the MovieLens movie-rating
  * data set using the ALS algorithm of the RDD-based Spark MLlib API, then uses the model to:
  *   - a. predict the rating a user would give to a movie (product)
  *   - b. recommend 10 movies (products) to a user
  *   - c. recommend 10 users for a movie
  */
object MovieALSRmd {

  /**
    * Program entry point: reads the rating file, trains an explicit-feedback ALS model,
    * prints RMSE/MSE computed on the training ratings, prints a few sample predictions and
    * recommendations, and finally loads a previously saved model and predicts with it.
    *
    * The SparkContext is stopped in a `finally` block so it is released even when a step
    * (for example loading the saved model) throws.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {

    // 1. Build the SparkContext instance.
    val sparkConf = new SparkConf()
      .setAppName("MovieALSRmd")
      .setMaster("local[3]")
    //  .set("spark.driver.extraJavaOptions", "-Xss10m")
    val sc = SparkContext.getOrCreate(sparkConf)
    sc.setLogLevel("WARN")

    try {
      // 2. Read the raw movie-rating data.
      val rawRatingsRDD: RDD[String] = sc.textFile(
        ContantUtils.LOCAL_DATA_DIC + "/als/movielens/ml-100k/u.data")
      println(s"Count = ${rawRatingsRDD.count()}")
      println(s"First: \n ${rawRatingsRDD.first()}")

      // 3. Convert the data into RDD[Rating].
      //    Every line is split exactly once; only lines with exactly four tab-separated
      //    fields match the pattern, so empty or malformed lines are dropped.
      //    The RDD is cached because it feeds both the ALS training and the evaluation below.
      val ratingsRDD: RDD[Rating] = rawRatingsRDD
        .map(_.split("\t"))
        .collect {
          case Array(userId, movieId, rating, _) =>
            Rating(userId.toInt, movieId.toInt, rating.toDouble)
        }
        .cache()

      // 4. Train the model with the explicit-feedback ALS training function.
      import org.apache.spark.mllib.recommendation.ALS
      // 20 iterations, 10 latent features (rank).
      val alsModel: MatrixFactorizationModel = ALS.train(ratings = ratingsRDD, rank = 10, iterations = 20)

      // Model evaluation.
      // NOTE(review): the metrics are computed on the same ratings used for training,
      // so they measure fit on the training data, not performance on held-out data.
      import org.apache.spark.mllib.evaluation.RegressionMetrics

      // (user, product) -> actual rating
      val uprsRDD: RDD[((Int, Int), Double)] = ratingsRDD.map(r => ((r.user, r.product), r.rating))
      // def predict(usersProducts: RDD[(Int, Int)]): RDD[Rating]
      // (user, product) -> predicted rating
      val predictUprs: RDD[((Int, Int), Double)] = alsModel
        .predict(uprsRDD.keys)
        .map(r => ((r.user, r.product), r.rating))
      // Joined value is (prediction, actual), the pair order RegressionMetrics expects.
      val predictAndActual: RDD[((Int, Int), (Double, Double))] = predictUprs.join(uprsRDD)
      val metrics = new RegressionMetrics(predictAndActual.values)

      println(s"RMSE = ${metrics.rootMeanSquaredError}")
      println(s"MSE = ${metrics.meanSquaredError}")


      /**
        * The resulting MatrixFactorizationModel contains two factor matrices:
        *      -a. the user factor matrix      (alsModel.userFeatures)
        *      -b. the product factor matrix   (alsModel.productFeatures)
        */
      // userId -> features (kept for inspection; see the commented-out print below)
      val userFeatures: RDD[(Int, Array[Double])] = alsModel.userFeatures
      // userFeatures.take(10).foreach(tuple => println(tuple._1 + " -> \n\t" + tuple._2.mkString(",")))
      // productId -> features (kept for inspection; see the commented-out print below)
      val productFeatures: RDD[(Int, Array[Double])] = alsModel.productFeatures
      // productFeatures.take(10).foreach(tuple => println(tuple._1 + " -> \n\t" + tuple._2.mkString(",")))


      // 5. Recommendation and rating prediction.
      // a. Predict one user's rating for one product:  def predict(user: Int, product: Int): Double
      val predictRating: Double = alsModel.predict(196, 242)
      println(s"預測用戶196對電影242的評分:$predictRating")

      println("----------------------------------------")

      // b. Recommend ten movies to a user:  def recommendProducts(user: Int, num: Int): Array[Rating]
      val rmdMovies: Array[Rating] = alsModel.recommendProducts(196, 10)
      rmdMovies.foreach(println)

      println("----------------------------------------")

      // c. Recommend ten users for a movie:  def recommendUsers(product: Int, num: Int): Array[Rating]
      val rmdUsers: Array[Rating] = alsModel.recommendUsers(242, 10)
      rmdUsers.foreach(println)

      // 6. Save the trained model so it can be loaded later for recommendation.
      // override def save(sc: SparkContext, path: String): Unit
      // alsModel.save(sc, ContantUtils.LOCAL_DATA_DIC + "/als/ml-als-model")

      // 7. Load the saved model from the file system and use it for prediction.
      // NOTE(review): the save call above is commented out, so this load only succeeds when
      // the model directory was written by an earlier run. Presumably save was disabled
      // because it fails when the target path already exists - confirm before re-enabling.
      // override def load(sc: SparkContext, path: String): MatrixFactorizationModel
      val loadAlsModel: MatrixFactorizationModel = MatrixFactorizationModel
        .load(sc, ContantUtils.LOCAL_DATA_DIC + "/als/ml-als-model")
      // Predict with the loaded model.
      val loaPredictRating: Double = loadAlsModel.predict(196, 242)
      println(s"加載模型預測用戶196對電影242的評分:$loaPredictRating")

      // Keep the application alive so the Spark web UI can be inspected.
      Thread.sleep(10000000)
    } finally {
      // Release Spark resources on every exit path, including failures above.
      sc.stop()
    }
  }

}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章