Creating a Recommendation Engine

Suppose we run an online movie site, and the company wants to use big-data analytics to build a recommendation engine that increases how many movies its members watch.

The ALS algorithm:

ALS is a model-based recommendation algorithm. Its basic idea is to factorize the sparse rating matrix into a model, estimate the values of the missing entries, and thereby obtain a basic trained model; that model can then score new user and item data. ALS fills in the missing entries using alternating least squares, a method developed from ordinary least squares.
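As a minimal sketch of the idea (toy data; all values are hypothetical), ALS factorizes the sparse user-item rating matrix into two low-rank factor matrices whose product approximates it, so that missing entries can be predicted:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object AlsToyExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("AlsToy").setMaster("local[2]"))
    // a tiny, sparse user-item rating matrix as (user, item, rating) triples
    val ratings = sc.parallelize(Seq(
      Rating(1, 101, 5.0), Rating(1, 102, 3.0),
      Rating(2, 101, 4.0), Rating(2, 103, 1.0),
      Rating(3, 102, 2.0), Rating(3, 103, 5.0)))
    // rank = number of latent factors; ALS alternates least-squares solves
    // between the user factor matrix and the item factor matrix
    val model = ALS.train(ratings, 2, 10, 0.01) // rank, iterations, lambda
    // estimate a missing entry, e.g. user 1's rating of item 103
    println(model.predict(1, 103))
    sc.stop()
  }
}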

Based on how users rate product items, ratings fall into two types:

Explicit ratings:

  •   The user rates a product directly on the site, e.g. 1-5 stars.

Implicit ratings:

  •   The site does not ask the user to rate anything, but it records whether the user clicked on a product.
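For implicit ratings, MLlib offers ALS.trainImplicit instead of ALS.train. A hedged sketch, assuming the clicks have already been aggregated into Rating(user, product, clickCount):

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.recommendation.{ALS, Rating, MatrixFactorizationModel}

// `clicks` holds implicit feedback: the rating field is a click/view count
def trainImplicitModel(clicks: RDD[Rating]): MatrixFactorizationModel =
  ALS.trainImplicit(clicks, 10, 10, 0.01, 1.0) // rank, iterations, lambda, alpha
  // alpha weights how much confidence to place in each observed interaction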

Creating the Recommend project:

1. Create the Recommend.scala file

2. Import the required libraries

import java.io.File
import scala.io.Source
import scala.io.StdIn.readLine // readLine moved to scala.io.StdIn in Scala 2.11
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
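To compile the project, spark-core and spark-mllib must be on the classpath (plus jfreechart and joda-time for the evaluation program later). A minimal build.sbt sketch; the version numbers are assumptions, so match them to your installation:

name := "Recommend"
scalaVersion := "2.11.12"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "2.2.0",
  "org.apache.spark" %% "spark-mllib" % "2.2.0",
  "org.jfree"        %  "jfreechart"  % "1.0.19",
  "joda-time"        %  "joda-time"   % "2.9.9"
)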

3. Business logic: the interactive menu is driven by the recommend() function; its full listing appears in step 6 below.

The main program is divided into three parts:

  • Data preparation stage
  • Training stage
  • Recommendation stage

4. SetLogger: suppress log output

  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF);
  }

5. Create the PrepareData() function

 def PrepareData(): (RDD[Rating], Map[Int, String]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    //During iterative computation Spark's RDD lineage grows rapidly, and the stack space it
    //needs grows with it until the stack overflows; explicitly setting a checkpoint directory fixes this.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    //val DataDir = "data"
    //val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
    val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")
    //----------------------2. Build the movie ID -> title lookup table-------------
    print("Reading movie data...")
    //val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    return (ratingsRDD, movieTitle)
  }
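For reference, u.data in MovieLens 100k is tab-separated as (user id, item id, rating, timestamp), and u.item is pipe-separated, beginning with (movie id, movie title); only the leading fields are used above. Sample lines:

u.data:  196  242  3  881250949        (fields separated by tabs)
u.item:  1|Toy Story (1995)|01-Jan-1995||...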

6. The recommend() function

  def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
    var choose = ""
    while (choose != "3") { //choosing 3 exits the program
      print("Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? ")
      choose = readLine() //read the user's choice
      if (choose == "1") { //1: recommend movies for a user
        print("Enter a user id: ")
        val inputUserID = readLine() //read the user ID
        RecommendMovies(model, movieTitle, inputUserID.toInt) //recommend movies for this user
      } else if (choose == "2") { //2: recommend interested users for a movie
        print("Enter a movie id: ")
        val inputMovieID = readLine() //read the movie ID
        RecommendUsers(model, movieTitle, inputMovieID.toInt) //recommend users for this movie
      }
    }
  }

7. The main function

  def main(args: Array[String]) {
    //suppress extraneous log output
    SetLogger
    println("==========Data preparation===============")
    val (ratings, movieTitle) = PrepareData()
    println("==========Training===============")
    print("Training the model with " + ratings.count() + " ratings... ")
    val model = ALS.train(ratings, 20, 15, 0.1)
    println("done!")
    println("==========Recommendation===============")
    recommend(model, movieTitle)
    println("Finished")
  }
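Training is the slow part, so it can be worth persisting the model rather than retraining on every run. A sketch using MLlib's built-in save/load; note it assumes the SparkContext sc is in scope (PrepareData above creates it internally, so you would need to expose it), and the path is only an example:

model.save(sc, "file:/home/hduser/SparkExample/Recommend/alsModel") // persist the factor matrices
val reloaded = MatrixFactorizationModel.load(sc, "file:/home/hduser/SparkExample/Recommend/alsModel")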

8. Complete Recommend.scala code

import java.io.File
import scala.io.Source
import scala.io.StdIn.readLine // readLine moved to scala.io.StdIn in Scala 2.11
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
object Recommend {
  def main(args: Array[String]) {
    //suppress extraneous log output
    SetLogger
    println("==========Data preparation===============")
    val (ratings, movieTitle) = PrepareData()
    println("==========Training===============")
    print("Training the model with " + ratings.count() + " ratings... ")
    val model = ALS.train(ratings, 20, 15, 0.1)
    println("done!")
    println("==========Recommendation===============")
    recommend(model, movieTitle)
    println("Finished")
  }

  def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
    var choose = ""
    while (choose != "3") { //choosing 3 exits the program
      print("Choose a recommendation type: 1. recommend movies for a user 2. recommend interested users for a movie 3. quit? ")
      choose = readLine() //read the user's choice
      if (choose == "1") { //1: recommend movies for a user
        print("Enter a user id: ")
        val inputUserID = readLine() //read the user ID
        RecommendMovies(model, movieTitle, inputUserID.toInt) //recommend movies for this user
      } else if (choose == "2") { //2: recommend interested users for a movie
        print("Enter a movie id: ")
        val inputMovieID = readLine() //read the movie ID
        RecommendUsers(model, movieTitle, inputMovieID.toInt) //recommend users for this movie
      }
    }
  }


  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF);
  }

  def PrepareData(): (RDD[Rating], Map[Int, String]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    //During iterative computation Spark's RDD lineage grows rapidly, and the stack space it
    //needs grows with it until the stack overflows; explicitly setting a checkpoint directory fixes this.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    //val DataDir = "data"
    //val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
    val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")
    //----------------------2. Build the movie ID -> title lookup table-------------
    print("Reading movie data...")
    //val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    return (ratingsRDD, movieTitle)
  }

  def RecommendMovies(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputUserID: Int) = {
    val RecommendMovie = model.recommendProducts(inputUserID, 10)
    var i = 1
    println("Recommended movies for user id " + inputUserID + ":")
    RecommendMovie.foreach { r =>
      println(i.toString() + ". " + movieTitle(r.product) + " rating: " + r.rating.toString())
      i += 1
    }
  }

  def RecommendUsers(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputMovieID: Int) = {
    val RecommendUser = model.recommendUsers(inputMovieID, 10)
    var i = 1
    println("Recommended user ids for movie id " + inputMovieID + " (" + movieTitle(inputMovieID) + "):")
    RecommendUser.foreach { r =>
      println(i.toString + ". user id: " + r.user + "   rating: " + r.rating)
      i = i + 1
    }
  }

}

9. Run Recommend.scala
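For example (assuming the sbt build sketched in step 2; adapt the command to your setup):

sbt "runMain Recommend"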

10. The run-time interface

11. Recommending movies for a user

12. Recommending interested users for a movie

Note:

If sc.setCheckpointDir("checkpoint") is omitted, the program crashes with a StackOverflowError.

During iterative computation Spark's RDD lineage grows very quickly, and the stack space it needs grows with it until the stack finally overflows.

The fix:

Add sc.setCheckpointDir(path) to the code to explicitly specify a checkpoint directory, and the problem is resolved.
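Checkpointing periodically truncates the lineage. With the ALS builder API you can also control how often this happens via setCheckpointInterval (available in newer Spark versions); a sketch:

sc.setCheckpointDir("checkpoint")
val model = new ALS() // org.apache.spark.mllib.recommendation.ALS
  .setRank(20).setIterations(15).setLambda(0.1)
  .setCheckpointInterval(5) // checkpoint the factor RDDs every 5 iterations
  .run(ratings)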

Reference: https://blog.csdn.net/asdfghjkl1993/article/details/78626439

13. Create AlsEvaluation.scala to tune the recommendation engine's parameters

It has three stages:

  • Data preparation stage
  • Training and validation stage
  • Testing stage

14. Create PrepareData() for data preparation

def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)

    val rawRatings = rawUserData.map(_.split("\t").take(3))

    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")

    //----------------------2. Build the movie ID -> title lookup table-------------
    print("Reading movie data...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    //----------------------4. Randomly split the data into 3 parts and return them-------------
    println("Splitting the data into:")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))

    println("  trainData:" + trainData.count() + "  validationData:" + validationData.count() + "  testData:" + testData.count())
    return (trainData, validationData, testData)
  }
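randomSplit draws randomly, so each run produces a different split; while tuning parameters it can help to fix the seed so the train/validation/test sets are reproducible. A sketch (the seed value is arbitrary):

val Array(trainData, validationData, testData) =
  ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1), seed = 42L)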

15. Train and evaluate

  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("-----Evaluating the rank parameter---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("-----Evaluating numIterations---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("-----Evaluating lambda---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("-----Cross-evaluating all parameters to find the best combination---------")
    val bestModel = evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
    return (bestModel)
  }

  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                        evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
    {

      var dataBarChart = new DefaultCategoryDataset()

      var dataLineChart = new DefaultCategoryDataset()
      for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {

        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)

        val parameterData =
          evaluateParameter match {
            case "rank"          => rank;
            case "numIterations" => numIterations;
            case "lambda"        => lambda
          }
        dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
        dataLineChart.addValue(time, "Time", parameterData.toString())
      }

      Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
    }

16. Chart.plotBarLineChart: drawing the bar chart and the line chart

import org.jfree.chart._
import org.jfree.data.xy._
import org.jfree.data.category.DefaultCategoryDataset
import org.jfree.chart.axis.NumberAxis
import org.jfree.chart.axis._
import java.awt.Color
import org.jfree.chart.renderer.category.LineAndShapeRenderer;
import org.jfree.chart.plot.DatasetRenderingOrder;
import org.jfree.chart.labels.StandardCategoryToolTipGenerator;
import java.awt.BasicStroke

object Chart {
  def plotBarLineChart(Title: String, xLabel: String, yBarLabel: String, yBarMin: Double, yBarMax: Double, yLineLabel: String, dataBarChart : DefaultCategoryDataset, dataLineChart: DefaultCategoryDataset): Unit = {

    //draw the bar chart
    val chart = ChartFactory
         .createBarChart(
        "", // bar chart title
        xLabel, // X-axis label
        yBarLabel, // bar chart Y-axis label
        dataBarChart , // bar chart data
        org.jfree.chart.plot.PlotOrientation.VERTICAL, //vertical orientation
        true, // include a legend
        true, // show tooltips
        false // no URL generator
        );
    //get the plot
    val plot = chart.getCategoryPlot();
    plot.setBackgroundPaint(new Color(0xEE, 0xEE, 0xFF));
    plot.setDomainAxisLocation(AxisLocation.BOTTOM_OR_RIGHT);
    plot.setDataset(1, dataLineChart); plot.mapDatasetToRangeAxis(1, 1)
    //bar chart Y axis
    val vn = plot.getRangeAxis(); vn.setRange(yBarMin, yBarMax);  vn.setAutoTickUnitSelection(true)
    //line chart Y axis
    val axis2 = new NumberAxis(yLineLabel); plot.setRangeAxis(1, axis2);
    val renderer2 = new LineAndShapeRenderer()
    renderer2.setToolTipGenerator(new StandardCategoryToolTipGenerator());
    //draw the bars first and the line second, so the line is not covered
    plot.setRenderer(1, renderer2);plot.setDatasetRenderingOrder(DatasetRenderingOrder.FORWARD);
    //create the frame
    val frame = new ChartFrame(Title,chart); frame.setSize(500, 500);
    frame.pack(); frame.setVisible(true)
  }
}

17. trainModel: training the model

 def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    println(f"Training parameters: rank:$rank%3d, iterations:$iterations%d, lambda = $lambda%.2f => Rmse = $Rmse%.2f" + ", training took " + duration.getMillis + " ms")
    (Rmse, duration.getStandardSeconds)
  }

18. Computing the RMSE

RMSE (root-mean-square error) measures the average error between the recommendation engine's predicted user preferences and the users' actual preferences. The smaller the RMSE, the smaller the error, i.e. the closer the predictions are to the true values and the more accurate the model.
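For n rating pairs, with predicted rating p_i and actual rating r_i:

    RMSE = sqrt( (1/n) * Σ (p_i - r_i)² )

This is exactly what computeRMSE below calculates.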

 

  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {

    val num = RatingRDD.count()
    val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
    val predictedAndRatings =
      predictedRDD.map(p => ((p.user, p.product), p.rating))
        .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
        .values
    math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
  }

19. evaluateAllParameter: finding the best parameter combination

  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                           rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
    {
      val evaluations =
        for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
          val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
          (rank, numIterations, lambda, rmse)
        }
      val Eval = (evaluations.sortBy(_._4)) //sort ascending by RMSE
      val BestEval = Eval(0) //the combination with the smallest RMSE
      println("Best model parameters: rank:" + BestEval._1 + ", iterations:" + BestEval._2 + ", lambda:" + BestEval._3 + ", rmse = " + BestEval._4)
      val bestModel = ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
      (bestModel)
    }

We cross-evaluate rank, numIterations, and lambda to find the best parameter combination.

20. Complete AlsEvaluation code

import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
import org.joda.time.format._
import org.joda.time._
import org.joda.time.Duration
import org.jfree.data.category.DefaultCategoryDataset
import org.apache.spark.mllib.regression.LabeledPoint

object AlsEvaluation {

  def main(args: Array[String]) {
    SetLogger
    println("==========Data preparation===============")
    val (trainData, validationData, testData) = PrepareData()
    trainData.persist(); validationData.persist(); testData.persist()
    println("==========Training and validation===============")
    val bestModel = trainValidation(trainData, validationData)
    println("==========Testing===============")
    val testRmse = computeRMSE(bestModel, testData)
    println("Testing bestModel on testData, rmse = " + testRmse)
    trainData.unpersist(); validationData.unpersist(); testData.unpersist()
  }

  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("-----Evaluating the rank parameter---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("-----Evaluating numIterations---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("-----Evaluating lambda---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("-----Cross-evaluating all parameters to find the best combination---------")
    val bestModel = evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
    return (bestModel)
  }
  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                        evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
    {

      var dataBarChart = new DefaultCategoryDataset()

      var dataLineChart = new DefaultCategoryDataset()
      for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {

        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)

        val parameterData =
          evaluateParameter match {
            case "rank"          => rank;
            case "numIterations" => numIterations;
            case "lambda"        => lambda
          }
        dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
        dataLineChart.addValue(time, "Time", parameterData.toString())
      }

      Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
    }

  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
                           rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
    {
      val evaluations =
        for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
          val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
          (rank, numIterations, lambda, rmse)
        }
      val Eval = (evaluations.sortBy(_._4)) //sort ascending by RMSE
      val BestEval = Eval(0) //the combination with the smallest RMSE
      println("Best model parameters: rank:" + BestEval._1 + ", iterations:" + BestEval._2 + ", lambda:" + BestEval._3 + ", rmse = " + BestEval._4)
      val bestModel = ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
      (bestModel)
    }
  def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {

    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    //During iterative computation Spark's RDD lineage grows rapidly, and the stack space it
    //needs grows with it until the stack overflows; explicitly setting a checkpoint directory fixes this.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. Create the user rating data-------------
    print("Reading user rating data...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)

    val rawRatings = rawUserData.map(_.split("\t").take(3))

    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    println("Total: " + ratingsRDD.count.toString() + " ratings")

    //----------------------2. Build the movie ID -> title lookup table-------------
    print("Reading movie data...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. Show record counts-------------
    val numRatings = ratingsRDD.count()
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("Total: ratings: " + numRatings + " users: " + numUsers + " movies: " + numMovies)
    //----------------------4. Randomly split the data into 3 parts and return them-------------
    println("Splitting the data into:")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))

    println("  trainData:" + trainData.count() + "  validationData:" + validationData.count() + "  testData:" + testData.count())
    return (trainData, validationData, testData)
  }

  def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    println(f"Training parameters: rank:$rank%3d, iterations:$iterations%d, lambda = $lambda%.2f => Rmse = $Rmse%.2f" + ", training took " + duration.getMillis + " ms")
    (Rmse, duration.getStandardSeconds)
  }

  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {

    val num = RatingRDD.count()
    val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
    val predictedAndRatings =
      predictedRDD.map(p => ((p.user, p.product), p.rating))
        .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
        .values
    math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
  }

  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF);
  }

}

21. Run AlsEvaluation

The bars show the RMSE; the line shows the training time.

Result chart for the rank evaluation

Result chart for the numIterations evaluation

Result chart for the lambda evaluation

After training, all parameters are cross-evaluated to find the best combination.

22. Update Recommend.scala with the best parameter combination
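The change is a single line in main(): replace the fixed ALS.train call with the best combination reported by AlsEvaluation. The values below are placeholders only; substitute whatever your own run reports:

// hypothetical example: suppose the cross-evaluation reported
// rank=15, iterations=20, lambda=0.1 as the best combination
val model = ALS.train(ratings, 15, 20, 0.1)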

