假設有一個在線電影網站,公司希望運用大數據分析推薦引擎,增加會員觀看電影次數。
ALS算法:
ALS算法是基於模型的推薦算法。其基本思想是對稀疏矩陣進行模型分解,評估出缺失項的值,以此來得到一個基本的訓練模型。然後依照此模型可以針對新的用戶和物品數據進行評估。ALS是採用交替的最小二乘法來算出缺失項的。交替的最小二乘法是在最小二乘法的基礎上發展而來的。
根據用戶對產品項目的評分方式,可分爲:
數據文件:
顯示評分:
- 網站上用戶對某個產品進行評分,如1~5顆星。
隱式評分:
- 用戶不會在網站上進行評分,但是網站會記錄用戶是否點選了某個產品。
創建Recommend項目:
1.創建Recommend.scala文件
2.導入鏈接庫
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
3.業務邏輯代碼
/**
 * Interactive console loop: lets the user choose between recommending movies
 * for a user ("1"), recommending users for a movie ("2"), or exiting ("3").
 */
def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
  var choose = ""
  while (choose != "3") { // "3" exits the loop
    print("請選擇要推薦類型 1.針對用戶推薦電影 2.針對電影推薦感興趣的用戶 3.離開?")
    // Predef.readLine() is deprecated since Scala 2.11; use scala.io.StdIn instead.
    choose = scala.io.StdIn.readLine()
    if (choose == "1") { // recommend movies for a user
      print("請輸入用戶id?")
      val inputUserID = scala.io.StdIn.readLine()
      RecommendMovies(model, movieTitle, inputUserID.toInt)
    } else if (choose == "2") { // recommend users for a movie
      print("請輸入電影的 id?")
      val inputMovieID = scala.io.StdIn.readLine()
      RecommendUsers(model, movieTitle, inputMovieID.toInt)
    }
  }
}
main程序代碼分爲3部分:
- 數據準備階段
- 訓練階段
- 推薦階段
4.SetLogger設置不顯示log信息
/** Silences log4j output and Spark's console progress bar. */
def SetLogger = {
  // Turn off the Spark ("org") and Hadoop ("com") logger namespaces.
  Seq("org", "com").foreach(name => Logger.getLogger(name).setLevel(Level.OFF))
  // Hide the stage progress bar Spark prints to the console.
  System.setProperty("spark.ui.showConsoleProgress", "false")
  Logger.getRootLogger().setLevel(Level.OFF)
}
5.創建PrepareData()函數
/**
 * Loads the MovieLens data files and returns the ratings RDD together with a
 * movieID -> title lookup map.
 */
def PrepareData(): (RDD[Rating], Map[Int, String]) = {
  val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
  // ALS iterations grow the RDD lineage rapidly; without an explicit checkpoint
  // directory the deep lineage eventually causes a StackOverflowError.
  sc.setCheckpointDir("checkpoint")
  //----------------------1. load user rating data-------------
  print("開始讀取用戶評分數據中...")
  val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
  // u.data is tab separated: userID, movieID, rating (the 4th field, timestamp, is dropped).
  val rawRatings = rawUserData.map(_.split("\t").take(3))
  val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
  // count() is a Spark action; run it once and reuse the result instead of
  // re-triggering the whole job a second time below.
  val numRatings = ratingsRDD.count()
  println("共計:" + numRatings.toString() + "條ratings")
  //----------------------2. build the movieID -> title map-------------
  print("開始讀取電影數據中...")
  val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
  // u.item is '|' separated; the first two fields are movieID and title.
  val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
    .map(array => (array(0).toInt, array(1))).collect().toMap
  //----------------------3. print record statistics-------------
  val numUsers = ratingsRDD.map(_.user).distinct().count()
  val numMovies = ratingsRDD.map(_.product).distinct().count()
  println("共計:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
  (ratingsRDD, movieTitle)
}
6.recommend推薦程序代碼
/**
 * Interactive console loop: lets the user choose between recommending movies
 * for a user ("1"), recommending users for a movie ("2"), or exiting ("3").
 */
def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
  var choose = ""
  while (choose != "3") { // "3" exits the loop
    print("請選擇要推薦類型 1.針對用戶推薦電影 2.針對電影推薦感興趣的用戶 3.離開?")
    // Predef.readLine() is deprecated since Scala 2.11; use scala.io.StdIn instead.
    choose = scala.io.StdIn.readLine()
    if (choose == "1") { // recommend movies for a user
      print("請輸入用戶id?")
      val inputUserID = scala.io.StdIn.readLine()
      RecommendMovies(model, movieTitle, inputUserID.toInt)
    } else if (choose == "2") { // recommend users for a movie
      print("請輸入電影的 id?")
      val inputMovieID = scala.io.StdIn.readLine()
      RecommendUsers(model, movieTitle, inputMovieID.toInt)
    }
  }
}
7.main函數
/** Entry point: prepare data, train the ALS model, then start the recommender loop. */
def main(args: Array[String]) {
  SetLogger // silence Spark/log4j output before anything starts
  println("==========數據準備階段===============")
  val (ratingData, titleMap) = PrepareData()
  println("==========訓練階段===============")
  print("開始使用 " + ratingData.count() + "條評比數據進行訓練模型... ")
  // rank = 20 latent factors, 15 iterations, lambda = 0.1
  val trainedModel = ALS.train(ratingData, 20, 15, 0.1)
  println("訓練完成!")
  println("==========推薦階段===============")
  recommend(trainedModel, titleMap)
  println("完成")
}
8.Recommend.scala全部代碼
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
object Recommend {

  /** Entry point: prepare data, train the ALS model, then start the recommender loop. */
  def main(args: Array[String]) {
    // Silence Spark/log4j output before anything starts.
    SetLogger
    println("==========數據準備階段===============")
    val (ratings, movieTitle) = PrepareData()
    println("==========訓練階段===============")
    print("開始使用 " + ratings.count() + "條評比數據進行訓練模型... ")
    // ALS matrix factorization: rank = 20 latent factors, 15 iterations, lambda = 0.1.
    val model = ALS.train(ratings, 20, 15, 0.1)
    println("訓練完成!")
    println("==========推薦階段===============")
    recommend(model, movieTitle)
    println("完成")
  }

  /**
   * Interactive console loop: recommend movies for a user ("1"), users for a
   * movie ("2"), or exit ("3").
   */
  def recommend(model: MatrixFactorizationModel, movieTitle: Map[Int, String]) = {
    var choose = ""
    while (choose != "3") { // "3" exits the loop
      print("請選擇要推薦類型 1.針對用戶推薦電影 2.針對電影推薦感興趣的用戶 3.離開?")
      // Predef.readLine() is deprecated since Scala 2.11; use scala.io.StdIn instead.
      choose = scala.io.StdIn.readLine()
      if (choose == "1") { // recommend movies for a user
        print("請輸入用戶id?")
        val inputUserID = scala.io.StdIn.readLine()
        RecommendMovies(model, movieTitle, inputUserID.toInt)
      } else if (choose == "2") { // recommend users for a movie
        print("請輸入電影的 id?")
        val inputMovieID = scala.io.StdIn.readLine()
        RecommendUsers(model, movieTitle, inputMovieID.toInt)
      }
    }
  }

  /** Silences log4j output and Spark's console progress bar. */
  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF)
  }

  /**
   * Loads the MovieLens data files and returns the ratings RDD together with a
   * movieID -> title lookup map.
   */
  def PrepareData(): (RDD[Rating], Map[Int, String]) = {
    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // ALS iterations grow the RDD lineage rapidly; without an explicit checkpoint
    // directory the deep lineage eventually causes a StackOverflowError.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. load user rating data-------------
    print("開始讀取用戶評分數據中...")
    val rawUserData = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.data")
    // u.data is tab separated: userID, movieID, rating (timestamp dropped).
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    // count() is a Spark action; run it once and reuse the result.
    val numRatings = ratingsRDD.count()
    println("共計:" + numRatings.toString() + "條ratings")
    //----------------------2. build the movieID -> title map-------------
    print("開始讀取電影數據中...")
    val itemRDD = sc.textFile("file:/home/hduser/SparkExample/Recommend/data/u.item")
    // u.item is '|' separated; the first two fields are movieID and title.
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. print record statistics-------------
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("共計:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
    (ratingsRDD, movieTitle)
  }

  /** Prints the top-10 recommended movies for the given user id. */
  def RecommendMovies(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputUserID: Int) = {
    val RecommendMovie = model.recommendProducts(inputUserID, 10)
    var i = 1
    println("針對用戶id" + inputUserID + "推薦下列電影:")
    RecommendMovie.foreach { r =>
      println(i.toString() + "." + movieTitle(r.product) + "評分:" + r.rating.toString())
      i += 1
    }
  }

  /** Prints the top-10 users predicted to be most interested in the given movie id. */
  def RecommendUsers(model: MatrixFactorizationModel, movieTitle: Map[Int, String], inputMovieID: Int) = {
    val RecommendUser = model.recommendUsers(inputMovieID, 10)
    var i = 1
    println("針對電影 id" + inputMovieID + "電影名:" + movieTitle(inputMovieID.toInt) + "推薦下列用戶id:")
    RecommendUser.foreach { r =>
      println(i.toString + "用戶id:" + r.user + " 評分:" + r.rating)
      i = i + 1
    }
  }
}
9.運行 Recommend.scala
10.運行界面
11.針對用戶推薦電影
12.針對電影推薦給感興趣的人
注意:
如果不加 sc.setCheckpointDir("checkpoint"),則會棧溢出 stackoverflow。
spark在迭代計算的過程中,會導致lineage(血統依賴鏈)劇烈變長,所需的棧空間也急劇上升,最終爆棧。
這類問題解決方法如下:
在代碼中加入 sc.setCheckpointDir(path),顯式指明checkpoint路徑,問題便可得到解決。
參考鏈接:https://blog.csdn.net/asdfghjkl1993/article/details/78626439
13.創建AlsEvaluation.scala調校推薦引擎參數
分爲三個階段
- 數據準備階段
- 訓練評估階段
- 測試階段
14.創建PrepareData()數據準備
/**
 * Loads u.data, builds the ratings RDD and randomly splits it 80/10/10 into
 * (trainData, validationData, testData).
 */
def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {
  val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
  // Consistency with the full AlsEvaluation code: an explicit checkpoint
  // directory keeps the deep ALS lineage from causing a StackOverflowError.
  sc.setCheckpointDir("checkpoint")
  //----------------------1. load user rating data-------------
  print("開始讀取用戶評分數據...")
  val DataDir = "data"
  val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
  // u.data is tab separated: userID, movieID, rating (timestamp dropped).
  val rawRatings = rawUserData.map(_.split("\t").take(3))
  val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
  // count() is a Spark action; run it once and reuse the result.
  val numRatings = ratingsRDD.count()
  println("共計:" + numRatings.toString() + "條ratings")
  //----------------------2. build the movieID -> title map-------------
  print("開始讀取電影數據...")
  val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
  val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
    .map(array => (array(0).toInt, array(1))).collect().toMap
  //----------------------3. print record statistics-------------
  val numUsers = ratingsRDD.map(_.user).distinct().count()
  val numMovies = ratingsRDD.map(_.product).distinct().count()
  println("共計:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
  //----------------------4. random 80/10/10 split-------------
  println("將數據分爲")
  val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))
  println(" trainData:" + trainData.count() + " validationData:" + validationData.count() + " testData:" + testData.count())
  (trainData, validationData, testData)
}
15.進行訓練評估
/**
 * Sweeps each hyper-parameter in isolation (charting RMSE and time), then
 * cross-evaluates all combinations and returns the best model found.
 */
def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
  println("-----評估 rank參數使用 ---------")
  evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
  println("-----評估 numIterations ---------")
  evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
  println("-----評估 lambda ---------")
  evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
  println("-----所有參數交叉評估找出最好的參數組合---------")
  // The cross-evaluation result is the method's return value.
  evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
}
/**
 * Trains one model per value of the swept parameter and plots RMSE (bar chart)
 * against training time (line chart).
 */
def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
  evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
{
  // DefaultCategoryDataset is itself mutable, so val bindings suffice.
  val dataBarChart = new DefaultCategoryDataset()
  val dataLineChart = new DefaultCategoryDataset()
  for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {
    val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
    // Label each data point with the value of the parameter being swept.
    val parameterData = evaluateParameter match {
      case "rank"          => rank
      case "numIterations" => numIterations
      case "lambda"        => lambda
    }
    dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
    dataLineChart.addValue(time, "Time", parameterData.toString())
  }
  Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
}
16.Chart.plotBarLineChart繪製出柱形圖與折線圖
import org.jfree.chart._
import org.jfree.data.xy._
import org.jfree.data.category.DefaultCategoryDataset
import org.jfree.chart.axis.NumberAxis
import org.jfree.chart.axis._
import java.awt.Color
import org.jfree.chart.renderer.category.LineAndShapeRenderer;
import org.jfree.chart.plot.DatasetRenderingOrder;
import org.jfree.chart.labels.StandardCategoryToolTipGenerator;
import java.awt.BasicStroke
object Chart {
/**
 * Draws a combined chart — a bar chart (left y-axis, e.g. RMSE) overlaid with a
 * line chart (right y-axis, e.g. training time) over the same categories — and
 * shows it in a Swing ChartFrame. NOTE: the dataset/renderer calls below are
 * order-sensitive in JFreeChart; do not reorder them.
 */
def plotBarLineChart(Title: String, xLabel: String, yBarLabel: String, yBarMin: Double, yBarMax: Double, yLineLabel: String, dataBarChart : DefaultCategoryDataset, dataLineChart: DefaultCategoryDataset): Unit = {
// Build the bar chart.
val chart = ChartFactory
.createBarChart(
"", // bar chart title (empty; the frame carries the title)
xLabel, // x-axis label
yBarLabel, // y-axis label for the bar series
dataBarChart , // bar chart data
org.jfree.chart.plot.PlotOrientation.VERTICAL,// vertical orientation
true, // include legend
true, // show tooltips
false // no URL generator
);
// Get the plot to attach the second (line) dataset and axis.
val plot = chart.getCategoryPlot();
plot.setBackgroundPaint(new Color(0xEE, 0xEE, 0xFF));
plot.setDomainAxisLocation(AxisLocation.BOTTOM_OR_RIGHT);
plot.setDataset(1, dataLineChart); plot.mapDatasetToRangeAxis(1, 1)
// Bar-chart y-axis: fixed [yBarMin, yBarMax] range, automatic tick units.
val vn = plot.getRangeAxis(); vn.setRange(yBarMin, yBarMax); vn.setAutoTickUnitSelection(true)
// Secondary y-axis for the line chart.
val axis2 = new NumberAxis(yLineLabel); plot.setRangeAxis(1, axis2);
val renderer2 = new LineAndShapeRenderer()
renderer2.setToolTipGenerator(new StandardCategoryToolTipGenerator());
// Draw the bars first and the line on top, so the line is not painted over.
plot.setRenderer(1, renderer2);plot.setDatasetRenderingOrder(DatasetRenderingOrder.FORWARD);
// Create the window and show it.
val frame = new ChartFrame(Title,chart); frame.setSize(500, 500);
frame.pack(); frame.setVisible(true)
}
}
17.trainModel訓練模型
/**
 * Trains one ALS model with the given hyper-parameters and returns
 * (validation RMSE, training time in seconds).
 */
def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
  val startTime = new DateTime()
  val model = ALS.train(trainData, rank, iterations, lambda)
  val endTime = new DateTime()
  val Rmse = computeRMSE(model, validationData)
  val duration = new Duration(startTime, endTime)
  // FIX: iterations is an Int — the float specifier %.2f displayed it as e.g.
  // "15.00" (and is rejected by strict f-interpolator type checking); use %d.
  println(f"訓練參數:rank:$rank%3d,iterations:$iterations%d ,lambda = $lambda%.2f 結果 Rmse=$Rmse%.2f" + "訓練需要時間" + duration.getMillis + "毫秒")
  (Rmse, duration.getStandardSeconds)
}
18.計算RMSE
RMSE是用來計算推薦系統對用戶喜好的預測,與用戶實際喜好的誤差平均值,通常RMSE越小代表誤差越小,即代表預測值與真實值越接近,準確度越高。
/**
 * Root-mean-square error of the model's predictions against the actual
 * ratings in RatingRDD.
 */
def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {
  val num = RatingRDD.count()
  // Key both sides by (user, product) so predictions can be joined to actuals.
  val actual = RatingRDD.map(r => ((r.user, r.product), r.rating))
  val predicted = model
    .predict(RatingRDD.map(r => (r.user, r.product)))
    .map(p => ((p.user, p.product), p.rating))
  val squaredErrors = predicted.join(actual).values
    .map { case (pred, act) => (pred - act) * (pred - act) }
  math.sqrt(squaredErrors.reduce(_ + _) / num)
}
19.evaluateAllParameter找出最佳的參數組合
/**
 * Grid search over every (rank, numIterations, lambda) combination; retrains
 * and returns the model with the lowest validation RMSE.
 */
def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
  rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
{
  val evaluations =
    for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
      val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
      (rank, numIterations, lambda, rmse)
    }
  // minBy is O(n) and avoids sorting the whole array just to take element 0
  // (the partial Eval(0) access).
  val BestEval = evaluations.minBy(_._4)
  // Message fix: add the missing separators around "lambda".
  println("最佳model參數:rank:" + BestEval._1 + ",iterations:" + BestEval._2 + ",lambda:" + BestEval._3 + ",結果rmse = " + BestEval._4)
  // Retrain on the training set with the winning parameters.
  ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
}
我們希望找出rank、numIterations、lambda,交叉評估找出最好的參數組合。
20.AlsEvaluation全部代碼
import java.io.File
import scala.io.Source
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.mllib.recommendation.{ ALS, Rating, MatrixFactorizationModel }
import org.joda.time.format._
import org.joda.time._
import org.joda.time.Duration
import org.jfree.data.category.DefaultCategoryDataset
import org.apache.spark.mllib.regression.LabeledPoint
object AlsEvaluation {

  /** Entry point: prepare the 3-way split, tune hyper-parameters, test the best model. */
  def main(args: Array[String]) {
    SetLogger
    println("==========數據準備階段===============")
    val (trainData, validationData, testData) = PrepareData()
    // Cache the splits: the grid search iterates over them many times.
    trainData.persist(); validationData.persist(); testData.persist()
    println("==========訓練驗證階段===============")
    val bestModel = trainValidation(trainData, validationData)
    println("==========測試階段===============")
    val testRmse = computeRMSE(bestModel, testData)
    println("使用testData測試bestModel," + "結果rmse = " + testRmse)
    trainData.unpersist(); validationData.unpersist(); testData.unpersist()
  }

  /**
   * Sweeps each hyper-parameter in isolation (charting RMSE and time), then
   * cross-evaluates all combinations and returns the best model found.
   */
  def trainValidation(trainData: RDD[Rating], validationData: RDD[Rating]): MatrixFactorizationModel = {
    println("-----評估 rank參數使用 ---------")
    evaluateParameter(trainData, validationData, "rank", Array(5, 10, 15, 20, 50, 100), Array(10), Array(0.1))
    println("-----評估 numIterations ---------")
    evaluateParameter(trainData, validationData, "numIterations", Array(10), Array(5, 10, 15, 20, 25), Array(0.1))
    println("-----評估 lambda ---------")
    evaluateParameter(trainData, validationData, "lambda", Array(10), Array(10), Array(0.05, 0.1, 1, 5, 10.0))
    println("-----所有參數交叉評估找出最好的參數組合---------")
    evaluateAllParameter(trainData, validationData, Array(5, 10, 15, 20, 25), Array(5, 10, 15, 20, 25), Array(0.05, 0.1, 1, 5, 10.0))
  }

  /**
   * Trains one model per value of the swept parameter and plots RMSE (bar
   * chart) against training time (line chart).
   */
  def evaluateParameter(trainData: RDD[Rating], validationData: RDD[Rating],
    evaluateParameter: String, rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]) =
  {
    // DefaultCategoryDataset is itself mutable, so val bindings suffice.
    val dataBarChart = new DefaultCategoryDataset()
    val dataLineChart = new DefaultCategoryDataset()
    for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) {
      val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
      // Label each data point with the value of the parameter being swept.
      val parameterData = evaluateParameter match {
        case "rank"          => rank
        case "numIterations" => numIterations
        case "lambda"        => lambda
      }
      dataBarChart.addValue(rmse, evaluateParameter, parameterData.toString())
      dataLineChart.addValue(time, "Time", parameterData.toString())
    }
    Chart.plotBarLineChart("ALS evaluations " + evaluateParameter, evaluateParameter, "RMSE", 0.58, 5, "Time", dataBarChart, dataLineChart)
  }

  /**
   * Grid search over every (rank, numIterations, lambda) combination; retrains
   * and returns the model with the lowest validation RMSE.
   */
  def evaluateAllParameter(trainData: RDD[Rating], validationData: RDD[Rating],
    rankArray: Array[Int], numIterationsArray: Array[Int], lambdaArray: Array[Double]): MatrixFactorizationModel =
  {
    val evaluations =
      for (rank <- rankArray; numIterations <- numIterationsArray; lambda <- lambdaArray) yield {
        val (rmse, time) = trainModel(trainData, validationData, rank, numIterations, lambda)
        (rank, numIterations, lambda, rmse)
      }
    // minBy is O(n) and avoids sorting the whole array just to take element 0.
    val BestEval = evaluations.minBy(_._4)
    println("最佳model參數:rank:" + BestEval._1 + ",iterations:" + BestEval._2 + ",lambda:" + BestEval._3 + ",結果rmse = " + BestEval._4)
    ALS.train(trainData, BestEval._1, BestEval._2, BestEval._3)
  }

  /**
   * Loads u.data, builds the ratings RDD and randomly splits it 80/10/10 into
   * (trainData, validationData, testData).
   */
  def PrepareData(): (RDD[Rating], RDD[Rating], RDD[Rating]) = {
    val sc = new SparkContext(new SparkConf().setAppName("RDF").setMaster("local[4]"))
    // Explicit checkpoint dir keeps the deep ALS lineage from overflowing the stack.
    sc.setCheckpointDir("checkpoint")
    //----------------------1. load user rating data-------------
    print("開始讀取用戶評分數據...")
    val DataDir = "data"
    val rawUserData = sc.textFile(new File(DataDir, "u.data").toString)
    // u.data is tab separated: userID, movieID, rating (timestamp dropped).
    val rawRatings = rawUserData.map(_.split("\t").take(3))
    val ratingsRDD = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    // count() is a Spark action; run it once and reuse the result.
    val numRatings = ratingsRDD.count()
    println("共計:" + numRatings.toString() + "條ratings")
    //----------------------2. build the movieID -> title map-------------
    print("開始讀取電影數據...")
    val itemRDD = sc.textFile(new File(DataDir, "u.item").toString)
    val movieTitle = itemRDD.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1))).collect().toMap
    //----------------------3. print record statistics-------------
    val numUsers = ratingsRDD.map(_.user).distinct().count()
    val numMovies = ratingsRDD.map(_.product).distinct().count()
    println("共計:ratings: " + numRatings + " User " + numUsers + " Movie " + numMovies)
    //----------------------4. random 80/10/10 split-------------
    println("將數據分爲")
    val Array(trainData, validationData, testData) = ratingsRDD.randomSplit(Array(0.8, 0.1, 0.1))
    println(" trainData:" + trainData.count() + " validationData:" + validationData.count() + " testData:" + testData.count())
    (trainData, validationData, testData)
  }

  /**
   * Trains one ALS model with the given hyper-parameters and returns
   * (validation RMSE, training time in seconds).
   */
  def trainModel(trainData: RDD[Rating], validationData: RDD[Rating], rank: Int, iterations: Int, lambda: Double): (Double, Double) = {
    val startTime = new DateTime()
    val model = ALS.train(trainData, rank, iterations, lambda)
    val endTime = new DateTime()
    val Rmse = computeRMSE(model, validationData)
    val duration = new Duration(startTime, endTime)
    // FIX: iterations is an Int — %d, not the float specifier %.2f.
    println(f"訓練參數:rank:$rank%3d,iterations:$iterations%d ,lambda = $lambda%.2f 結果 Rmse=$Rmse%.2f" + "訓練需要時間" + duration.getMillis + "毫秒")
    (Rmse, duration.getStandardSeconds)
  }

  /**
   * Root-mean-square error of the model's predictions against the actual
   * ratings in RatingRDD.
   */
  def computeRMSE(model: MatrixFactorizationModel, RatingRDD: RDD[Rating]): Double = {
    val num = RatingRDD.count()
    val predictedRDD = model.predict(RatingRDD.map(r => (r.user, r.product)))
    // Join predictions to actuals keyed by (user, product), then average the squares.
    val predictedAndRatings =
      predictedRDD.map(p => ((p.user, p.product), p.rating))
        .join(RatingRDD.map(r => ((r.user, r.product), r.rating)))
        .values
    math.sqrt(predictedAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / num)
  }

  /** Silences log4j output and Spark's console progress bar. */
  def SetLogger = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("com").setLevel(Level.OFF)
    System.setProperty("spark.ui.showConsoleProgress", "false")
    Logger.getRootLogger().setLevel(Level.OFF)
  }
}
21.運行AlsEvaluation
柱狀圖代表RMSE,折線圖代表時間。
評估rank參數的結果圖
評估numIterations
評估lambda
經過訓練,所有參數交叉評估找出最好的參數組合
22.修改Recommend.scala爲最佳參數組合