基於 Spark 機器學習 —— 物品推薦
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.jblas.DoubleMatrix
/**
* Created by LXM55 on 2016/1/26.
* 物品推薦
*/
object ItemRecommend {

  /** Entry point: trains an ALS matrix-factorization model on MovieLens
    * ratings, then prints the 10 items most similar (by cosine similarity
    * of latent factors) to item 567.
    */
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "ItemRecommend Test")
    // Each line of u.data is "user \t movie \t rating \t timestamp"; keep the first 3 fields.
    val rawData = sc.textFile("testdata/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map {
      case Array(user, movie, rating) =>
        Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    // ALS hyper-parameters: rank = 50, iterations = 10, lambda = 0.01.
    val model = ALS.train(ratings, 50, 10, 0.01)

    val itemId = 567
    // Latent-factor vector of the target item (lookup returns all values for the key).
    val itemFactor = model.productFeatures.lookup(itemId).head
    val itemVector = new DoubleMatrix(itemFactor)
    // Sanity check: the similarity of a vector with itself must be 1.0.
    val a = cosineSimilarity(itemVector, itemVector)
    println("aaa------->" + a)

    // Cosine similarity between every item's factor vector and the target item's.
    val sims = model.productFeatures.map {
      case (id, factor) =>
        val factorVector = new DoubleMatrix(factor)
        val sim = cosineSimilarity(factorVector, itemVector)
        (id, sim)
    }
    // Top-K most similar items to item 567 (the item itself appears at rank 1).
    val K = 10
    val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] {
      case (id, similarity) => similarity
    })
    // sortedSims already holds at most K elements; no extra take needed.
    println(sortedSims.mkString("\n"))
    sc.stop()
  }

  /** Cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|).
    * NOTE(review): returns NaN when either vector has zero norm.
    */
  def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
    vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
評價推薦結果:
package com.bailian.bigdata
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.jblas.DoubleMatrix
/**
* Created by LXM55 on 2016/1/26.
* 檢查推薦的相似物品
*/
object CheckItemRecommend {

  /** Entry point: trains an ALS model, then prints the titles of the 10
    * movies most similar to movie 567, excluding the movie itself.
    */
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "CheckItemRecommend Test")
    // Each line of u.data is "user \t movie \t rating \t timestamp"; keep the first 3 fields.
    val rawData = sc.textFile("testdata/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map {
      case Array(user, movie, rating) =>
        Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    // ALS hyper-parameters: rank = 50, iterations = 10, lambda = 0.01.
    val model = ALS.train(ratings, 50, 10, 0.01)

    // u.item is pipe-delimited: "movieId|title|..."; build an id -> title map.
    val movies = sc.textFile("testdata/u.item")
    val titles = movies.map(line => line.split("\\|").take(2))
      .map(array => (array(0).toInt, array(1)))
      .collectAsMap()

    val itemId = 567
    println("給定的電影名稱爲: " + titles(itemId))
    // Latent-factor vector of the target item.
    val itemFactor = model.productFeatures.lookup(itemId).head
    val itemVector = new DoubleMatrix(itemFactor)

    // Cosine similarity between every item's factor vector and the target item's.
    val sims = model.productFeatures.map {
      case (id, factor) =>
        val factorVector = new DoubleMatrix(factor)
        val sim = cosineSimilarity(factorVector, itemVector)
        (id, sim)
    }
    val K = 10
    // Take K+1 because the most similar item is always the target item itself.
    val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] {
      case (id, similarity) => similarity
    })
    // Drop the item itself (index 0) and map the remaining K ids to titles.
    val result = sortedSims2.slice(1, K + 1).map {
      case (id, sim) => (titles(id), sim)
    }.mkString("\n")
    println("被推薦電影爲---》 " + result)
    sc.stop()
  }

  /** Cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|).
    * NOTE(review): returns NaN when either vector has zero norm.
    */
  def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
    vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
推薦模型效果的評估:K值平均準確率
package com.bailian.bigdata
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.jblas.DoubleMatrix
/**
* Created by LXM55 on 2016/1/26.
* 推薦模型效果的評估:K值平均準確率
*/
object MAPK {

  /** Entry point: trains an ALS model and evaluates its recommendations with
    * Mean Average Precision at K (MAP@K), averaged over all users.
    */
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "MAPK Test")
    // Each line of u.data is "user \t movie \t rating \t timestamp"; keep the first 3 fields.
    val rawData = sc.textFile("testdata/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map {
      case Array(user, movie, rating) =>
        Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    // ALS hyper-parameters: rank = 50, iterations = 10, lambda = 0.01.
    val model = ALS.train(ratings, 50, 10, 0.01)

    // Ground truth for a single user: the movies user 789 actually rated.
    val moviesForUser = ratings.keyBy(_.user).lookup(789)
    val actualMovies = moviesForUser.map(_.product)
    println("actualMovies: " + actualMovies)

    // Top-k recommended movie ids for the same user.
    val k = 10
    val userId = 789
    val topKRecs = model.recommendProducts(userId, k)
    val predictedMovies = topKRecs.map(_.product)
    println("predictedMovies: " + predictedMovies.mkString("\n"))
    // APK for this single user.
    val apk10 = avgPrecisionK(actualMovies, predictedMovies, 10)
    println("apk10: " + apk10)

    // Collect every item's factor vector into one matrix (one row per item)
    // and broadcast it, so each task can score all items for a user with a
    // single matrix-vector multiply.
    val itemFactors = model.productFeatures.map {
      case (id, factor) => factor
    }.collect()
    val itemMatrix = new DoubleMatrix(itemFactors)
    println(itemMatrix.rows, itemMatrix.columns)
    val imBroadcast = sc.broadcast(itemMatrix)
    println("imBroacast: " + imBroadcast)

    // For every user: score all items, sort descending, emit ranked item ids.
    // NOTE(review): `_._2 + 1` assumes movie ids are the 1-based row index of
    // itemMatrix, i.e. productFeatures was collected in contiguous id order —
    // verify this holds for the data set in use.
    val allRecs = model.userFeatures.map { case (userId, array) =>
      val userVector = new DoubleMatrix(array)
      val scores = imBroadcast.value.mmul(userVector)
      val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
      val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
      (userId, recommendedIds)
    }
    // Ground truth for all users: every (user, movie) pair, grouped by user.
    val userMovies = ratings.map {
      case Rating(user, product, rating) => (user, product)
    }.groupBy(_._1)
    // Join predictions with ground truth, compute APK per user, then average.
    val meanAPK = allRecs.join(userMovies).map { case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2).toSeq
      avgPrecisionK(actual, predicted, k)
    }.reduce(_ + _) / allRecs.count()
    println("MAPK--------->" + meanAPK)
    sc.stop()
  }

  /** Average Precision at K (APK).
    *
    * Walks the first `k` predictions in rank order; each hit contributes
    * precision-at-that-rank, and the sum is normalized by min(|actual|, k).
    *
    * @param actual    ground-truth relevant item ids
    * @param predicted ranked predicted item ids (best first)
    * @param k         cutoff rank
    * @return APK score in [0, 1]
    */
  def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
    val predK = predicted.take(k)
    var score = 0.0
    var numHits = 0.0
    for ((p, i) <- predK.zipWithIndex) {
      if (actual.contains(p)) {
        numHits += 1.0
        // Precision at rank i+1 for this hit.
        score += numHits / (i.toDouble + 1.0)
      }
    }
    if (actual.isEmpty) {
      // Convention: with no relevant items there is nothing to get wrong.
      1.0
    } else {
      score / math.min(actual.size, k).toDouble
    }
  }
}
注意:參照《Spark 機器學習》這本書的例子實現,如有不足之處請指出。