通過自定義分區的方式,實現對點擊流日誌統計,並取出每個模塊中點擊排行前三的鏈接。
點擊日誌樣本:
20160321101954 http://java.study.163.com/java/course/javaee.shtml
20160321101954 http://java.study.163.com/java/course/android.shtml
20160321101954 http://java.study.163.com/java/video.shtml
20160321101954 http://java.study.163.com/java/teacher.shtml
20160321101954 http://java.study.163.com/java/course/android.shtml
20160321101954 http://php.study.163.com/php/teacher.shtml
20160321101954 http://net.study.163.com/net/teacher.shtml
/**
 * Counts clicks per URL in a click-stream log and, using one custom partition per
 * host (module), keeps the three most-clicked links of every host.
 *
 * Created by zn on 2017/5/4.
 */
object UrlCountPartition {

  /**
   * Runs the job with a local two-thread master: reads the log, sums clicks per URL,
   * repartitions the totals by host and writes the top links of each partition as text.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Number of top-ranked links kept per host, as stated in the task description.
    val topN = 3

    val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // rdd1 splits each line on tabs and emits (URL, 1). Lines with fewer than two
      // fields (e.g. blank lines) are skipped instead of failing on the index access.
      val rdd1 = sc.textFile("/Users/ning/Downloads/wangyiyun.log")
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(f => (f(1), 1))

      // Total number of clicks per URL.
      val rdd2 = rdd1.reduceByKey(_ + _)

      // Re-key every total by the URL's host: (host, (url, clicks)).
      val rdd3 = rdd2.map { case (url, clicks) =>
        val host = new URL(url).getHost
        (host, (url, clicks))
      }

      // Distinct hosts, collected to the driver so the partitioner can index them.
      val hosts = rdd3.map(_._1).distinct().collect()

      // Use the custom partitioner so that every host gets its own partition.
      val hostParitioner = new HostParitioner(hosts)
      // val rdd4 = rdd3.partitionBy(new HashPartitioner(hosts.length))
      val rdd4 = rdd3.partitionBy(hostParitioner).mapPartitions { it =>
        // Sort the partition by click count, highest first, and keep the leaders.
        it.toList.sortBy(_._2._2)(Ordering[Int].reverse).take(topN).iterator
      }

      rdd4.saveAsTextFile("/Users/ning/Downloads/out.spark")
      //println(rdd4.collect().toBuffer)
    } finally {
      // Always release the SparkContext, even when a stage above fails.
      sc.stop()
    }
  }
}
/**
 * Partitioner that decides which partition a record goes to, based on its key.
 *
 * Every entry of `ins` is assigned its position in the array as partition index;
 * keys that do not appear in `ins` fall back to partition 0.
 *
 * @param ins keys that each receive their own partition index
 */
class HostParitioner(ins: Array[String]) extends Partitioner {
  // Lookup table from key to partition index. If a key occurs more than once in
  // `ins`, the index of its last occurrence is the one that is kept.
  val parMap = new mutable.HashMap[String, Int]()
  parMap ++= ins.zipWithIndex

  // Number of entries of `ins` that were indexed.
  var count = ins.length

  override def numPartitions: Int = ins.length

  override def getPartition(key: Any): Int =
    parMap.get(key.toString) match {
      case Some(index) => index
      case None => 0 // unknown keys share partition 0
    }
}