Spark Streaming物品排名

Scala:

package cn.spark.study.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.hive.HiveContext

/**
 * Streaming job that, every 10 seconds, prints the five most-clicked products
 * of each category over the last 60 seconds.
 *
 * Input lines are read from a text socket and are expected to be
 * space-separated as `name product category`.
 */
object Top5HotProduct {

  /** Schema of the per-window click-count table that is queried with SQL. */
  private val ClickCountSchema = StructType(Array(
    StructField("category", StringType, true),
    StructField("product", StringType, true),
    StructField("click_count", IntegerType, true)))

  /** Ranks products inside each category by click count and keeps the first 5. */
  private val Top5Query =
    "SELECT category,product,click_count " +
      "FROM (" +
        "SELECT " +
          "category," +
          "product," +
          "click_count," +
          "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank " +
        "FROM product_click_log" +
      ") tmp " +
      "WHERE rank<=5"

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Top5HotProduct")
    // One micro-batch per second.
    val ssc = new StreamingContext(conf, Seconds(1))

    // Built once and reused by every window; constructing a HiveContext per
    // batch repeats the expensive Hive client / metastore initialisation.
    val hiveContext = new HiveContext(ssc.sparkContext)

    val productClickLogsDStream = ssc.socketTextStream("spark1", 7777)

    // Turn each "name product category" line into ((category, product), 1).
    // A tuple key is used instead of a "category_product" string so that names
    // containing '_' are not mis-split later. Lines with fewer than three
    // fields (e.g. blank lines) are dropped instead of failing the batch.
    val categoryProductPairsDStream = productClickLogsDStream
      .map(_.split(" "))
      .filter(_.length >= 3)
      .map(fields => ((fields(2), fields(1)), 1))

    // Every 10 seconds, count the clicks seen during the last 60 seconds.
    val categoryProductCountsDStream = categoryProductPairsDStream.reduceByKeyAndWindow(
      (v1: Int, v2: Int) => v1 + v2,
      Seconds(60),
      Seconds(10))

    categoryProductCountsDStream.foreachRDD { categoryProductCountsRDD =>
      // (category, product, click_count) rows, ready to be queried with SQL.
      val categoryProductCountRowRDD = categoryProductCountsRDD.map {
        case ((category, product), count) => Row(category, product, count)
      }

      val categoryProductCountDF =
        hiveContext.createDataFrame(categoryProductCountRowRDD, ClickCountSchema)

      // Re-registering under the same name replaces the previous window's table.
      categoryProductCountDF.registerTempTable("product_click_log")

      val top5ProductDF = hiveContext.sql(Top5Query)

      top5ProductDF.show()
    }

    ssc.start()
    ssc.awaitTermination()
  }

}

Java:

package cn.spark.study.streaming;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

/**
 * 與Spark SQL整合使用,top3熱門商品實時統計
 * @author Administrator
 *
 */
public class Top5HotProduct {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf()
				.setMaster("local[2]")
				.setAppName("Top5HotProduct");  
		JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
		
		JavaReceiverInputDStream<String> productClickLogsDStream = jssc.socketTextStream("spark1", 9999);
		
		JavaPairDStream<String, Integer> categoryProductPairsDStream = productClickLogsDStream
				.mapToPair(new PairFunction<String, String, Integer>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Tuple2<String, Integer> call(String productClickLog)
							throws Exception {
						String[] productClickLogSplited = productClickLog.split(" "); 
						return new Tuple2<String, Integer>(productClickLogSplited[2] + "_" + 
								productClickLogSplited[1], 1);
					}
					
				});
		

		JavaPairDStream<String, Integer> categoryProductCountsDStream = 
				categoryProductPairsDStream.reduceByKeyAndWindow(
						
						new Function2<Integer, Integer, Integer>() {

							private static final long serialVersionUID = 1L;
				
							@Override
							public Integer call(Integer v1, Integer v2) throws Exception {
								return v1 + v2;
							}
							
						}, Durations.seconds(60), Durations.seconds(10));  
		

		categoryProductCountsDStream.foreachRDD(new Function<JavaPairRDD<String,Integer>, Void>() {
			
			private static final long serialVersionUID = 1L;

			@Override
			public Void call(JavaPairRDD<String, Integer> categoryProductCountsRDD) throws Exception {
				// 將該RDD,轉換爲JavaRDD<Row>的格式
				JavaRDD<Row> categoryProductCountRowRDD = categoryProductCountsRDD.map(
						
						new Function<Tuple2<String,Integer>, Row>() {

							private static final long serialVersionUID = 1L;

							@Override
							public Row call(Tuple2<String, Integer> categoryProductCount)
									throws Exception {
								String category = categoryProductCount._1.split("_")[0];
								String product = categoryProductCount._1.split("_")[1];
								Integer count = categoryProductCount._2;
								return RowFactory.create(category, product, count);   
							}
							
						});
				
				// 然後,執行DataFrame轉換
				List<StructField> structFields = new ArrayList<StructField>();
				structFields.add(DataTypes.createStructField("category", DataTypes.StringType, true)); 
				structFields.add(DataTypes.createStructField("product", DataTypes.StringType, true));  
				structFields.add(DataTypes.createStructField("click_count", DataTypes.IntegerType, true));  
				StructType structType = DataTypes.createStructType(structFields);
				
				HiveContext hiveContext = new HiveContext(categoryProductCountsRDD.context());
				
				DataFrame categoryProductCountDF = hiveContext.createDataFrame(
						categoryProductCountRowRDD, structType);
				
				// 將60秒內的每個種類的每個商品的點擊次數的數據,註冊爲一個臨時表
				categoryProductCountDF.registerTempTable("product_click_log");  
				
				// 執行SQL語句,針對臨時表,統計出來每個種類下,點擊次數排名前5的熱門商品
				DataFrame top3ProductDF = hiveContext.sql(
						"SELECT category,product,click_count "
						+ "FROM ("
							+ "SELECT "
								+ "category,"
								+ "product,"
								+ "click_count,"
								+ "row_number() OVER (PARTITION BY category ORDER BY click_count DESC) rank "
							+ "FROM product_click_log"  
						+ ") tmp "
						+ "WHERE rank<=5");
				
				top3ProductDF.show();      
				
				return null;
			}
			
		});
		
		jssc.start();
		jssc.awaitTermination();
		jssc.close();
	}
	
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章