Spark Core求topN案例

 

 

案例需求:

1、對文本文件內的數字,取最大的前3個。
2、對每個班級內的學生成績,取出前3名。(分組取topN)

實例:

Java版本:取TopN和分組取TopN

import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Prints the three largest integers found in a text file.
 *
 * <p>Every line of the input is parsed as one integer. The input path may be
 * given as the first program argument; when no argument is supplied the
 * original hard-coded desktop path is used.
 *
 * @author Administrator
 */
public class Top3 {

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_INPUT = "C://Users//Administrator//Desktop//top.txt";

	/**
	 * Runs the job on a local master and prints the result to standard output.
	 *
	 * @param args optional; {@code args[0]} overrides the input file path
	 */
	public static void main(String[] args) {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

		SparkConf conf = new SparkConf()
				.setAppName("Top3")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// try/finally so the context is closed even when a job fails,
		// e.g. on a line that cannot be parsed as an integer.
		try {
			JavaRDD<String> lines = sc.textFile(inputPath);

			// Key every line by its numeric value so the RDD can be sorted by key.
			JavaPairRDD<Integer, String> pairs = lines.mapToPair(

					new PairFunction<String, Integer, String>() {

						private static final long serialVersionUID = 1L;

						@Override
						public Tuple2<Integer, String> call(String t) throws Exception {
							// trim() tolerates stray whitespace around the number
							return new Tuple2<Integer, String>(Integer.valueOf(t.trim()), t);
						}

					});

			// false = descending order, so the largest numbers come first.
			JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);

			JavaRDD<Integer> sortedNumbers = sortedPairs.map(

					new Function<Tuple2<Integer, String>, Integer>() {

						private static final long serialVersionUID = 1L;

						@Override
						public Integer call(Tuple2<Integer, String> v1) throws Exception {
							return v1._1;
						}

					});

			List<Integer> sortedNumberList = sortedNumbers.take(3);

			for (Integer num : sortedNumberList) {
				System.out.println(num);
			}
		} finally {
			sc.close();
		}
	}

}
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

/**
 * Prints the three highest scores of every class (grouped top-N).
 *
 * <p>Each input line has the form {@code <className> <score>}, separated by a
 * single space. The input path may be given as the first program argument;
 * when no argument is supplied the original hard-coded desktop path is used.
 *
 * @author Administrator
 */
public class GroupTop3 {

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_INPUT = "C://Users//Administrator//Desktop//score.txt";

	/**
	 * Runs the job on a local master and prints the result to standard output.
	 *
	 * @param args optional; {@code args[0]} overrides the input file path
	 */
	public static void main(String[] args) {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

		SparkConf conf = new SparkConf()
				.setAppName("Top3")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// try/finally so the context is closed even when a job fails.
		try {
			JavaRDD<String> lines = sc.textFile(inputPath);

			// Parse "<className> <score>" into a (className, score) pair.
			JavaPairRDD<String, Integer> pairs = lines.mapToPair(

					new PairFunction<String, String, Integer>() {

						private static final long serialVersionUID = 1L;

						@Override
						public Tuple2<String, Integer> call(String line) throws Exception {
							String[] lineSplited = line.split(" ");
							return new Tuple2<String, Integer>(lineSplited[0],
									Integer.valueOf(lineSplited[1]));
						}

					});

			JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

			JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(

					new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {

						private static final long serialVersionUID = 1L;

						@Override
						public Tuple2<String, Iterable<Integer>> call(
								Tuple2<String, Iterable<Integer>> classScores)
								throws Exception {
							// Kept sorted in descending order; null marks an unused slot.
							Integer[] top3 = new Integer[3];

							String className = classScores._1;
							Iterator<Integer> scores = classScores._2.iterator();

							while (scores.hasNext()) {
								Integer score = scores.next();

								for (int i = 0; i < top3.length; i++) {
									if (top3[i] == null) {
										// first free slot: everything before it is >= score
										top3[i] = score;
										break;
									} else if (score > top3[i]) {
										// shift the smaller entries one slot down, dropping the last
										for (int j = top3.length - 1; j > i; j--) {
											top3[j] = top3[j - 1];
										}
										top3[i] = score;
										break;
									}
								}
							}

							// A class with fewer than three scores leaves trailing null
							// slots; return only the filled prefix so no "null" is printed.
							int filled = 0;
							while (filled < top3.length && top3[filled] != null) {
								filled++;
							}

							return new Tuple2<String,
									Iterable<Integer>>(className, Arrays.asList(Arrays.copyOf(top3, filled)));
						}

					});

			top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

				private static final long serialVersionUID = 1L;

				@Override
				public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
					System.out.println("class: " + t._1);
					Iterator<Integer> scoreIterator = t._2.iterator();
					while (scoreIterator.hasNext()) {
						Integer score = scoreIterator.next();
						System.out.println(score);
					}
					System.out.println("=======================================");
				}

			});
		} finally {
			sc.close();
		}
	}

}

Scala版本:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext


/**
 * Prints the three largest integers found in a text file.
 *
 * Every line of the input is parsed as one integer. The input path may be
 * given as the first program argument; when no argument is supplied the
 * original hard-coded desktop path is used.
 */
object Top3 {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInput = "C://Users//Administrator//Desktop//top.txt"

  /**
   * Runs the job on a local master and prints the result to standard output.
   *
   * @param args optional; `args(0)` overrides the input file path
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInput)

    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)

    // try/finally so the context is stopped even when a job fails,
    // e.g. on a line that cannot be parsed as an integer.
    try {
      val lines = sc.textFile(inputPath, 1)
      // Key every line by its numeric value; trim tolerates stray whitespace.
      val pairs = lines.map(line => (line.trim.toInt, line))
      // Descending order, so the largest numbers come first.
      val sortedNumbers = pairs.sortByKey(ascending = false).map(_._1)
      val top3Number = sortedNumbers.take(3)

      top3Number.foreach(println)
    } finally {
      sc.stop()
    }
  }

}

Scala版本:分組取topN(此例取每組前5名):

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Prints the five highest scores of every class (grouped top-N).
 *
 * Each input line has the form `<className> <score>`, separated by a single
 * space. The input path may be given as the first program argument; when no
 * argument is supplied the original hard-coded path is used.
 */
object GroupTopN {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInput = "D:\\score.txt"

  /**
   * Runs the job on a local master and prints the result to standard output.
   *
   * @param args optional; `args(0)` overrides the input file path
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInput)
    // Number of scores kept per class; a local val so closures capture a plain Int.
    val topN = 5

    val conf = new SparkConf().setAppName("groupTopN").setMaster("local")
    val sc = new SparkContext(conf)

    // try/finally so the context is stopped even when a job fails.
    try {
      // Sample data is listed in the comment at the end of this file.
      val baseRDD = sc.textFile(inputPath).cache()
      val pairRDD = baseRDD.map { line =>
        val fields = line.split(" ") // split once instead of once per field
        (fields(0), fields(1).toInt)
      }

      val groupRDD = pairRDD.groupByKey()

      groupRDD.foreach(println(_))

      // Sort each group in descending order and keep at most topN scores.
      // A group with fewer than topN scores yields a shorter array instead of
      // being padded with zeros, and zero or negative scores are ranked too.
      val groupTopRDD = groupRDD.map { case (className, scores) =>
        (className, scores.toSeq.sorted(Ordering[Int].reverse).take(topN).toArray)
      }

      groupTopRDD.foreach { case (className, top) =>
        println("================================")
        top.foreach(score => println(s"$className : $score"))
      }
    } finally {
      sc.stop()
    }
  }
}

/*
class1 77
class3 67
class2 81
class3 73
class1 91
class2 69
class2 97
class3 90
class1 57
class3 55
class1 57
class2 81
class1 93
class3 79
class2 81
class1 99
class3 99
class1 91
class2 81
class1 99
class3 79
class2 81
 */

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章