Spark Sorting Principles

Basic sorting in Spark

  • The classic wordcount sort: order words by their counts, descending

Java version: BasicSort

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class BasicSort {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(BasicSort.class.getSimpleName()).setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> linesRDD = sc.textFile("E:/test/word.txt");
        // Note: on Spark 2.x+ flatMap expects an Iterator, i.e. Arrays.asList(...).iterator()
        JavaRDD<String> wordsRDD = linesRDD.flatMap(x -> Arrays.asList(x.split(" ")));
        JavaPairRDD<String, Integer> pairsRDD = wordsRDD.mapToPair(x -> new Tuple2<String, Integer>(x, 1));
        JavaPairRDD<String, Integer> rwordsRDD = pairsRDD.reduceByKey((v1, v2) -> v1 + v2);
        // Swap to (count, word), sort descending by count, then swap back.
        List<Tuple2<String, Integer>> collect = rwordsRDD.mapToPair(x -> new Tuple2<Integer, String>(x._2, x._1))
                .sortByKey(false)
                .map(x -> new Tuple2<String, Integer>(x._2, x._1))
                .collect();
        for (Tuple2<String, Integer> x : collect) {
            System.out.println(x._1() + "---->" + x._2());
        }
        sc.close();
    }
}

Scala version: BasicSort

import org.apache.spark.{SparkConf, SparkContext}

object wordcount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)
    val wcRDD = sc.textFile("E:/test/word.txt").flatMap(_.split(" "))
      .map((_, 1)).reduceByKey(_ + _)
    // Swap to (count, word), sort descending, then swap back.
    val collect = wcRDD.map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1)).collect()
    collect.foreach(println)
  }
}
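
The two map() swaps around sortByKey can also be avoided with RDD.sortBy, which sorts on an extracted key directly. A minimal alternative sketch of the same job, assuming the same input file:

// sortBy takes a key-extraction function plus an ascending flag,
// so no (count, word) swap is needed.
val sorted = sc.textFile("E:/test/word.txt")
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false) // descending by word count
sorted.collect().foreach(println)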

Spark secondary sort

Java version: secondary sort

import java.io.Serializable;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SecondSortApp {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(SecondSortApp.class.getSimpleName()).setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("E:/test/sort.txt");
        // Use the composite key only for sorting; keep the original line as the value.
        List<Tuple2<SecondSort, String>> collect = lines
                .mapToPair(line -> new Tuple2<SecondSort, String>(
                        new SecondSort(line.split(" ")[0], line.split(" ")[1]), line))
                .sortByKey().collect();
        for (Tuple2<SecondSort, String> t : collect) {
            System.out.println(t._2());
        }
        sc.close();
    }
}

// The key must be Comparable (for sortByKey) and Serializable (it is shuffled).
class SecondSort implements Comparable<SecondSort>, Serializable {
    private int first;
    private int second;

    public SecondSort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public SecondSort(String first, String second) {
        this.first = Integer.valueOf(first.trim());
        this.second = Integer.valueOf(second.trim());
    }

    public SecondSort() {
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int compareTo(SecondSort o) {
        // First field ascending; ties broken by the second field, descending.
        // Integer.compare avoids the overflow risk of plain subtraction.
        int ret = Integer.compare(first, o.first);
        if (ret == 0) {
            ret = Integer.compare(o.second, second);
        }
        return ret;
    }
}

Scala version: secondary sort

import org.apache.spark.{SparkConf, SparkContext}

object SecondSortAPP {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortAPP").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("E:/test/sort.txt")
    val collect = linesRDD.map(line => (new SecondSort(line.split(" ")(0), line.split(" ")(1)), line))
      .sortByKey().collect()
    collect.foreach(x => println(x._2))
  }
}

// The key extends Ordered (for sortByKey) and Serializable (it is shuffled).
class SecondSort(val first: String, val second: String) extends Ordered[SecondSort] with Serializable {
  def getFirst() = first
  def getSecond() = second

  override def compare(that: SecondSort): Int = {
    // Compare numerically; comparing the raw strings would put "10" before "2".
    var ret = first.trim.toInt.compareTo(that.first.trim.toInt)
    if (ret == 0) {
      ret = second.trim.toInt.compareTo(that.second.trim.toInt)
    }
    ret
  }
}
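
Because Scala supplies an implicit Ordering for tuples (componentwise, left to right), the same secondary sort can also be sketched without a custom key class by building an (Int, Int) key. A minimal sketch, assuming each line of sort.txt holds two space-separated integers; for a descending second field, pass a custom Ordering instead, as the Java key class above does:

// The built-in (Int, Int) Ordering compares the first element,
// then breaks ties on the second -- both ascending.
val sorted = sc.textFile("E:/test/sort.txt")
  .map { line =>
    val parts = line.split(" ")
    ((parts(0).trim.toInt, parts(1).trim.toInt), line)
  }
  .sortByKey()
sorted.collect().foreach(t => println(t._2))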

Spark topN

Java version: Spark topN

import java.io.Serializable;
import java.util.Comparator;
import java.util.TreeSet;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

import scala.Tuple2;

public class TopN {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName(TopN.class.getName()).setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> linesRDD = sc.textFile("E:/test/topn.txt");
        int topn = Integer.valueOf(args[0]);
        Broadcast<Integer> topN = sc.broadcast(topn);
        JavaPairRDD<String, Iterable<String>> result = linesRDD
                .mapToPair(line -> new Tuple2<String, String>(line.split(" ")[0], line.split(" ")[1]))
                .groupByKey().mapToPair(x -> {
                    TreeSet<String> set = new TreeSet<String>(new Mycomparator() {
                        @Override
                        public int compare(String o1, String o2) {
                            int ret = o1.compareTo(o2);
                            if (ret == 0) {
                                // Treat equal scores as distinct so the TreeSet does not deduplicate.
                                ret = 1;
                            }
                            return ret;
                        }
                    });
                    for (String score : x._2()) {
                        set.add(score);
                        if (set.size() > topN.value()) {
                            // Ascending order, so pollLast() drops the largest element,
                            // keeping the N smallest; reverse the comparison for the largest.
                            set.pollLast();
                        }
                    }
                    return new Tuple2<String, Iterable<String>>(x._1, set);
                }).sortByKey();
        result.foreach(x -> System.out.println(x));
    }
}

// Plain Comparator is not Serializable, but the anonymous comparator is captured
// by the closure and shipped to executors, so it must be Serializable too.
interface Mycomparator extends Comparator<String>, Serializable {}

Scala version: Spark topN

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object TopN {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopN").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("E:/test/topn.txt")
    linesRDD.map(line => (line.split(" ")(0), line.split(" ")(1))).groupByKey()
      .sortByKey().map(x => MyTopN(3, x)).foreach(x => println(x))
  }

  def MyTopN(topn: Int, tuple: (String, Iterable[String])): (String, Iterable[String]) = {
    // Note: Scala and Java pass a custom comparator to TreeSet differently.
    // Java takes the Comparator in the constructor's parameter list;
    // Scala's TreeSet takes the Ordering in a separate parameter list.
    var set = mutable.TreeSet[String]()(new Ordering[String]() {
      override def compare(x: String, y: String): Int = {
        var ret = x.compareTo(y)
        if (ret == 0) {
          // Treat equal scores as distinct so the TreeSet does not deduplicate.
          ret = 1
        }
        ret
      }
    })
    for (s <- tuple._2) {
      set += s
      if (set.size > topn) {
        // Ascending order, so take(topn) keeps the topn smallest scores.
        set = set.take(topn)
      }
    }
    (tuple._1, set)
  }
}
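
groupByKey materializes every value of a key before the set is trimmed, which can be memory-heavy for skewed keys. aggregateByKey instead keeps at most topn elements per key on both sides of the shuffle. A minimal sketch under the same input format, comparing scores numerically rather than as strings (comparing the raw strings would put "9" after "10"):

// Each partition keeps at most topn scores, and partition results are
// merged pairwise, so no full per-key value list is ever built.
// sorted.take(topn) keeps the topn smallest, matching the ascending
// comparators above; use .sorted(Ordering[Int].reverse) for the largest.
val topn = 3
val top = sc.textFile("E:/test/topn.txt")
  .map { line =>
    val parts = line.split(" ")
    (parts(0), parts(1).trim.toInt)
  }
  .aggregateByKey(List.empty[Int])(
    (acc, v) => (v :: acc).sorted.take(topn), // within a partition
    (a, b) => (a ::: b).sorted.take(topn)     // across partitions
  )
  .sortByKey()
top.collect().foreach(println)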