Spark高級排序

1、基礎排序算法

sortByKey 如wordCount中一樣

2、二次排序算法
java版:
自定義排序的Key值

package com.quell.spark;

import scala.Serializable;
import scala.math.Ordered;

/**
 * Composite sort key for Spark secondary sort: orders by {@code first},
 * breaking ties with {@code second}.
 *
 * Implements scala.math.Ordered so Spark's sortByKey can compare keys, and
 * Serializable so keys can be shipped across the shuffle.
 */
public class SecondSort implements Ordered<SecondSort>, Serializable {

    private int first;
    private int second;

    public SecondSort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Primary ordering on {@code first}, tie broken by {@code second}.
     *
     * Uses Integer.compare instead of subtraction: {@code this.first - that.first}
     * overflows for widely separated values (e.g. {@code MIN_VALUE - 1} wraps to a
     * positive number), which would silently corrupt the sort order.
     */
    @Override
    public int compare(SecondSort that) {
        int cmp = Integer.compare(this.first, that.getFirst());
        return cmp != 0 ? cmp : Integer.compare(this.second, that.getSecond());
    }

    /** Same contract as {@link #compare(SecondSort)}; kept consistent by delegation. */
    @Override
    public int compareTo(SecondSort that) {
        return compare(that);
    }

    // The $less/$greater family is required by scala.math.Ordered; all delegate
    // to compare() so there is a single source of truth for the ordering.

    @Override
    public boolean $less(SecondSort that) {
        return compare(that) < 0;
    }

    @Override
    public boolean $greater(SecondSort that) {
        return compare(that) > 0;
    }

    @Override
    public boolean $less$eq(SecondSort that) {
        return compare(that) <= 0;
    }

    @Override
    public boolean $greater$eq(SecondSort that) {
        return compare(that) >= 0;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondSort that = (SecondSort) o;
        if (first != that.first) return false;
        return second == that.second;
    }

    @Override
    public int hashCode() {
        // Standard 31-based field combination, consistent with equals().
        int result = first;
        result = 31 * result + second;
        return result;
    }
}
package com.quell.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

/**
 * Secondary sort driver. Implementation steps:
 *  1. Define a custom sort key implementing the Ordered and Serializable interfaces.
 *  2. Load the input file into an RDD of &lt;key, value&gt; pairs.
 *  3. Run sortByKey so rows are ordered by the composite key.
 *  4. Drop the key and keep only the sorted lines.
 */
public class SecondSortApp {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SecondarySortApp").setMaster("local");
        // JavaSparkContext wraps the underlying Scala SparkContext.
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("E://test/sort.txt");

        // Build the composite key from the first two numbers on each line.
        JavaPairRDD<SecondSort, String> keyed = lines.mapToPair(
                (PairFunction<String, SecondSort, String>) line -> {
                    String[] fields = line.split(" ");
                    SecondSort sortKey =
                            new SecondSort(Integer.valueOf(fields[0]), Integer.valueOf(fields[1]));
                    return new Tuple2<SecondSort, String>(sortKey, line);
                });

        // sortByKey performs the actual secondary sort via SecondSort's ordering.
        JavaPairRDD<SecondSort, String> sortedPairs = keyed.sortByKey();

        // Strip the custom key, retaining only the original line text.
        JavaRDD<String> sortedLines = sortedPairs.map(
                (Function<Tuple2<SecondSort, String>, String>) pair -> {
                    System.out.println("sortedContent._1 " + (pair._1));
                    System.out.println("sortedContent._2 " + pair._2);
                    return pair._2;
                });

        sortedLines.foreach((VoidFunction<String>) line -> System.out.println(line));
    }
}

Scala版

package com.quell.spark
/**
 * Composite sort key for Spark secondary sort: orders by `first`,
 * breaking ties with `second`. Serializable so it survives the shuffle.
 */
class SecondarySortKey(val first: Int, val second: Int) extends Ordered[SecondarySortKey] with Serializable {
  /**
   * Uses Integer.compare rather than subtraction: `this.first - other.first`
   * overflows for widely separated values (e.g. Int.MinValue - 1 is positive),
   * which would corrupt the sort order.
   */
  def compare(other: SecondarySortKey): Int = {
    if (this.first != other.first) Integer.compare(this.first, other.first)
    else Integer.compare(this.second, other.second)
  }
}
package com.quell.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Scala secondary-sort driver: keys each line by its first two integers,
 * sorts with SecondarySortKey's ordering, then discards the key.
 */
object SecondarySortApp {
    def main(args: Array[String]): Unit = {
      val sc = sparkContext("sort")
      val lines = sc.textFile("e:/test/sort.txt")
      // Split each line once (the original split twice per line) and build the key.
      val pairWithSortKey = lines.map { line =>
        val fields = line.split(" ")
        (new SecondarySortKey(fields(0).toInt, fields(1).toInt), line)
      }
      val sorted = pairWithSortKey.sortByKey(true)
      val sortResult = sorted.map(_._2)
      sortResult.collect().foreach(println)
    }

  /**
   * Builds a local SparkContext — the entry point for creating RDDs and the
   * driver's channel to the cluster.
   *
   * Bug fix: the `name` parameter was previously ignored ("sort" was hard-coded);
   * it is now used as the application name.
   */
  def sparkContext(name: String) = {
    val conf = new SparkConf().setAppName(name).setMaster("local")
    val sc = new SparkContext(conf)
    sc
  }
}

學習於:

DT大數據夢工廠
新浪微博:www.weibo.com/ilovepains/
微信公衆號:DT_Spark
博客:http://blog.sina.com.cn/ilovepains
TEL:18610086859
Email:[email protected]

發佈了36 篇原創文章 · 獲贊 7 · 訪問量 3萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章