1、基礎排序算法
sortByKey 如wordCount中一樣
2、二次排序算法
java版:
自定義排序的Key值
package com.quell.spark;
import scala.Serializable;
import scala.math.Ordered;
/**
 * Composite sort key for Spark secondary sort: orders records by {@code first},
 * breaking ties with {@code second}.
 *
 * Implements scala.math.Ordered so Spark's sortByKey can order it directly,
 * and Serializable so instances can be shipped across the cluster shuffle.
 */
public class SecondSort implements Ordered<SecondSort>, Serializable {

    private int first;
    private int second;

    public SecondSort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Primary ordering on {@code first}, tie-broken by {@code second}.
     * Uses Integer.compare instead of subtraction: {@code a - b} overflows
     * for large-magnitude operands and can report the wrong sign.
     */
    @Override
    public int compareTo(SecondSort that) {
        int byFirst = Integer.compare(this.first, that.getFirst());
        return byFirst != 0 ? byFirst : Integer.compare(this.second, that.getSecond());
    }

    /** Required by scala.math.Ordered; same contract as compareTo. */
    @Override
    public int compare(SecondSort that) {
        return compareTo(that);
    }

    // The four Scala relational operators all delegate to compareTo so the
    // comparison logic lives in exactly one place.
    @Override
    public boolean $less(SecondSort that) {
        return compareTo(that) < 0;
    }

    @Override
    public boolean $greater(SecondSort that) {
        return compareTo(that) > 0;
    }

    @Override
    public boolean $less$eq(SecondSort that) {
        return compareTo(that) <= 0;
    }

    @Override
    public boolean $greater$eq(SecondSort that) {
        return compareTo(that) >= 0;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondSort that = (SecondSort) o;
        return first == that.first && second == that.second;
    }

    @Override
    public int hashCode() {
        int result = first;
        result = 31 * result + second;
        return result;
    }
}
package com.quell.spark;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
* 二次排序,具體的實現步驟:
* 1.按照Ordered和Serializable接口實現自定義排序的Key
* 2.將要二次排序的文件加載進來生成<Key,Value>類型的RDD
* 3.使用sortByKey基於自定義的Key進行二次排序
* 4.去除掉排序的Key,只保留排序的結果
*
*/
/**
 * Secondary-sort driver (Java API):
 *  1. Load the input file into an RDD of text lines.
 *  2. Map each line to a (SecondSort, line) pair, keyed by the line's
 *     first two space-separated integers.
 *  3. sortByKey orders pairs by the custom key (first, then second).
 *  4. Drop the synthetic key, keeping only the ordered lines.
 */
public class SecondSortApp {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SecondarySortApp").setMaster("local");
        // JavaSparkContext wraps the Scala SparkContext under the hood.
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // NOTE(review): input path is hard-coded — consider taking it from args.
            JavaRDD<String> lines = sc.textFile("E://test/sort.txt");
            // Key each line by its first two whitespace-separated integers.
            JavaPairRDD<SecondSort, String> pairs = lines.mapToPair(new PairFunction<String, SecondSort, String>() {
                @Override
                public Tuple2<SecondSort, String> call(String line) throws Exception {
                    String[] splited = line.split(" ");
                    SecondSort key = new SecondSort(Integer.valueOf(splited[0]), Integer.valueOf(splited[1]));
                    return new Tuple2<SecondSort, String>(key, line);
                }
            });
            // The custom key's ordering performs the secondary sort here.
            JavaPairRDD<SecondSort, String> sorted = pairs.sortByKey();
            // Strip the key, keeping only the sorted original lines.
            JavaRDD<String> secondSorted = sorted.map(new Function<Tuple2<SecondSort, String>, String>() {
                @Override
                public String call(Tuple2<SecondSort, String> sortContent) throws Exception {
                    // Debug trace: runs on the executor, so output appears in worker logs.
                    System.out.println("sortedContent._1 " + (sortContent._1));
                    System.out.println("sortedContent._2 " + sortContent._2);
                    return sortContent._2;
                }
            });
            secondSorted.foreach(new VoidFunction<String>() {
                @Override
                public void call(String sortedLine) throws Exception {
                    System.out.println(sortedLine);
                }
            });
        } finally {
            // Always release the Spark context, even if the job throws.
            sc.stop();
        }
    }
}
Scala版
package com.quell.spark
/**
 * Sort key for secondary sort: orders by `first`, breaking ties with `second`.
 * Mixes in Serializable so Spark can ship instances during the shuffle.
 *
 * The original compare relied on `this.first - other.first`, which overflows
 * for large-magnitude operands and can report the wrong sign; `Int.compare`
 * semantics (via `compare` on Int) are overflow-safe.
 */
class SecondarySortKey(val first: Int, val second: Int) extends Ordered[SecondarySortKey] with Serializable {
  /** Negative / zero / positive when this is less than / equal to / greater than `other`. */
  def compare(other: SecondarySortKey): Int = {
    val byFirst = this.first.compare(other.first)
    if (byFirst != 0) byFirst else this.second.compare(other.second)
  }
}
package com.quell.spark
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Secondary-sort driver (Scala API): load lines, key them with
 * SecondarySortKey built from the first two space-separated integers,
 * sort by that key, then drop the key and print the ordered lines.
 */
object SecondarySortApp {
  def main(args: Array[String]): Unit = {
    val sc = sparkContext("sort")
    try {
      // NOTE(review): input path is hard-coded — consider taking it from args.
      val lines = sc.textFile("e:/test/sort.txt")
      // Split each line once and key it by its first two integers.
      val pairWithSortKey = lines.map { line =>
        val fields = line.split(" ")
        (new SecondarySortKey(fields(0).toInt, fields(1).toInt), line)
      }
      val sorted = pairWithSortKey.sortByKey(true)
      // Drop the synthetic key; keep only the ordered lines.
      val sortResult = sorted.map(_._2)
      sortResult.collect().foreach(println)
    } finally {
      // Always release the Spark context, even if the job throws.
      sc.stop()
    }
  }

  /**
   * Builds a local SparkContext — the sole entry point for creating RDDs and
   * the driver's gateway to the cluster.
   * Fix: the `name` parameter is now actually used as the application name
   * (it was previously ignored in favor of a hard-coded "sort").
   */
  def sparkContext(name: String): SparkContext = {
    val conf = new SparkConf().setAppName(name).setMaster("local")
    new SparkContext(conf)
  }
}
學習於:
DT大數據夢工廠
新浪微博:www.weibo.com/ilovepains/
微信公衆號:DT_Spark
博客:http://blog.sina.com.cn/ilovepains
TEL:18610086859
Email:[email protected]