Java version:
package cn.spark.sparktest;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

import java.util.Arrays;

public class sortWordcount {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("sortWordcount");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("C://User//Desktop//spark.txt");

        // Split each line into words.
        // Note: a FlatMapFunction returning Iterable is the Spark 1.x API; Spark 2.x+ expects an Iterator.
        JavaRDD<String> mid = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });

        // Map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> pairs = mid.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        // Sum up the counts for each word
        JavaPairRDD<String, Integer> redu = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i, Integer h) throws Exception {
                return i + h;
            }
        });

        // At this point we have the count for each word.
        // The new requirement, though, is to sort the words by how often they occur, in descending order.
        // The elements of the word-count RDD look like (hello, 3), (you, 2), ...
        // To sort by count, we first have to flip them into the form (3, hello), (2, you), ...
        // Swap key and value
        JavaPairRDD<Integer, String> tran = redu.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });

        // Sort by key (the count), descending
        JavaPairRDD<Integer, String> sort = tran.sortByKey(false);

        // Swap value and key back again
        JavaPairRDD<String, Integer> tran1 = sort.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> s) throws Exception {
                return new Tuple2<String, Integer>(s._2, s._1);
            }
        });

        // Now we have the word counts sorted by frequency; print them
        tran1.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                System.out.println(stringIntegerTuple2);
            }
        });

        sc.close();
    }
}
Test:
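Assuming spark.txt contains the single line "hello you hello me hello you" (a made-up input, purely for illustration), running the program with the local master should print each word with its count, in descending order of frequency:

(hello,3)
(you,2)
(me,1)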
Scala version:
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object sortWordcount {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("sortWordcount")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("C://Users//gaochen//Desktop//spark.txt")

    // Split into words and count each one
    val flat = lines.flatMap(x => x.split(" "))
    val mp = flat.map(x => (x, 1))
    val red = mp.reduceByKey(_ + _)

    // Swap (word, count) into (count, word), sort by count descending, then swap back
    val tran = red.map(x => (x._2, x._1))
    val sortCount = tran.sortByKey(false)
    val tran1 = sortCount.map(x => (x._2, x._1))

    tran1.foreach(x => println(x))

    sc.stop()
  }
}
Test:
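The Scala program prints the same descending ordering as the Java version.

As an aside, the swap-sort-swap steps can be collapsed with RDD.sortBy, which takes the sort key as a function; internally it performs essentially the same keyBy/sortByKey/values sequence, so this is a readability win rather than a performance one. A minimal sketch, assuming the same SparkContext sc and input path as above:

// sortBy takes the sort key directly, so no key/value swapping is needed
val counts = sc.textFile("C://Users//gaochen//Desktop//spark.txt")
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)

counts.sortBy(_._2, ascending = false).foreach(println)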