- wordcount: 統計詞頻, 排序 (jdk7, jdk8-lambda表達式)
- 歷年溫度最值: max, min, avg
<!--maven依賴 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.0</version>
</dependency>
jdk8 : lambda表達式
idea中添加maven jdk8構建依賴
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
//wordcount with jdk8 lambda expressions: read a text file, count word
//frequencies, then sort by word (ascending) and by frequency (descending).
public static void main(String[] args){
    //conf: run locally with app name "wc"
    SparkConf conf = new SparkConf();
    conf.setMaster("local");
    conf.setAppName("wc");
    //context
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd1 = context.textFile("/home/wang/txt/word.txt");
    //lambda: split every line on spaces into individual words
    JavaRDD<String> rdd2 = rdd1.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
    //drop blank tokens, then pair each word with 1
    //(the blank filter matches the jdk7 version, which already had it)
    JavaPairRDD<String, Integer> rdd3 = rdd2
            .filter(s -> s.trim().length() > 0)
            .mapToPair(s -> new Tuple2<String, Integer>(s, 1));
    //sum the 1s per word: (a,1),(a,1) ==> (a,2)
    JavaPairRDD<String, Integer> rdd4 = rdd3.reduceByKey((x, y) -> x + y);
    //sort by word, ascending
    List<Tuple2<String, Integer>> list1 = rdd4.sortByKey(true).collect();
    //sort by frequency, descending: swap (word,count)->(count,word), sort
    //keys descending, swap back.
    //BUG FIX: the original called sortByKey() with no argument, which sorts
    //ascending by default — sortByKey(false) is required for descending order.
    JavaPairRDD<String, Integer> rdd5 = rdd4.mapToPair(x -> new Tuple2<Integer, String>(x._2, x._1))
            .sortByKey(false)
            .mapToPair(x -> new Tuple2<String, Integer>(x._2, x._1));
    List<Tuple2<String, Integer>> list2 = rdd5.collect();
}
jdk7 : spark api
//wordcount with the jdk7-style Spark api: same pipeline as the lambda
//version, written with anonymous inner classes instead of lambdas.
public static void main(String[] args){
    //conf, context, rdd1 ........ same as above
    //1, split each line on spaces: "a b c" ==> {a,b,c}
    JavaRDD<String> tokens = rdd1.flatMap(new FlatMapFunction<String, String>() {
        public Iterator<String> call(String line) throws Exception {
            return Arrays.asList(line.split(" ")).iterator();
        }
    });
    //2, pair every token with 1: {a,b,c} ==> (a,1),(b,1),(c,1)
    JavaPairRDD<String, Integer> ones = tokens.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String token) throws Exception {
            return new Tuple2<String, Integer>(token, 1);
        }
    });
    //discard pairs whose word is blank
    JavaPairRDD<String, Integer> nonBlank = ones.filter(new Function<Tuple2<String, Integer>, Boolean>() {
        public Boolean call(Tuple2<String, Integer> pair) throws Exception {
            return pair._1.trim().length() > 0;
        }
    });
    //3, sum the 1s per word: (a,1),(a,1) ==> (a,2)
    JavaPairRDD<String, Integer> counts = nonBlank.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer left, Integer right) throws Exception {
            return left + right;
        }
    });
    //4.1 sort alphabetically (ascending by word)
    List<Tuple2<String, Integer>> sortedByWord = counts.sortByKey().collect();
    //4.2 sort by frequency, descending: swap to (count,word), sort keys
    //descending, then swap back to (word,count)
    JavaPairRDD<Integer, String> swapped = counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
        public Tuple2<Integer, String> call(Tuple2<String, Integer> pair) throws Exception {
            return new Tuple2<Integer, String>(pair._2, pair._1);
        }
    });
    JavaPairRDD<String, Integer> byFrequency = swapped.sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
        public Tuple2<String, Integer> call(Tuple2<Integer, String> pair) throws Exception {
            return new Tuple2<String, Integer>(pair._2, pair._1);
        }
    });
    List<Tuple2<String, Integer>> sortedByFrequency = byFrequency.collect();
}