記錄利用spark core的函數,完成一些map reduce功能的練習,spark core有Transformation和Action兩種算子,Transformation完成中間轉變過程,不會把運算真的算出來,Action纔會最終把運算計算出來,所以運算必須以Action算子作爲結束。
Transformation算子:
map、filter、 flatMap、groupByKey 、reduceByKey、sortByKey、 cogroup。
Action算子:
reduce()、collect()、 count()、take()、save()、countByKey()。
0、共有的方法:
需要利用JavaSparkContext把數據變成spark的RDD數據,然後才能利用spark算子處理。
public static JavaSparkContext getSC(){
    // Build the local-mode Spark context shared by all exercises below.
    SparkConf conf = new SparkConf()
            .setAppName("transformation")
            .setMaster("local");
    return new JavaSparkContext(conf);
}
1、單詞計數
public static void wordCount(){
    // Counts how many times each word occurs across all input lines.
    // Pipeline: flatMap (line -> words) -> mapToPair (word -> (word, 1))
    //           -> reduceByKey (sum) -> foreach (print; the Action that
    //           actually triggers the computation).
    List<String> data = Arrays.asList("Google Bye GoodBye Hadoop code", "Java code Bye");
    JavaSparkContext sc = getSC();
    JavaRDD<String> lines = sc.parallelize(data);
    // Split each line on single spaces into individual words.
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String line) throws Exception {
            return Arrays.asList(line.split(" ")).iterator();
        }
    });
    // Pair every word with an initial count of 1.
    JavaPairRDD<String, Integer> wordOnes = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<>(word, 1);
        }
    });
    // Sum the counts of identical words.
    JavaPairRDD<String, Integer> wordCnt = wordOnes.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    // Action: print "word:count" for every entry.
    wordCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> o) throws Exception {
            System.out.println(o._1 + ":" + o._2);
        }
    });
}
/* 輸出:
Bye:2
Google:1
Java:1
code:2
GoodBye:1
Hadoop:1
*/
2、倒排索引
單詞作爲Key,文檔的ids作爲value,查看單詞在哪篇文檔中出現過。
public static void invertedIndex(){
    // Builds an inverted index: word -> ids of the documents containing it,
    // with words emitted in sorted order.
    List<Tuple2<Integer, String>> data = Arrays.asList(
            new Tuple2<>(1, "This is the content of document 1 it is very short"),
            new Tuple2<>(2, "This is the content of document 2 it is very long bilabial"),
            new Tuple2<>(3, "This is the a document of 3 I love programming"));
    JavaSparkContext sc = getSC();
    JavaPairRDD<Integer, String> docStr = sc.parallelizePairs(data);
    // (docId, text) -> one (word, docId) pair per DISTINCT word of the document.
    JavaPairRDD<String, Integer> strDocID = docStr.flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, String>, String, Integer>() {
        @Override
        public Iterator<Tuple2<String, Integer>> call(Tuple2<Integer, String> doc) throws Exception {
            // A HashMap de-duplicates words within a single document.
            Map<String, Integer> seen = new HashMap<>();
            for (String s : doc._2.split(" ")) {
                // Bug fix: the original called containsKey(word) on the whole
                // word LIST (always false); the membership test must use the
                // current word s.
                if (!seen.containsKey(s)) {
                    seen.put(s, doc._1);
                }
            }
            List<Tuple2<String, Integer>> wordDocID = new ArrayList<>();
            for (Map.Entry<String, Integer> e : seen.entrySet()) {
                wordDocID.add(new Tuple2<>(e.getKey(), e.getValue()));
            }
            return wordDocID.iterator();
        }
    });
    // Group doc ids per word, then sort by word (ascending) for stable output.
    JavaPairRDD<String, Iterable<Integer>> wordIDs = strDocID.groupByKey();
    JavaPairRDD<String, Iterable<Integer>> wordIDsSort = wordIDs.sortByKey(true);
    // Action: print "word:id1,id2," per word.
    wordIDsSort.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
        @Override
        public void call(Tuple2<String, Iterable<Integer>> o) throws Exception {
            System.out.print(o._1 + ":");
            Iterator<Integer> it = o._2.iterator();
            while (it.hasNext()) {
                System.out.print(it.next() + ",");
            }
            System.out.println("");
        }
    });
}
/* 輸出:
1:1,
2:2,
3:3,
I:3,
This:1,2,3,
a:3,
bilabial:2,
content:1,2,
document:1,2,3,
is:1,2,3,
it:1,2,
long:2,
love:3,
of:1,2,3,
programming:3,
short:1,
the:1,2,3,
very:1,2,
*/
3、N-Gram
N-Gram 指由N個字符組成的詞組,統計所有詞組出現的次數(本例按字符切分)。
public static void nGramSimple(){
    // Counts every contiguous character substring of length N (character
    // N-grams) across all input strings.
    List<String> data = Arrays.asList("abcabc", "abcabc", "bbcabc");
    final int N = 3;
    JavaSparkContext sc = getSC();
    JavaRDD<String> nGramData = sc.parallelize(data);
    // Slide a window of width N over each string, emitting (gram, 1) pairs.
    // Strings shorter than N produce nothing.
    JavaPairRDD<String, Integer> nGram = nGramData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
        @Override
        public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
            List<Tuple2<String, Integer>> pairList = new ArrayList<>();
            for (int index = 0; index + N <= str.length(); ++index) {
                pairList.add(new Tuple2<>(str.substring(index, index + N), 1));
            }
            return pairList.iterator();
        }
    });
    // Sum the counts of identical grams.
    JavaPairRDD<String, Integer> nGramCnt = nGram.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    // Action: print "gram:count".
    nGramCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> o) throws Exception {
            System.out.println(o._1 + ":" + o._2);
        }
    });
}
4、最常出現的前K個單詞
public static void topKFrequentWords(){
    // Prints the N most frequent words.
    // Strategy: count words with reduceByKey, keep a size-N min-heap per
    // partition (mapPartitions), then merge all partitions' candidates on
    // the driver through one final size-N min-heap.
    List<String> data = Arrays.asList("a b c d a a a a", "b b f f e e c b b b", "g h i j k f f f");
    final int N = 3;
    JavaSparkContext sc = getSC();
    JavaRDD<String> topKData = sc.parallelize(data);
    // Emit a (word, 1) pair for every word occurrence.
    JavaPairRDD<String, Integer> word = topKData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
        @Override
        public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
            List<Tuple2<String, Integer>> wordPair = new ArrayList<>();
            for (String s : str.split(" ")) {
                wordPair.add(new Tuple2<>(s, 1));
            }
            return wordPair.iterator();
        }
    });
    // Sum the counts of identical words.
    JavaPairRDD<String, Integer> wordCnt = word.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    // (word, count) holder ordered by count, so a PriorityQueue of TopKKey
    // behaves as a min-heap on the count.
    class TopKKey implements Ordered<TopKKey>, Serializable{
        private String word;
        private Integer cnt;
        public TopKKey(String word, int cnt) {
            this.word = word;
            this.cnt = cnt;
        }
        public void setWord(String word) { this.word = word; }
        public void setCnt(Integer cnt) { this.cnt = cnt; }
        public String getWord() { return word; }
        public Integer getCnt() { return cnt; }
        @Override
        public int compareTo(TopKKey that) {
            return this.cnt.compareTo(that.cnt);
        }
        @Override
        public int compare(TopKKey that) {
            // Keep both Ordered comparison entry points consistent.
            return compareTo(that);
        }
        // Bug fix: these previously returned a constant false, violating the
        // scala.math.Ordered contract; they now delegate to compareTo.
        @Override
        public boolean $less(TopKKey that) { return compareTo(that) < 0; }
        @Override
        public boolean $greater(TopKKey that) { return compareTo(that) > 0; }
        @Override
        public boolean $less$eq(TopKKey that) { return compareTo(that) <= 0; }
        @Override
        public boolean $greater$eq(TopKKey that) { return compareTo(that) >= 0; }
    }
    // Per-partition: retain only the N largest counts in a min-heap.
    // Bug fix: the FlatMapFunction's second type parameter is the ELEMENT
    // type (TopKKey), not Iterator<TopKKey>; call() returns an Iterator over
    // those elements.
    JavaRDD<TopKKey> topKHeaps = wordCnt.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, TopKKey>() {
        @Override
        public Iterator<TopKKey> call(Iterator<Tuple2<String, Integer>> wordCount) throws Exception {
            PriorityQueue<TopKKey> heap = new PriorityQueue<>();
            while (wordCount.hasNext()) {
                Tuple2<String, Integer> t = wordCount.next();
                TopKKey tk = new TopKKey(t._1, t._2);
                if (heap.size() < N) {
                    heap.add(tk);
                } else if (tk.compareTo(heap.peek()) > 0) {
                    // Candidate beats the smallest retained count: swap it in.
                    heap.poll();
                    heap.add(tk);
                }
            }
            return new ArrayList<>(heap).iterator();
        }
    });
    // Driver side: merge every partition's candidates with one last min-heap.
    List<TopKKey> topKValues = topKHeaps.collect();
    PriorityQueue<TopKKey> topKHeap = new PriorityQueue<>();
    for (TopKKey value : topKValues) {
        if (topKHeap.size() < N) {
            topKHeap.add(value);
        } else if (value.compareTo(topKHeap.peek()) > 0) {
            topKHeap.poll();
            topKHeap.add(value);
        }
    }
    // NOTE: PriorityQueue iteration order is unspecified, so the top-N words
    // print in arbitrary order.
    for (TopKKey topKKey : topKHeap) {
        System.out.println(topKKey.getWord() + ":" + topKKey.getCnt());
    }
}
5、二次排序
/**
 * Secondary-sort demo: orders "className score" lines by class name and then
 * by score (see {@link SecondSortKey#compareTo}), printed in descending order.
 */
public class SecondSortJava {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("wordCountApp").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        List<String> list = Arrays.asList("class1 67","class2 89","class1 78",
                "class2 90","class1 99","class3 34","class3 89");
        JavaRDD<String> rdd = sc.parallelize(list);
        // Wrap each line in a composite key so sortByKey compares both fields.
        JavaPairRDD<SecondSortKey, String> beginSortValues = rdd.mapToPair(new PairFunction<String, SecondSortKey, String>() {
            @Override
            public Tuple2<SecondSortKey, String> call(String line) throws Exception {
                String[] parts = line.split(" ");
                SecondSortKey secondSortKey = new SecondSortKey(parts[0], Integer.parseInt(parts[1]));
                return new Tuple2<>(secondSortKey, line);
            }
        });
        // false => sort in descending order.
        JavaPairRDD<SecondSortKey, String> sortValues = beginSortValues.sortByKey(false);
        // Action: print the original line for each sorted record.
        sortValues.foreach(new VoidFunction<Tuple2<SecondSortKey, String>>(){
            @Override
            public void call(Tuple2<SecondSortKey, String> o) throws Exception {
                System.out.println(o._2);
            }
        });
    }
}
// ^ + I 實現接口中的虛擬方法
// Composite sort key (class name, score): orders by first, then by second.
// Serializable because Spark ships keys between executors.
class SecondSortKey implements Ordered<SecondSortKey>, Serializable{
    private String first;
    private int second;
    public SecondSortKey(String first, int second) {
        this.first = first;
        this.second = second;
    }
    public void setFirst(String first) { this.first = first; }
    public void setSecond(int second) { this.second = second; }
    public String getFirst() { return first; }
    public int getSecond() { return second; }
    @Override
    public int compareTo(SecondSortKey that) {
        int comp = this.first.compareTo(that.first);
        // Tie on the class name: fall back to the numeric score.
        return comp != 0 ? comp : Integer.compare(this.second, that.second);
    }
    @Override
    public int compare(SecondSortKey that) {
        // Keep both Ordered comparison entry points consistent.
        return compareTo(that);
    }
    // Bug fix: these previously returned a constant false, violating the
    // scala.math.Ordered contract relied on by Spark/Scala sorting; they now
    // delegate to compareTo.
    @Override
    public boolean $less(SecondSortKey that) { return compareTo(that) < 0; }
    @Override
    public boolean $greater(SecondSortKey that) { return compareTo(that) > 0; }
    @Override
    public boolean $less$eq(SecondSortKey that) { return compareTo(that) <= 0; }
    @Override
    public boolean $greater$eq(SecondSortKey that) { return compareTo(that) >= 0; }
}