Spark 實現常用的map reduce功能 (Java版本)

記錄利用spark core的函數,完成一些map reduce功能的練習,spark core有Transformation和Action兩種算子,Transformation完成中間轉變過程,不會把運算真的算出來,Action纔會最終把運算計算出來,所以運算必須以Action算子作爲結束。

Transformation算子:
map、filter、 flatMap、groupByKey 、reduceByKey、sortByKey、 cogroup。
Action算子:
reduce()、collect()、 count()、take()、save()、countByKey()。

0、共有的方法:

需要利用JavaSparkContext把數據變成spark的RDD數據,然後才能利用spark算子處理。

// Builds the local-mode JavaSparkContext shared by every example below.
public static JavaSparkContext getSC(){
        SparkConf conf = new SparkConf()
                .setAppName("transformation")
                .setMaster("local");
        return new JavaSparkContext(conf);
    }

1、單詞計數

/**
 * Word count: splits each input line into words, maps every word to a
 * (word, 1) pair, sums the counts per word with reduceByKey, and prints
 * each (word, count) pair.
 *
 * Fixes over the original: the raw {@code List}/{@code JavaRDD}/
 * {@code JavaPairRDD}/{@code Tuple2} types are fully parameterized, and the
 * SparkContext is closed when the job finishes.
 */
public static void wordCount(){

        // Sample input: two lines of space-separated words.
        List<String> data = Arrays.asList("Google Bye GoodBye Hadoop code", "Java code Bye");

        // Turn the local collection into an RDD.
        JavaSparkContext sc = getSC();
        JavaRDD<String> lines = sc.parallelize(data);

        // Pipeline:
        //   line             -> "Google", "Bye", ...   (flatMap)
        //   word             -> (word, 1)              (mapToPair)
        //   (word, [1,1,..]) -> (word, count)          (reduceByKey)

        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        JavaPairRDD<String, Integer> wordOnes = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> wordCnt = wordOnes.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // foreach is the Action that actually triggers the computation.
        wordCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> o) throws Exception {
                System.out.println(o._1 + ":" + o._2);
            }
        });

        sc.close();
    }

/* 輸出:
Bye:2
Google:1
Java:1
code:2
GoodBye:1
Hadoop:1
*/

2、倒排索引

單詞作爲Key,文檔的ids作爲value,查看單詞在哪篇文檔中出現過。

/**
 * Inverted index: for every word, collects the ids of the documents it
 * appears in (word -> [docId, ...]), printed sorted by word.
 *
 * Fixes over the original: the per-document duplicate check called
 * {@code myMap.containsKey(word)} with the whole word LIST instead of the
 * current String {@code s}, so it never matched (the map only dedeuplicated
 * by accident because put() overwrote the same doc id). The check now uses
 * the current word. Raw RDD types are parameterized and the context is
 * closed at the end.
 */
public static void invertedIndex(){

        // Sample input: (docId, document text) pairs.
        List<Tuple2<Integer, String>> data = Arrays.asList(new Tuple2<>(1, "This is the content of document 1 it is very short"),
                new Tuple2<>(2, "This is the content of document 2 it is very long bilabial"),
                new Tuple2<>(3, "This is the a document of 3 I love programming"));

        JavaSparkContext sc = getSC();
        JavaPairRDD<Integer, String> docStr = sc.parallelizePairs(data);

        // Pipeline:
        //   (docId, text) -> (word, docId) for each DISTINCT word in the doc
        //   groupByKey collects all doc ids per word, sortByKey orders by word.

        JavaPairRDD<String, Integer> strDocID = docStr.flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(Tuple2<Integer, String> doc) throws Exception {
                List<String> words = Arrays.asList(doc._2.split(" "));
                List<Tuple2<String, Integer>> wordDocID = new ArrayList<>();

                // Deduplicate words within a single document so each doc id is
                // emitted at most once per word.
                Map<String, Integer> seen = new HashMap<>();
                for (String s : words) {
                    if(!seen.containsKey(s)){   // fixed: was containsKey(word) — the List, never a hit
                        seen.put(s, doc._1);
                    }
                }

                for (Map.Entry<String, Integer> entry : seen.entrySet()) {
                    wordDocID.add(new Tuple2<>(entry.getKey(), entry.getValue()));
                }
                return wordDocID.iterator();
            }
        });

        JavaPairRDD<String, Iterable<Integer>> wordIDs = strDocID.groupByKey();

        JavaPairRDD<String, Iterable<Integer>> wordIDsSort = wordIDs.sortByKey(true);

        wordIDsSort.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> o) throws Exception {
                System.out.print(o._1 + ":");
                for (Integer id : o._2) {
                    System.out.print(id + ",");
                }
                System.out.println("");
            }
        });

        sc.close();
    }
/* 輸出:
1:1,
2:2,
3:3,
I:3,
This:1,2,3,
a:3,
bilabial:2,
content:1,2,
document:1,2,3,
is:1,2,3,
it:1,2,
long:2,
love:3,
of:1,2,3,
programming:3,
short:1,
the:1,2,3,
very:1,2,
*/

3、N-Gram

N-Gram 是指相鄰N個元素組成的詞組,這裡統計所有詞組出現的次數(本例以字符為單位)。

/**
 * Character-level N-Gram: counts how many times every substring of length N
 * occurs across all input strings.
 *
 * Fixes over the original: raw {@code List}/{@code JavaRDD}/
 * {@code JavaPairRDD} types are parameterized, and the context is closed at
 * the end. Strings shorter than N simply contribute no n-grams (the loop
 * bound already guarantees this).
 */
public static void nGramSimple(){

        // Sample input: three character strings.
        List<String> data = Arrays.asList("abcabc", "abcabc", "bbcabc");
        final int N = 3;

        JavaSparkContext sc = getSC();
        JavaRDD<String> nGramData = sc.parallelize(data);

        // Pipeline: (nGram, 1) -> reduceByKey -> (nGram, count)

        JavaPairRDD<String, Integer> nGram = nGramData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
                List<Tuple2<String, Integer>> pairList = new ArrayList<>();
                // Emit every length-N window of the string.
                for(int index = 0; index < str.length() - N + 1; ++index){
                    pairList.add(new Tuple2<>(str.substring(index, index + N), 1));
                }
                return pairList.iterator();
            }
        });

        JavaPairRDD<String, Integer> nGramCnt = nGram.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        nGramCnt.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> o) throws Exception {
                System.out.println(o._1 + ":"  + o._2);
            }
        });

        sc.close();
    }

4、最常出現的前K個單詞

/**
 * Top-K most frequent words (K == N below): word-counts the input, keeps a
 * size-K min-heap of the best words per partition (mapPartitions), collects
 * the partial winners on the driver, and runs one final size-K min-heap over
 * them.
 *
 * Fixes over the original:
 *  - raw {@code List}/{@code JavaRDD}/{@code JavaPairRDD} types are
 *    parameterized;
 *  - the mapPartitions {@code FlatMapFunction} declared its output element
 *    type as {@code Iterator<TopKKey>} instead of {@code TopKKey};
 *  - TopKKey's {@code $less}/{@code $greater}/{@code $less$eq}/
 *    {@code $greater$eq} always returned false, violating the Ordered
 *    contract — they now delegate to {@code compare()};
 *  - the SparkContext is closed at the end.
 *
 * Note: iterating a PriorityQueue yields no particular order, so the final
 * K winners print unordered.
 */
public static void topKFrequentWords(){

        List<String> data = Arrays.asList("a b c d a a a a", "b b f f e e c b b b", "g h i j k f f f");
        final int N = 3;

        JavaSparkContext sc = getSC();

        // Plan:
        //   1. (word, 1) pairs, then reduceByKey -> (word, count)
        //   2. mapPartitions: size-N min-heap of the best words per partition
        //   3. collect the partial winners and run one last size-N min-heap

        JavaRDD<String> topKData = sc.parallelize(data);

        JavaPairRDD<String, Integer> word = topKData.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(String str) throws Exception {
                List<Tuple2<String, Integer>> wordPair = new ArrayList<>();
                for (String s : str.split(" ")) {
                    wordPair.add(new Tuple2<>(s, 1));
                }
                return wordPair.iterator();
            }
        });

        JavaPairRDD<String, Integer> wordCnt = word.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // (word, count) pair ordered by count; natural order is ascending
        // count, so a plain PriorityQueue of TopKKey is a min-heap.
        class TopKKey implements Ordered<TopKKey>, Serializable{

            private String word;
            private Integer cnt;

            public TopKKey(String word, int cnt) {
                this.word = word;
                this.cnt = cnt;
            }

            public void setWord(String word) {
                this.word = word;
            }

            public void setCnt(Integer cnt) {
                this.cnt = cnt;
            }

            public String getWord() {
                return word;
            }

            public Integer getCnt() {
                return cnt;
            }

            @Override
            public int compareTo(TopKKey that) {
                return this.getCnt().compareTo(that.getCnt());
            }

            // Fixed: was a verbatim copy of compareTo; delegate instead.
            @Override
            public int compare(TopKKey that) {
                return compareTo(that);
            }

            // Fixed: these four returned false unconditionally, breaking the
            // Ordered contract; they now delegate to compare().
            @Override
            public boolean $less(TopKKey that) {
                return compare(that) < 0;
            }

            @Override
            public boolean $greater(TopKKey that) {
                return compare(that) > 0;
            }

            @Override
            public boolean $less$eq(TopKKey that) {
                return compare(that) <= 0;
            }

            @Override
            public boolean $greater$eq(TopKKey that) {
                return compare(that) >= 0;
            }
        }

        // Fixed: output element type is TopKKey (the original declared
        // Iterator<TopKKey>, which does not match the call() signature).
        JavaRDD<TopKKey> topKHeaps = wordCnt.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, TopKKey>() {
            @Override
            public Iterator<TopKKey> call(Iterator<Tuple2<String, Integer>> wordCount) throws Exception {
                // Size-N min-heap: the root is the smallest count kept so far;
                // anything bigger than the root replaces it.
                PriorityQueue<TopKKey> heap = new PriorityQueue<>();
                while(wordCount.hasNext()){
                    Tuple2<String, Integer> t = wordCount.next();
                    TopKKey tk = new TopKKey(t._1, t._2);
                    if(heap.size() < N){
                        heap.add(tk);
                    }else if(tk.compareTo(heap.peek()) > 0){
                        heap.poll();
                        heap.add(tk);
                    }
                }
                return new ArrayList<>(heap).iterator();
            }
        });

        // Merge the per-partition winners on the driver with one more
        // size-N min-heap.
        List<TopKKey> topKValues = topKHeaps.collect();
        PriorityQueue<TopKKey> topKHeap = new PriorityQueue<>();

        for (TopKKey value : topKValues) {
            if(topKHeap.size() < N){
                topKHeap.add(value);
            }else if(value.compareTo(topKHeap.peek()) > 0){
                topKHeap.poll();
                topKHeap.add(value);
            }
        }

        for (TopKKey topKKey : topKHeap) {
            System.out.println(topKKey.getWord() + ":" + topKKey.getCnt());
        }

        sc.close();
    }

5、二次排序

/**
 * Secondary-sort demo: lines of "className score" are wrapped in a composite
 * SecondSortKey so sortByKey orders them by class name, then by score
 * (descending, because sortByKey(false) is used).
 *
 * Fixes over the original: raw {@code List}/{@code JavaRDD}/
 * {@code JavaPairRDD}/{@code Tuple2} types are parameterized and the
 * SparkContext is closed at the end.
 */
public class SecondSortJava {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("wordCountApp").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        List<String> list = Arrays.asList("class1 67","class2 89","class1 78",
                "class2 90","class1 99","class3 34","class3 89");

        JavaRDD<String> rdd = sc.parallelize(list);

        // Wrap each line in a (SecondSortKey, line) pair so sortByKey can
        // order by (first, second).
        JavaPairRDD<SecondSortKey, String> beginSortValues = rdd.mapToPair(new PairFunction<String, SecondSortKey, String>() {
            @Override
            public Tuple2<SecondSortKey, String> call(String line) throws Exception {
                String first = line.split(" ")[0];
                int second = Integer.parseInt(line.split(" ")[1]);
                SecondSortKey secondSortKey = new SecondSortKey(first, second);
                return new Tuple2<>(secondSortKey, line);
            }
        });

        // false -> descending order on the composite key.
        JavaPairRDD<SecondSortKey, String> sortValues = beginSortValues.sortByKey(false);

        sortValues.foreach(new VoidFunction<Tuple2<SecondSortKey, String>>(){
            @Override
            public void call(Tuple2<SecondSortKey, String> o) throws Exception {
                System.out.println(o._2);
            }
        });

        sc.close();
    }

}

// ^ + I 實現接口中的虛擬方法
// Composite key for secondary sort: orders by first (class name), then by
// second (score). Implements scala.math.Ordered so Spark's sortByKey can
// compare keys. (^ + I in the IDE generates the interface method stubs.)
class SecondSortKey implements Ordered<SecondSortKey>, Serializable{

    private String first;
    private int second;

    public SecondSortKey(String first, int second) {
        this.first = first;
        this.second = second;
    }

    // ⌘N generates these setter/getter methods.
    public void setFirst(String first) {
        this.first = first;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    public String getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }

    // Primary order on first; ties broken by second.
    @Override
    public int compareTo(SecondSortKey that) {
        int comp = this.getFirst().compareTo(that.getFirst());
        if(comp == 0){
            return Integer.compare(this.getSecond(), that.getSecond());
        }
        return comp;
    }

    // Fixed: was a verbatim copy of compareTo; delegate to keep a single
    // source of truth for the ordering.
    @Override
    public int compare(SecondSortKey that) {
        return compareTo(that);
    }

    // Fixed: these four previously returned false unconditionally, violating
    // the Ordered contract; they now delegate to compare().
    @Override
    public boolean $less(SecondSortKey that) {
        return compare(that) < 0;
    }

    @Override
    public boolean $greater(SecondSortKey that) {
        return compare(that) > 0;
    }

    @Override
    public boolean $less$eq(SecondSortKey that) {
        return compare(that) <= 0;
    }

    @Override
    public boolean $greater$eq(SecondSortKey that) {
        return compare(that) >= 0;
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章