Storm Introduction and Practice (3): Starting the Storm Programming Journey with WordCount

Introduction

WordCount seems to have become the standard entry-level program for big data and distributed computing. Looking at the example more closely, it actually maps onto many real scenarios, such as counting the page views of each product on a website over a period of time, or counting how many times the same query has been issued recently.

This article focuses on how to implement WordCount with Storm.

Topology Structure

The WordCount topology is fairly simple. The data flows through four components: RandomSentenceSpout (spout) → SplitSentence (split) → WordCount (count) → GlobalWordCount (globalcount).
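As a quick preview, the wiring of these four components with TopologyBuilder (excerpted from the full listing below) looks like this:

TopologyBuilder builder = new TopologyBuilder();
// Spout that emits random sentences, with a parallelism hint of 2
builder.setSpout("spout", new RandomSentenceSpout(), 2);
// Bolt that splits sentences into words; tuples are shuffled across its tasks
builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
// Bolt that counts words; fieldsGrouping routes the same word to the same task
builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));
// Single bolt that receives every count tuple and aggregates the final totals
builder.setBolt("globalcount", new GlobalWordCount()).globalGrouping("count");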

Main Code

Storm Version

Storm: apache-storm-1.1.1

Complete Code

package com.eric.storm.sample;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

import java.util.*;

/*
 * WordCountTopolopgyAllInJava class (word count)
 */
public class WordCountTopolopgyAllInJava {

    // Define a spout that generates data; it extends BaseRichSpout
    public static class RandomSentenceSpout extends BaseRichSpout {
        private SpoutOutputCollector _collector;
        private Random _rand;
        private Map<String,Values> pending;

        @Override
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            _collector = collector;
            _rand = new Random();
            pending=new HashMap<String, Values>();
        }

        @Override
        public void nextTuple() {

            // Sleep briefly before producing the next tuple
            Utils.sleep(100);
            // Pool of sentences
            String[] sentences = new String[]{"the cow jumped over the moon", "an apple a day keeps the doctor away",
                    "four score and seven years ago", "snow white and the seven dwarfs", "i am at two with nature"};
            // Pick a sentence at random
            String sentence = sentences[_rand.nextInt(sentences.length)];
            Values tmpValues=new Values(sentence);
            String msgID=UUID.randomUUID().toString();
            pending.put(msgID,tmpValues);
            // Emit the sentence to the bolts; each tuple carries a unique message id
            _collector.emit(tmpValues,msgID);
        }

        // Ack callback: when a tuple is processed successfully, its id is removed from the pending map
        @Override
        public void ack(Object id) {
            System.out.println("Msg:"+id+" send successful!");
            pending.remove(id);

        }

        // Fail callback: when processing fails, re-emit the tuple
        @Override
        public void fail(Object id) {
            System.out.println("Msg:"+id+" send failed,will try again!");
            Values failedMsg=pending.get(id);
            _collector.emit(failedMsg,id);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // Declare a single output field named "word" (each tuple actually carries a whole sentence)
            declarer.declare(new Fields("word"));
        }
    }

    // Define a bolt that splits sentences into words
    public static class SplitSentence extends BaseRichBolt {
        private OutputCollector collector;

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // Declare a single output field named "word"
            declarer.declare(new Fields("word"));
        }

        @Override
        public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
            collector=outputCollector;
        }

        @Override
        public void execute(Tuple tuple) {
            // Receive a sentence
            String sentence = tuple.getString(0);
            // Split the sentence into words
            StringTokenizer iter = new StringTokenizer(sentence);
            // Emit each word
            while (iter.hasMoreElements()) {
                collector.emit(new Values(iter.nextToken()));
            }
            // Acknowledge that the tuple has been processed
            collector.ack(tuple);
        }
    }



    // Define a bolt that counts words
    public static class WordCount extends BaseBasicBolt {
        Map<String, Long> counts = new HashMap<String, Long>();


        @Override
        public void execute(Tuple tuple, BasicOutputCollector collector) {
            // Receive a word
            String word = tuple.getString(0);
            // Look up the current count for this word
            Long count = counts.get(word);
            if (count == null)
                count = 0L;
            // Increment the count
            count++;
            // Store the word and its updated count back into the map
            counts.put(word, count);
            collector.emit(new Values(word, count));
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // Declare two output fields: word and count
            declarer.declare(new Fields("word", "count"));
        }
    }

    // Define a global bolt that collects the final results and the overall word count
    public static class GlobalWordCount extends BaseBasicBolt{
        Map<String,Long> result=new HashMap<String, Long>();
        @Override
        public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
            String word=tuple.getStringByField("word");
            Long count=tuple.getLongByField("count");
            result.put(word,count);
        }

        @Override
        public void cleanup(){
            System.out.println("---------------------------------Final Result----------------------------------------------");
            long totalCount=0;
            for (String key:result.keySet()){
                long count=result.get(key);
                System.out.println("---------------------------------Word:"+key+"  Count:"+count);
                totalCount+=count;
            }
            System.out.println("---------------------------------TotalCount:"+totalCount);

        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {

        }
    }

    public static void main(String[] args) throws Exception {
        // Build the topology
        TopologyBuilder builder = new TopologyBuilder();
        // Set the spout, named "spout", with a parallelism hint of 2
        builder.setSpout("spout", new RandomSentenceSpout(), 2);
        // Set the "split" bolt with a parallelism hint of 8; its input is shuffled from the spout
        builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
        // Set the "count" bolt with a parallelism hint of 12; its input is the word field from split
        builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));
        // Set the "globalcount" bolt; it receives every tuple from count via globalGrouping
        builder.setBolt("globalcount",new GlobalWordCount()).globalGrouping("count");

        Config conf = new Config();
        conf.setDebug(false);

        // To submit to a real cluster instead, use StormSubmitter (requires packaging the topology as a jar):
        // if (args != null && args.length > 0) {
        //     conf.setNumWorkers(3);
        //     StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
        // } else {
        conf.setMaxTaskParallelism(3);

        // Run on a local pseudo-cluster
        LocalCluster cluster = new LocalCluster();
        // To run on a real cluster instead:
        // StormSubmitter.submitTopology("word-count", conf, builder.createTopology());
        // Submit the topology to the local pseudo-cluster (the topology is named word-count)
        cluster.submitTopology("word-count", conf, builder.createTopology());
        Thread.sleep(30000);
        cluster.killTopology("word-count");
        cluster.shutdown();
        // }
    }
}

Additional Notes

Run Mode

The code in this article runs in LocalCluster mode; simply run the main method locally.
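If you want to submit the same topology to a real cluster instead, a minimal sketch of an args-driven switch between LocalCluster and StormSubmitter is shown below. It follows the commented-out branch in the listing above and reuses the inner classes of WordCountTopolopgyAllInJava; the class name WordCountRunner and the worker count are illustrative:

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

public class WordCountRunner {
    public static void main(String[] args) throws Exception {
        // Reuse the spout and bolts defined in WordCountTopolopgyAllInJava above
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("spout", new WordCountTopolopgyAllInJava.RandomSentenceSpout(), 2);
        builder.setBolt("split", new WordCountTopolopgyAllInJava.SplitSentence(), 8).shuffleGrouping("spout");
        builder.setBolt("count", new WordCountTopolopgyAllInJava.WordCount(), 12).fieldsGrouping("split", new Fields("word"));
        builder.setBolt("globalcount", new WordCountTopolopgyAllInJava.GlobalWordCount()).globalGrouping("count");

        Config conf = new Config();
        if (args != null && args.length > 0) {
            // Cluster mode: the topology name is taken from args[0];
            // the jar must be submitted via the storm jar command
            conf.setNumWorkers(3);
            StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
        } else {
            // Local mode: run an in-process pseudo-cluster for 30 seconds, then shut it down
            conf.setMaxTaskParallelism(3);
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("word-count", conf, builder.createTopology());
            Thread.sleep(30000);
            cluster.killTopology("word-count");
            cluster.shutdown();
        }
    }
}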

Reliability Guarantees

The detailed mechanism behind Storm's reliability guarantees will be described in the next article; this post only shows the basic usage.
The code above implements reliability in four places:
1. In RandomSentenceSpout.nextTuple, the _collector.emit call attaches a UUID as the message id of each emitted tuple.
2. In SplitSentence.execute, collector.ack(tuple) is called once the tuple has been processed (see the anchored-emit sketch after this list).
3. RandomSentenceSpout implements ack and fail to handle the cases where downstream processing succeeds or fails.
4. WordCount and GlobalWordCount extend BaseBasicBolt, which acks each input tuple automatically after execute returns.
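As a side note on point 2 (the next article covers this in depth): Storm can only track the complete tuple tree when a bolt anchors the tuples it emits to the input tuple it received. A minimal sketch of an anchored variant of the split bolt is shown below; the class name AnchoredSplitSentence is illustrative, and the only behavioral difference from SplitSentence above is that emit receives the input tuple as its first argument:

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.Map;

// Anchored variant of the split bolt (sketch)
public class AnchoredSplitSentence extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple tuple) {
        String sentence = tuple.getString(0);
        for (String word : sentence.split("\\s+")) {
            // Passing the input tuple as the first argument anchors the emitted word to it,
            // so a failure anywhere downstream will eventually trigger the spout's fail() callback
            collector.emit(tuple, new Values(word));
        }
        // Acknowledge the input tuple once all words have been emitted
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}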

Run Result

After running for 30 seconds, the output is as follows:

---------------------------------Final Result----------------------------------------------
---------------------------------Word:away  Count:94
---------------------------------Word:ago  Count:94
---------------------------------Word:jumped  Count:82
---------------------------------Word:seven  Count:195
---------------------------------Word:cow  Count:82
---------------------------------Word:two  Count:95
---------------------------------Word:dwarfs  Count:101
---------------------------------Word:years  Count:94
---------------------------------Word:score  Count:94
---------------------------------Word:apple  Count:94
---------------------------------Word:white  Count:101
---------------------------------Word:and  Count:195
---------------------------------Word:four  Count:94
---------------------------------Word:keeps  Count:94
---------------------------------Word:day  Count:94
---------------------------------Word:over  Count:82
---------------------------------Word:a  Count:94
---------------------------------Word:nature  Count:95
---------------------------------Word:i  Count:95
---------------------------------Word:am  Count:95
---------------------------------Word:an  Count:94
---------------------------------Word:the  Count:359
---------------------------------Word:doctor  Count:94
---------------------------------Word:with  Count:95
---------------------------------Word:moon  Count:82
---------------------------------Word:at  Count:95
---------------------------------Word:snow  Count:101
---------------------------------TotalCount:2984