介紹
貌似WordCount已經成了大數據,分佈式計算的入門標配程序,其實仔細想一下WordCount的例子,它還有很多應用的場景,例如統計過去一段時間網站中各個商品的瀏覽量,最近一段時間相同查詢的數量等.
本文主要討論下Storm如何實現WordCount
Topology結構
WordCount的Topology比較簡單,流程如下
主要代碼
Storm版本
Storm:apache-storm-1.1.1
完整代碼
package com.eric.storm.sample;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import java.util.*;
/*
** WordCountTopolopgyAllInJava類(單詞計數)
*/
public class WordCountTopolopgyAllInJava {
// 定義一個噴頭,用於產生數據。該類繼承自BaseRichSpout
/**
 * Spout that emits one randomly chosen sentence per {@link #nextTuple()} call.
 *
 * <p>Each tuple is emitted with a freshly generated UUID string as its message id, and the
 * emitted values are remembered in {@code pending} under that id. {@link #ack(Object)} drops
 * the remembered entry; {@link #fail(Object)} re-emits it under the same id.
 */
public static class RandomSentenceSpout extends BaseRichSpout {
    /** Fixed pool of sentences to sample from; kept as a constant so it is built only once. */
    private static final String[] SENTENCES = {
        "the cow jumped over the moon",
        "an apple a day keeps the doctor away",
        "four score and seven years ago",
        "snow white and the seven dwarfs",
        "i am at two with nature"
    };

    private SpoutOutputCollector _collector;
    private Random _rand;
    /** Values of every emitted tuple that has not been acked yet, keyed by message id. */
    private Map<String, Values> pending;

    /**
     * Stores the collector and creates the random source and the pending-tuple map.
     *
     * @param conf      topology configuration (unused here)
     * @param context   topology context (unused here)
     * @param collector collector used to emit sentences
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        _rand = new Random();
        pending = new HashMap<String, Values>();
    }

    /** Sleeps briefly, then emits one random sentence tagged with a new UUID message id. */
    @Override
    public void nextTuple() {
        // Throttle the spout so it does not emit in a tight loop.
        Utils.sleep(100);
        String sentence = SENTENCES[_rand.nextInt(SENTENCES.length)];
        Values tmpValues = new Values(sentence);
        String msgID = UUID.randomUUID().toString();
        // Remember the values before emitting so that fail() can replay them.
        pending.put(msgID, tmpValues);
        _collector.emit(tmpValues, msgID);
    }

    /**
     * Called when the tuple with the given message id succeeded; forgets its values.
     *
     * @param id message id that was passed to {@code emit}
     */
    @Override
    public void ack(Object id) {
        System.out.println("Msg:"+id+" send successful!");
        pending.remove(id);
    }

    /**
     * Called when the tuple with the given message id failed; replays the remembered values
     * under the same id. The entry stays in {@code pending}, so a tuple that keeps failing is
     * replayed every time (there is no retry limit).
     *
     * @param id message id that was passed to {@code emit}
     */
    @Override
    public void fail(Object id) {
        Values failedMsg = pending.get(id);
        if (failedMsg == null) {
            // Nothing is remembered for this id, so there is nothing to replay; emitting
            // null values would only raise an exception inside the spout.
            System.out.println("Msg:"+id+" send failed,but no pending values were found to resend!");
            return;
        }
        System.out.println("Msg:"+id+" send failed,will try again!");
        _collector.emit(failedMsg, id);
    }

    /**
     * Declares a single output field.
     *
     * <p>The field is named {@code "word"} although it carries a whole sentence; the name is
     * kept unchanged so that any consumer that looks the field up by name keeps working.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
// 定義個Bolt,用於將句子切分爲單詞
/**
 * Bolt that splits each incoming sentence into whitespace-separated words and emits one
 * tuple per word.
 *
 * <p>Every word tuple is anchored to the sentence tuple it came from, so that a failure
 * further downstream is attributed to the original spout tuple and the spout is only acked
 * once all derived word tuples have been processed.
 */
public static class SplitSentence extends BaseRichBolt {
    private OutputCollector collector;

    /** Declares the single output field {@code "word"}. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }

    /**
     * Stores the collector for later use in {@link #execute(Tuple)}.
     *
     * @param map             topology configuration (unused here)
     * @param topologyContext topology context (unused here)
     * @param outputCollector collector used to emit words and ack input tuples
     */
    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        collector = outputCollector;
    }

    /**
     * Tokenizes the sentence in field 0, emits each token anchored to the input tuple, then
     * acks the input tuple.
     *
     * @param tuple input tuple whose first field is the sentence
     */
    @Override
    public void execute(Tuple tuple) {
        String sentence = tuple.getString(0);
        StringTokenizer iter = new StringTokenizer(sentence);
        while (iter.hasMoreTokens()) {
            // Anchor to the input tuple: an unanchored emit would detach the word from the
            // spout tuple, and downstream failures would never reach the spout's fail().
            collector.emit(tuple, new Values(iter.nextToken()));
        }
        collector.ack(tuple);
    }
}
// 定義一個Bolt,用於單詞計數
/**
 * Bolt that keeps a running count per word and, for every incoming word, emits the word
 * together with its updated count.
 */
public static class WordCount extends BaseBasicBolt {
    /** Running totals of the words seen by this bolt instance. */
    Map<String, Long> counts = new HashMap<String, Long>();

    /** Declares the two output fields {@code "word"} and {@code "count"}. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }

    /**
     * Increments the count of the word in field 0 and emits {@code (word, newCount)}.
     *
     * @param tuple     input tuple whose first field is the word
     * @param collector collector used to emit the updated count
     */
    @Override
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        final String token = tuple.getString(0);
        final Long seenSoFar = counts.get(token);
        // A word that has not been seen before starts at 1.
        final long updated = (seenSoFar == null) ? 1L : seenSoFar + 1L;
        counts.put(token, updated);
        collector.emit(new Values(token, updated));
    }
}
//定義全局Bolt,用於統計最終結果以及所有的單詞數統計
/**
 * Terminal bolt that remembers the latest count reported for each word and prints all of
 * them, plus their sum, from {@link #cleanup()}.
 *
 * <p>NOTE(review): the report is only produced if {@code cleanup()} is invoked at shutdown;
 * confirm that this holds for the deployment mode in use before relying on it.
 */
public static class GlobalWordCount extends BaseBasicBolt {
    /** Most recent count received for each word. */
    Map<String, Long> result = new HashMap<String, Long>();

    /** Emits nothing, so no output fields are declared. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    /**
     * Stores the count carried by the tuple, overwriting any earlier count for the same word.
     *
     * @param tuple                input tuple with fields {@code "word"} and {@code "count"}
     * @param basicOutputCollector unused; this bolt does not emit
     */
    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        result.put(tuple.getStringByField("word"), tuple.getLongByField("count"));
    }

    /** Prints one line per word with its last known count, followed by the grand total. */
    @Override
    public void cleanup() {
        System.out.println("---------------------------------Final Result----------------------------------------------");
        long grandTotal = 0;
        for (Map.Entry<String, Long> entry : result.entrySet()) {
            long occurrences = entry.getValue();
            System.out.println("---------------------------------Word:"+entry.getKey()+" Count:"+occurrences);
            grandTotal += occurrences;
        }
        System.out.println("---------------------------------TotalCount:"+grandTotal);
    }
}
/**
 * Builds the word-count topology and runs it in an in-process {@link LocalCluster} for
 * 30 seconds before killing it and shutting the cluster down.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if submitting, waiting for, or killing the topology fails
 */
public static void main(String[] args) throws Exception {
    TopologyBuilder builder = new TopologyBuilder();
    // Spout "spout": source of random sentences, parallelism hint 2.
    builder.setSpout("spout", new RandomSentenceSpout(), 2);
    // Bolt "split": parallelism hint 8, receives sentences from "spout" via shuffle grouping.
    builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
    // Bolt "count": parallelism hint 12, receives words from "split" grouped by the "word"
    // field so that the same word always reaches the same task.
    builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));
    // Bolt "globalcount": receives every (word, count) pair from "count" via global grouping.
    builder.setBolt("globalcount", new GlobalWordCount()).globalGrouping("count");

    Config conf = new Config();
    conf.setDebug(false);
    // NOTE(review): this limits the per-component parallelism to 3, which is lower than the
    // hints given above; confirm that the cap is intended.
    conf.setMaxTaskParallelism(3);

    // Local pseudo-cluster mode. To run on a real cluster instead, submit the topology with
    // StormSubmitter.submitTopology("word-count", conf, builder.createTopology())
    // (requires importing org.apache.storm.StormSubmitter).
    LocalCluster cluster = new LocalCluster();
    try {
        cluster.submitTopology("word-count", conf, builder.createTopology());
        Thread.sleep(30000);
        cluster.killTopology("word-count");
    } finally {
        // Always stop the local cluster, even if submit, sleep or kill threw.
        cluster.shutdown();
    }
}
}
補充說明
運行模式
本文代碼運行模式爲LocalCluster模式,直接在本地運行main函數即可
可靠性的保障機制
關於Storm可靠性保障的詳細機制將在下一篇文章中進行描述,本文只描述基本用法。
以上代碼通過3處代碼實現可靠性。
1. 在RandomSentenceSpout.nextTuple方法的_collector.emit部分,發送消息時,附加一個UUID作爲消息標識
2. 在SplitSentence.execute方法中處理完tuple後,調用collector.ack(tuple)進行一次通知
3. 在RandomSentenceSpout中實現ack與fail方法,用來處理下游bolt處理成功與失敗的情況。
運行結果
運行30s後結果如下:
---------------------------------Final Result----------------------------------------------
---------------------------------Word:away Count:94
---------------------------------Word:ago Count:94
---------------------------------Word:jumped Count:82
---------------------------------Word:seven Count:195
---------------------------------Word:cow Count:82
---------------------------------Word:two Count:95
---------------------------------Word:dwarfs Count:101
---------------------------------Word:years Count:94
---------------------------------Word:score Count:94
---------------------------------Word:apple Count:94
---------------------------------Word:white Count:101
---------------------------------Word:and Count:195
---------------------------------Word:four Count:94
---------------------------------Word:keeps Count:94
---------------------------------Word:day Count:94
---------------------------------Word:over Count:82
---------------------------------Word:a Count:94
---------------------------------Word:nature Count:95
---------------------------------Word:i Count:95
---------------------------------Word:am Count:95
---------------------------------Word:an Count:94
---------------------------------Word:the Count:359
---------------------------------Word:doctor Count:94
---------------------------------Word:with Count:95
---------------------------------Word:moon Count:82
---------------------------------Word:at Count:95
---------------------------------Word:snow Count:101
---------------------------------TotalCount:2984