介绍
貌似WordCount已经成了大数据、分布式计算的入门标配程序。其实仔细想一下,WordCount的例子还有很多应用场景,例如统计过去一段时间网站中各个商品的浏览量、最近一段时间相同查询的数量等。
本文主要讨论下Storm如何实现WordCount
Topology结构
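WordCount的Topology比较简单,流程如下:spout(RandomSentenceSpout,随机产生句子)→ split(SplitSentence,把句子切分为单词)→ count(WordCount,按单词分组计数)→ globalcount(GlobalWordCount,汇总并输出最终结果)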
主要代码
Storm版本
Storm:apache-storm-1.1.1
完整代码
package com.eric.storm.sample;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import java.util.*;
/*
** WordCountTopolopgyAllInJava类(单词计数)
*/
public class WordCountTopolopgyAllInJava {
/**
 * Spout that produces the input stream: on every {@link #nextTuple()} call it sleeps
 * 100 ms and then emits one sentence picked at random from a fixed list.
 *
 * <p>Each tuple is emitted with a freshly generated UUID as its message id and is kept in
 * {@code pending} until {@link #ack(Object)} is called for that id. When
 * {@link #fail(Object)} is called, the stored tuple is emitted again under the same id.
 * Replay is unbounded: a tuple that keeps failing is re-emitted every time.
 */
public static class RandomSentenceSpout extends BaseRichSpout {
    /** Sentences to choose from; built once instead of on every nextTuple() call. */
    private static final String[] SENTENCES = {
        "the cow jumped over the moon",
        "an apple a day keeps the doctor away",
        "four score and seven years ago",
        "snow white and the seven dwarfs",
        "i am at two with nature"
    };

    private SpoutOutputCollector _collector;
    private Random _rand;
    /** Emitted-but-not-yet-acked tuples, keyed by message id. */
    private Map<String, Values> pending;

    /**
     * Stores the collector and creates the per-instance state (random source, pending map).
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        _rand = new Random();
        pending = new HashMap<String, Values>();
    }

    /**
     * Emits one random sentence, tagged with a unique message id so it can be acked or failed.
     */
    @Override
    public void nextTuple() {
        // Throttle the source: wait before producing the next tuple.
        Utils.sleep(100);
        String sentence = SENTENCES[_rand.nextInt(SENTENCES.length)];
        Values tmpValues = new Values(sentence);
        String msgID = UUID.randomUUID().toString();
        // Remember the payload before emitting so fail() can replay it.
        pending.put(msgID, tmpValues);
        _collector.emit(tmpValues, msgID);
    }

    /**
     * Called when the tuple with the given id was processed successfully; forgets the payload.
     */
    @Override
    public void ack(Object id) {
        System.out.println("Msg:"+id+" send successful!");
        pending.remove(id);
    }

    /**
     * Called when the tuple with the given id failed; re-emits the stored payload under the
     * same id. The entry stays in {@code pending} so it can be replayed again if needed.
     */
    @Override
    public void fail(Object id) {
        Values failedMsg = pending.get(id);
        if (failedMsg == null) {
            // No payload is recorded for this id, so there is nothing to replay;
            // emitting a null tuple must be avoided.
            System.out.println("Msg:"+id+" send failed,no pending tuple to retry!");
            return;
        }
        System.out.println("Msg:"+id+" send failed,will try again!");
        _collector.emit(failedMsg, id);
    }

    /**
     * Declares a single output field. It is named "word" although the value carried is a
     * whole sentence; the name is kept unchanged for compatibility.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
/**
 * Bolt that splits each incoming sentence into words and emits one tuple per word.
 *
 * <p>Words are separated using {@link StringTokenizer}'s default delimiters. Every emitted
 * word is anchored to the input tuple, and the input tuple is acked once all of its words
 * have been emitted.
 */
public static class SplitSentence extends BaseRichBolt {
    private OutputCollector collector;

    /** Declares the single output field "word". */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }

    /** Keeps the collector for use in {@link #execute(Tuple)}. */
    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        collector = outputCollector;
    }

    /**
     * Splits the sentence in field 0 of {@code tuple} and emits each word.
     *
     * @param tuple input tuple whose first field is the sentence
     */
    @Override
    public void execute(Tuple tuple) {
        String sentence = tuple.getString(0);
        StringTokenizer tokens = new StringTokenizer(sentence);
        while (tokens.hasMoreTokens()) {
            // Anchor the word to the input tuple. An unanchored emit would detach the word
            // from the spout tuple, so a downstream failure would never reach the spout's
            // fail() and the sentence would not be replayed.
            collector.emit(tuple, new Values(tokens.nextToken()));
        }
        // Mark this bolt's part of the processing as done.
        collector.ack(tuple);
    }
}
/**
 * Bolt that keeps a running count for every word it receives and, for each input tuple,
 * emits the word together with its updated count.
 *
 * <p>The counts live in an in-memory map owned by this bolt instance.
 */
public static class WordCount extends BaseBasicBolt {
    Map<String, Long> counts = new HashMap<String, Long>();

    /** Declares the two output fields "word" and "count". */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }

    /**
     * Increments the count of the word in field 0 of {@code tuple} and emits (word, count).
     *
     * @param tuple     input tuple whose first field is the word
     * @param collector collector used to emit the updated count
     */
    @Override
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        final String word = tuple.getString(0);
        final Long seenSoFar = counts.get(word);
        // A word seen for the first time starts at 1; otherwise add one to the stored count.
        final Long updated = (seenSoFar == null) ? 1L : seenSoFar + 1L;
        counts.put(word, updated);
        collector.emit(new Values(word, updated));
    }
}
/**
 * Terminal bolt that records the latest count reported for each word and prints the final
 * per-word counts plus their sum when {@link #cleanup()} is invoked.
 *
 * <p>NOTE(review): per the Storm IBolt documentation, cleanup() is only guaranteed to run
 * when a topology is killed in local mode — confirm before relying on it on a real cluster.
 */
public static class GlobalWordCount extends BaseBasicBolt {
    Map<String,Long> result = new HashMap<String, Long>();

    /** Declares no output fields; this bolt does not emit anything. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    /**
     * Stores the "count" field of {@code tuple} under its "word" field, replacing any
     * previously stored count for that word.
     */
    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        result.put(tuple.getStringByField("word"), tuple.getLongByField("count"));
    }

    /** Prints every recorded word with its count, followed by the sum of all counts. */
    @Override
    public void cleanup() {
        System.out.println("---------------------------------Final Result----------------------------------------------");
        long grandTotal = 0;
        for (Map.Entry<String, Long> entry : result.entrySet()) {
            long occurrences = entry.getValue();
            System.out.println("---------------------------------Word:" + entry.getKey() + " Count:" + occurrences);
            grandTotal += occurrences;
        }
        System.out.println("---------------------------------TotalCount:" + grandTotal);
    }
}
/**
 * Builds the word-count topology (spout -> split -> count -> globalcount), runs it on an
 * in-process {@link LocalCluster} for 30 seconds, then kills it and shuts the cluster down.
 *
 * <p>To run on a real cluster instead, submit the same topology with
 * {@code StormSubmitter.submitTopology("word-count", conf, builder.createTopology())}.
 *
 * @param args unused
 * @throws Exception if the topology cannot be submitted or the wait is interrupted
 */
public static void main(String[] args) throws Exception {
    TopologyBuilder builder = new TopologyBuilder();
    // "spout": sentence source, parallelism hint 2.
    builder.setSpout("spout", new RandomSentenceSpout(), 2);
    // "split": parallelism hint 8, reads from "spout" with shuffle grouping.
    builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
    // "count": parallelism hint 12, reads from "split" grouped by the "word" field so that
    // tuples with the same word are routed together.
    builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));
    // "globalcount": reads from "count" with global grouping.
    builder.setBolt("globalcount", new GlobalWordCount()).globalGrouping("count");

    Config conf = new Config();
    conf.setDebug(false);
    // NOTE(review): per the Storm Config documentation this caps per-component parallelism,
    // so the hints of 8 and 12 above are presumably limited to 3 in this run — confirm.
    conf.setMaxTaskParallelism(3);

    // Local pseudo-cluster mode.
    LocalCluster cluster = new LocalCluster();
    try {
        cluster.submitTopology("word-count", conf, builder.createTopology());
        Thread.sleep(30000);
        cluster.killTopology("word-count");
    } finally {
        // Always shut the local cluster down, even if submit/sleep/kill throws.
        cluster.shutdown();
    }
}
}
补充说明
运行模式
本文代码运行模式为LocalCluster模式,直接在本地运行main函数即可
可靠性的保障机制
关于Storm可靠性保障的详细机制将在下一篇文章中进行描述,本文只描述基本用法。
以上代码通过以下3处实现可靠性。
1. 在RandomSentenceSpout.nextTuple方法的_collector.emit部分,发送消息时,附加一个UUID作为消息标识
2. 在SplitSentence.execute方法中处理完tuple后,调用collector.ack(tuple)进行一次通知
3. 在RandomSentenceSpout中实现ack与fail方法,用来处理下游bolt处理成功与失败的情况。
运行结果
运行30s后结果如下:
---------------------------------Final Result----------------------------------------------
---------------------------------Word:away Count:94
---------------------------------Word:ago Count:94
---------------------------------Word:jumped Count:82
---------------------------------Word:seven Count:195
---------------------------------Word:cow Count:82
---------------------------------Word:two Count:95
---------------------------------Word:dwarfs Count:101
---------------------------------Word:years Count:94
---------------------------------Word:score Count:94
---------------------------------Word:apple Count:94
---------------------------------Word:white Count:101
---------------------------------Word:and Count:195
---------------------------------Word:four Count:94
---------------------------------Word:keeps Count:94
---------------------------------Word:day Count:94
---------------------------------Word:over Count:82
---------------------------------Word:a Count:94
---------------------------------Word:nature Count:95
---------------------------------Word:i Count:95
---------------------------------Word:am Count:95
---------------------------------Word:an Count:94
---------------------------------Word:the Count:359
---------------------------------Word:doctor Count:94
---------------------------------Word:with Count:95
---------------------------------Word:moon Count:82
---------------------------------Word:at Count:95
---------------------------------Word:snow Count:101
---------------------------------TotalCount:2984