整合storm-hdfs的過程,其實也就是編寫storm的拓撲結構,然後調用storm-hdfs-0.9.4.jar中的hdfsBolt,通過配置hdfsBolt的一些與hdfs有關的參數,將數據寫入到hdfs中。
配置的參數:
1、RecordFormat:定義字段定界符與記錄定界符,例如以製表符\t分隔字段、以換行符\n分隔每條記錄;
2、SyncPolicy:定義每累積寫入多少個tuple後,將數據同步(hsync)到hdfs一次;
3、FileRotationPolicy:定義寫入的hdfs文件的輪轉策略,你可以以時間輪轉(TimedRotationPolicy)、大小輪轉(FileSizeRotationPolicy)、不輪轉(NoRotationPolicy);
4、FileNameFormat:定義寫入文件的路徑(withPath)和文件名的前後綴(withPrefix、withExtension);
5、withFsUrl:定義hdfs的地址。
hdfsBolt中寫數據的源碼:
- public void execute(Tuple tuple)
- {
- try
- {
- byte[] bytes = this.format.format(tuple); //對每一條數據添加定界符
- synchronized (this.writeLock)
- {
- this.out.write(bytes); //調用輸出流寫數據
- this.offset += bytes.length; //更新寫入文件的當前大小
- if (this.syncPolicy.mark(tuple, this.offset)) //當數據條數滿足所配的條數時,寫入到hdfs
- {
- if ((this.out instanceof HdfsDataOutputStream)) {
- ((HdfsDataOutputStream)this.out).hsync(EnumSet.of(HdfsDataOutputStream.SyncFlag.UPDATE_LENGTH));
- } else {
- this.out.hsync();
- }
- this.syncPolicy.reset();
- }
- }
- this.collector.ack(tuple);
- if (this.rotationPolicy.mark(tuple, this.offset)) //噹噹前文件大小等於所配輪轉文件的大小,則輪轉文件,重建新的寫入文件
- {
- rotateOutputFile();
- this.offset = 0L;
- this.rotationPolicy.reset();
- }
- }
- catch (IOException e)
- {
- LOG.warn("write/sync failed.", e);
- this.collector.fail(tuple);
- }
- }
hdfsBolt每次新建文件的方法:
- Path createOutputFile()
- throws IOException
- {
- Path path = new Path(this.fileNameFormat.getPath(), this.fileNameFormat.getName(this.rotation, System.currentTimeMillis()));
- this.out = this.fs.create(path); //新建一個輸出流(對應一個新的文件)
- return path; //返回路徑
- }
輪轉文件的方法:
- protected void rotateOutputFile()
- throws IOException
- {
- LOG.info("Rotating output file...");
- long start = System.currentTimeMillis();
- synchronized (this.writeLock)
- {
- closeOutputFile(); //關閉前一個文件的輸出流
- this.rotation += 1; //輪轉數加一(這裏的輪轉數會反應到文件名上)
- Path newFile = createOutputFile(); //新建一個文件
- LOG.info("Performing {} file rotation actions.", Integer.valueOf(this.rotationActions.size()));
- for (RotationAction action : this.rotationActions) {
- action.execute(this.fs, this.currentFile);
- }
- this.currentFile = newFile; //更新當前寫入文件的路徑
- }
- long time = System.currentTimeMillis() - start;
- LOG.info("File rotation took {} ms.", Long.valueOf(time));
- }
添加字段定界符源碼:
- public byte[] format(Tuple tuple)
- {
- StringBuilder sb = new StringBuilder();
- Fields fields = this.fields == null ? tuple.getFields() : this.fields;
- int size = fields.size();
- for (int i = 0; i < size; i++)
- {
- sb.append(tuple.getValueByField(fields.get(i)));
- if (i != size - 1) {
- sb.append(this.fieldDelimiter);
- }
- }
- sb.append(this.recordDelimiter); //添加定界符
- return sb.toString().getBytes();
- }
CountSyncPolicy源碼,CountSyncPolicy實現SyncPolicy的接口方法:
- public class CountSyncPolicy
- implements SyncPolicy
- {
- private int count; //配置的每次寫入tuple數量
- private int executeCount = 0; //當前已經執行的tuple的數量
- public CountSyncPolicy(int count)
- {
- this.count = count;
- }
- public boolean mark(Tuple tuple, long offset) //判斷當前寫入輸出流緩存中的數量是否超過每次寫入數量
- {
- this.executeCount += 1;
- return this.executeCount >= this.count;
- }
- public void reset()
- {
- this.executeCount = 0; //重置方法,每次寫入後,執行重置方法歸零
- }
- }
FileSizeRotationPolicy的源碼(FileSizeRotationPolicy實現FileRotationPolicy的接口方法):
- public class FileSizeRotationPolicy
- implements FileRotationPolicy
- {
- private static final Logger LOG = LoggerFactory.getLogger(FileSizeRotationPolicy.class);
- private long maxBytes; //文件寫滿的大小
- public static enum Units
- { //文件切換輪轉的大小單位
- KB(Math.pow(2.0D, 10.0D)), MB(Math.pow(2.0D, 20.0D)), GB(Math.pow(2.0D, 30.0D)), TB(Math.pow(2.0D, 40.0D));
- private long byteCount;
- private Units(long byteCount)
- {
- this.byteCount = byteCount;
- }
- public long getByteCount()
- {
- return this.byteCount;
- }
- }
- private long lastOffset = 0L;
- private long currentBytesWritten = 0L;
- public FileSizeRotationPolicy(float count, Units units)
- {
- this.maxBytes = ((count * (float)units.getByteCount())); //根據切換文件的單位來計算文件寫滿該有的大小
- }
- public boolean mark(Tuple tuple, long offset) //文件是否切換的判斷方法
- {
- long diff = offset - this.lastOffset;
- this.currentBytesWritten += diff;
- this.lastOffset = offset;
- return this.currentBytesWritten >= this.maxBytes;
- }
- public void reset() //重置方法
- {
- this.currentBytesWritten = 0L; //當前文件已寫的大小
- this.lastOffset = 0L; //一次寫入後的offset值
- }
- }
DefaultFileNameFormat的源碼(DefaultFileNameFormat實現FileNameFormat的接口方法):
- public class DefaultFileNameFormat
- implements FileNameFormat
- {
- private String componentId;
- private int taskId; //任務名id
- private String path = "/storm"; //寫入的目錄路徑
- private String prefix = ""; //文件名前綴
- private String extension = ".txt";//文件名後綴
- public DefaultFileNameFormat withPrefix(String prefix)
- {
- this.prefix = prefix;
- return this;
- }
- public DefaultFileNameFormat withExtension(String extension)
- {
- this.extension = extension;
- return this;
- }
- public DefaultFileNameFormat withPath(String path)
- {
- this.path = path;
- return this;
- }
- public void prepare(Map conf, TopologyContext topologyContext)
- {
- this.componentId = topologyContext.getThisComponentId();
- this.taskId = topologyContext.getThisTaskId();
- }
- public String getName(long rotation, long timeStamp) //得到寫入文件的文件名
- {
- return this.prefix + this.componentId + "-" + this.taskId + "-" + rotation + "-" + timeStamp + this.extension;
- }
- public String getPath()
- {
- return this.path;
- }
- }
http://blog.csdn.net/u014039577/article/details/50215913