Pig UDF(LOAD)示例代碼分析

參考文檔: Hadoop: The Definitive Guide, 3rd Edition, Chapter 11 (Pig)

代碼具體地址: tomwhite-hadoop-book-32dae01\ch11\src\main\java\com\hadoopbook\pig

工具類

 

/**
 * An immutable pair of 1-based, inclusive character positions
 * ({@code start}-{@code end}) used to cut a field out of a line of text.
 */
public class Range {
  private final int start;
  private final int end;

  /**
   * Creates a range.
   *
   * @param start 1-based position of the first character of the field
   * @param end 1-based position of the last character of the field (inclusive)
   */
  public Range(int start, int end) {
    this.start = start;
    this.end = end;
  }

  /** Returns the 1-based start position. */
  public int getStart() {
    return start;
  }

  /** Returns the 1-based, inclusive end position. */
  public int getEnd() {
    return end;
  }

  /**
   * Extracts the part of {@code line} covered by this range.
   *
   * <p>Example for the line {@code "abcdefghi"}: range 1-2 yields
   * {@code substring(0, 2)} = {@code "ab"}; range 5-6 yields
   * {@code substring(4, 6)} = {@code "ef"}.
   *
   * @param line the full line of text to cut from
   * @return the characters from position {@code start} to {@code end}
   * @throws IndexOutOfBoundsException if the range does not fit inside
   *     {@code line} (as thrown by {@link String#substring(int, int)})
   */
  public String getSubstring(String line) {
    // Positions are 1-based and inclusive; substring() takes a 0-based
    // start and an exclusive end, so only the start needs shifting.
    return line.substring(start - 1, end);
  }

  @Override
  public int hashCode() {
    return start * 37 + end;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof Range)) {
      return false;
    }
    Range other = (Range) obj;
    return this.start == other.start && this.end == other.end;
  }

  /**
   * Parses a comma-separated list of ranges such as {@code "1-2,5-6"}.
   *
   * @param rangeSpec the specification; an empty string yields an empty list
   * @return one {@code Range} per comma-separated entry, in input order
   * @throws IllegalArgumentException if an entry is not of the form
   *     {@code <start>-<end>} or a bound is not a valid integer
   */
  public static List<Range> parse(String rangeSpec)
      throws IllegalArgumentException {
    if (rangeSpec.length() == 0) {
      return Collections.emptyList();
    }
    List<Range> ranges = new ArrayList<Range>();
    for (String spec : rangeSpec.split(",")) { // e.g. ["1-2", "5-6"]
      String[] bounds = spec.split("-"); // e.g. ["1", "2"]
      // An entry without a dash (or with nothing after it) has fewer than
      // two parts; report it as a bad argument instead of letting the
      // array access below fail with ArrayIndexOutOfBoundsException.
      if (bounds.length < 2) {
        throw new IllegalArgumentException(
            "Invalid range '" + spec + "': expected <start>-<end>");
      }
      try {
        ranges.add(new Range(Integer.parseInt(bounds[0]),
            Integer.parseInt(bounds[1])));
      } catch (NumberFormatException e) {
        // Keep the original exception as the cause for diagnosis.
        throw new IllegalArgumentException(
            "Invalid range '" + spec + "': " + e.getMessage(), e);
      }
    }
    return ranges;
  }

}

 自定義函數

 

/**
 * A Pig load function that cuts fixed column ranges out of each input
 * record's text value and returns them as the fields of a tuple.
 *
 * <p>The ranges are given to the constructor as a specification such as
 * {@code '16-19,88-92,93-93'} and parsed with {@link Range#parse(String)}.
 */
public class CutLoadFunc extends LoadFunc {

  private static final Log LOG = LogFactory.getLog(CutLoadFunc.class);

  // Column ranges to extract from every line, parsed once in the constructor.
  private final List<Range> ranges;
  private final TupleFactory tupleFactory = TupleFactory.getInstance();
  // Supplies the input records; assigned in prepareToRead().
  private RecordReader reader;

  /**
   * @param cutPattern comma-separated ranges, e.g. {@code '16-19,88-92,93-93'}
   * @throws IllegalArgumentException if {@code cutPattern} cannot be parsed
   */
  public CutLoadFunc(String cutPattern) {
    ranges = Range.parse(cutPattern);
  }

  /** Registers {@code location} as the input path of {@code job}. */
  @Override
  public void setLocation(String location, Job job)
      throws IOException {
    FileInputFormat.setInputPaths(job, location);
  }

  /** Returns a new {@code TextInputFormat} as the input format. */
  @Override
  public InputFormat getInputFormat() {
    return new TextInputFormat();
  }

  /** Stores the record reader that {@link #getNext()} reads from. */
  @Override
  public void prepareToRead(RecordReader reader, PigSplit split) {
    this.reader = reader;
  }

  /**
   * Reads the next record and cuts it into one field per configured range.
   *
   * <p>A range whose end lies beyond the end of the line is logged with a
   * warning and skipped; its field in the tuple is never set by this method.
   *
   * @return a tuple with {@code ranges.size()} fields, or {@code null} when
   *     the reader has no more records
   * @throws IOException if reading fails; an interruption is reported as an
   *     {@code ExecException} after the thread's interrupt flag is restored
   */
  @Override
  public Tuple getNext() throws IOException {
    try {
      if (!reader.nextKeyValue()) {
        return null; // end of input
      }
      Text value = (Text) reader.getCurrentValue();
      String line = value.toString(); // the current record as text

      Tuple tuple = tupleFactory.newTuple(ranges.size());
      for (int i = 0; i < ranges.size(); i++) {
        Range range = ranges.get(i);
        if (range.getEnd() > line.length()) {
          // Line too short for this range: warn and leave field i unset.
          LOG.warn(String.format(
              "Range end (%s) is longer than line length (%s)",
              range.getEnd(), line.length()));
          continue;
        }
        // Field i holds the characters of this line covered by range i.
        tuple.set(i, new DataByteArray(range.getSubstring(line)));
      }
      return tuple;
    } catch (InterruptedException e) {
      // Catching InterruptedException clears the interrupt flag; restore it
      // so code further up the stack can still observe the interruption.
      Thread.currentThread().interrupt();
      throw new ExecException(e);
    }
  }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章