readAllDataStreams — RecordReaderImpl.java
/**
 * Read the entire data section of the given stripe into memory in one pass,
 * then materialize the individual streams from the buffered bytes.
 *
 * @param stripe the stripe whose data section should be loaded
 * @throws IOException if the underlying file read fails
 */
private void readAllDataStreams(StripeInformation stripe) throws IOException {
  // The data section begins right after the index section within the stripe,
  // so its stripe-relative start offset equals the index length.
  long dataStart = stripe.getIndexLength();
  long dataEnd = dataStart + stripe.getDataLength();
  // A single range spanning the whole data section: explicitly trigger 1 big read.
  DiskRangeList wholeDataSection = new DiskRangeList(dataStart, dataEnd);
  // Pull the bytes into memory; offsets in the range are relative to the
  // stripe start, so the stripe's file offset is passed as the base.
  bufferChunks = dataReader.readFileData(wholeDataSection, stripe.getOffset(), false);
  createStreams(stripeFooter.getStreamsList(), bufferChunks, null,
      dataReader.getCompressionCodec(), bufferSize, streams);
}
stripe.getIndexLength()
注意:這裏返回的是 index 部分佔用的字節(bytes)數,而不是條目個數
/**
 * Get the length of the stripe's indexes. The index streams precede the
 * data section within the stripe, so this value is also the stripe-relative
 * start offset of the data section.
 * @return the number of bytes occupied by the indexes
 */
long getIndexLength();
stripe.getDataLength()
/**
 * Get the length of the stripe's data section (the bytes following the
 * indexes within the stripe).
 * @return the number of bytes of data in the stripe
 */
long getDataLength();
stripe.getOffset()
/**
 * Get the byte offset of the start of the stripe.
 * @return the number of bytes from the start of the file to the stripe
 */
long getOffset();
readFileData — RecordReaderUtils.java
這裏的 baseOffset 即爲 stripe.getOffset(),也就是 stripe 在文件中的起始字節偏移
@Override
public DiskRangeList readFileData(
    DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
  // Thin delegation; baseOffset is the stripe's byte offset within the file,
  // and the range offsets are relative to it.
  DiskRangeList populated =
      RecordReaderUtils.readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
  return populated;
}
readDiskRanges
RecordReaderUtils.java
/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param zcr zero-copy reader shim; commonly null, in which case a plain
 *            readFully into a byte array is used instead
 * @param base the base of the stripe (its byte offset within the file);
 *             range offsets are relative to it
 * @param range the disk ranges within the stripe to read
 * @param doForceDirect if true, copy the bytes read into a direct ByteBuffer
 * @return the bytes read for each disk range, which is the same length as
 *    ranges
 * @throws IOException
 */
static DiskRangeList readDiskRanges(FSDataInputStream file,
    HadoopShims.ZeroCopyReaderShim zcr,
    long base,
    DiskRangeList range,
    boolean doForceDirect) throws IOException {
  if (range == null) return null;
  DiskRangeList prev = range.prev;
  if (prev == null) {
    // Insert a synthetic head node in front of the list so that prev.next is
    // still valid even if the first node is replaced below.
    prev = new DiskRangeList.MutateHelper(range);
  }
  while (range != null) {
    if (range.hasData()) {
      // This range is already populated; skip it.
      range = range.next;
      continue;
    }
    // Number of bytes covered by this range.
    int len = (int) (range.getEnd() - range.getOffset());
    long off = range.getOffset();
    if (zcr != null) {
      // Zero-copy path: the shim may hand back the bytes as several partial
      // buffers; the first one replaces the current node, and each subsequent
      // one is appended after it.
      file.seek(base + off);
      boolean hasReplaced = false;
      while (len > 0) {
        ByteBuffer partial = zcr.readBuffer(len, false);
        BufferChunk bc = new BufferChunk(partial, off);
        if (!hasReplaced) {
          range.replaceSelfWith(bc);
          hasReplaced = true;
        } else {
          range.insertAfter(bc);
        }
        range = bc;
        int read = partial.remaining();
        len -= read;
        off += read;
      }
    } else {
      // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
      // Buffer that will hold all bytes of this range.
      byte[] buffer = new byte[len];
      file.readFully((base + off), buffer, 0, buffer.length);
      // Wrap (or copy into a direct buffer) and splice the populated
      // BufferChunk into the list in place of the current node.
      ByteBuffer bb = null;
      if (doForceDirect) {
        bb = ByteBuffer.allocateDirect(len);
        bb.put(buffer);
        bb.position(0);
        bb.limit(len);
      } else {
        bb = ByteBuffer.wrap(buffer);
      }
      // BufferChunk extends DiskRangeList and carries the bytes just read.
      range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
    }
    range = range.next;
  }
  // Skip the (possibly synthetic) head and return the populated list.
  return prev.next;
}
BufferChunk — BufferChunk.java
BufferChunk 比 DiskRangeList 多了一個用於保存已讀入數據的 ByteBuffer 字段
/**
* The sections of stripe that we have read.
* This might not match diskRange - 1 disk range can be multiple buffer chunks,
* depending on DFS block boundaries.
*/
public class BufferChunk extends DiskRangeList {
private static final Logger LOG =
LoggerFactory.getLogger(BufferChunk.class);
final ByteBuffer chunk;
/**
 * Create a chunk covering [offset, offset + chunk.remaining()).
 * @param chunk the buffer holding this chunk's bytes
 * @param offset the stripe-relative byte offset where the chunk starts
 */
public BufferChunk(ByteBuffer chunk, long offset) {
  // The range's end is derived from the buffer's remaining byte count.
  super(offset, offset + chunk.remaining());
  this.chunk = chunk;
}
/**
 * @return the buffer holding this chunk's bytes
 */
public ByteBuffer getChunk() {
  return chunk;
}
/**
 * @return true iff this range has been populated with a buffer
 */
@Override
public boolean hasData() {
  return chunk != null;
}