DFSClient.RemoteBlockReader.newBlockReader()
/**
 * Sends an OP_READ_BLOCK request over an already-connected socket and, once the
 * remote side reports success, wraps the response stream in a {@code RemoteBlockReader}.
 *
 * <p>Request header written, in order: protocol version (short), opcode (byte),
 * block id, generation stamp, start offset, length (all long), client name, access token.
 * Response header read, in order: status (short), checksum header, first chunk offset (long).
 *
 * @param sock           connected socket to the serving node; not closed by this method
 * @param file           file name, used for error messages and passed to the reader
 * @param blockId        id of the block to read
 * @param accessToken    block access token, serialized at the end of the request header
 * @param genStamp       generation stamp of the block
 * @param startOffset    offset inside the block at which the caller wants to start reading
 * @param len            number of bytes requested
 * @param bufferSize     size of the buffer placed in front of the socket input stream
 * @param verifyChecksum passed through to the reader
 * @param clientName     name of the requesting client, sent in the header
 * @return a reader positioned after the response header
 * @throws IOException if the remote side answers with a non-success status, if the
 *                     announced first chunk offset is inconsistent with {@code startOffset},
 *                     or on any stream error
 */
public static BlockReader newBlockReader(Socket sock, String file, long blockId,
    Token<BlockTokenIdentifier> accessToken, long genStamp, long startOffset, long len,
    int bufferSize, boolean verifyChecksum, String clientName) throws IOException {
  DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
      NetUtils.getOutputStream(sock, HdfsConstants.WRITE_TIMEOUT)));

  // Write the request header.
  out.writeShort(DataTransferProtocol.DATA_TRANSFER_VERSION); // (1) protocol version
  out.write(DataTransferProtocol.OP_READ_BLOCK);              // (2) opcode
  out.writeLong(blockId);                                     // (3) block id
  out.writeLong(genStamp);                                    // (4) generation stamp
  out.writeLong(startOffset);                                 // (5) start offset
  out.writeLong(len);                                         // (6) requested length
  Text.writeString(out, clientName);                          // (7) client name
  accessToken.write(out);                                     // (8) access token
  out.flush();

  // Read the response header from the same socket.
  DataInputStream in = new DataInputStream(
      new BufferedInputStream(NetUtils.getInputStream(sock), bufferSize));

  // [1] Operation status. Anything but success means no checksum header or data follows,
  // so fail here instead of trying to parse the rest of the stream.
  short status = in.readShort();
  if (status != DataTransferProtocol.OP_STATUS_SUCCESS) {
    throw new IOException("Got error in response to OP_READ_BLOCK (status " + status + ")"
        + " self=" + sock.getLocalSocketAddress()
        + ", remote=" + sock.getRemoteSocketAddress()
        + " for file " + file
        + " for block " + blockId + "_" + genStamp);
  }

  // [2-1] Checksum header, [2-2] offset of the first chunk that will be sent.
  DataChecksum checksum = DataChecksum.newDataChecksum(in);
  long firstChunkOffset = in.readLong();

  // NOTE(review): the third clause is already implied by "firstChunkOffset > startOffset";
  // it looks like a lower bound (startOffset - bytesPerChecksum) was intended. Left as-is
  // pending confirmation against the sender's alignment rule.
  if (firstChunkOffset < 0 || firstChunkOffset > startOffset
      || firstChunkOffset >= (startOffset + checksum.getBytesPerChecksum())) {
    throw new IOException("BlockReader: error in first chunk offset (" + firstChunkOffset + ") startOffset is " + startOffset + " for file " + file);
  }
  return new RemoteBlockReader(file, blockId, in, checksum, verifyChecksum, startOffset, firstChunkOffset, sock);
}
DataXceiver.readBlock()
/**
 * Serves a read-block request: parses the rest of the request header, streams the
 * requested range of the local block to the requester, and optionally records the
 * requester's checksum acknowledgement.
 *
 * @param in request stream; per the original note, fields (1) version and (2) opcode
 *           were already consumed by the dispatching code before this method is called
 * @throws IOException if the header cannot be read, the {@code BlockSender} cannot be
 *                     created, or sending fails with anything other than a
 *                     {@code SocketException}
 */
private void readBlock(DataInputStream in) throws IOException {
  // 1. Read the remaining request header fields, in wire order.
  long blockId = in.readLong();                        // (3) block id
  Block block = new Block(blockId, 0, in.readLong());  // (4) generation stamp
  long startOffset = in.readLong();                    // (5) offset to start reading at
  long length = in.readLong();                         // (6) number of bytes requested
  String clientName = Text.readString(in);             // (7) client name; must be consumed to reach the token
  Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>();
  accessToken.readFields(in);                          // (8) access token

  OutputStream baseStream = NetUtils.getOutputStream(s, datanode.socketWriteTimeout);
  DataOutputStream out = new DataOutputStream(new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));
  // Token verification is not part of this excerpt; the original note says an invalid
  // token is answered with out.writeShort(OP_STATUS_ERROR_ACCESS_TOKEN).

  // 2. Send the block.
  BlockSender blockSender = null;
  try {
    try {
      blockSender = new BlockSender(block, startOffset, length, true, true, false, datanode, clientTraceFmt);
    } catch (IOException e) {
      // [1] Tell the requester the operation failed, then propagate. The status is
      // flushed when 'out' is closed in the finally block below.
      out.writeShort(DataTransferProtocol.OP_STATUS_ERROR);
      throw e;
    }

    out.writeShort(DataTransferProtocol.OP_STATUS_SUCCESS); // [1] operation status
    blockSender.sendBlock(out, baseStream, null);           // [2] checksum header + offset + packets

    if (blockSender.isBlockReadFully()) {
      // 3. The requester may report that its own checksum verification succeeded.
      // This response is optional: the block has already been sent, so a missing
      // acknowledgement must not turn a completed read into a failure.
      try {
        if (in.readShort() == DataTransferProtocol.OP_STATUS_CHECKSUM_OK && datanode.blockScanner != null) {
          datanode.blockScanner.verifiedByClient(block);
        }
      } catch (IOException ignored) {
        // No acknowledgement was sent (stream closed or read failed); nothing to record.
      }
    }
  } catch (SocketException ignored) {
    // It is ok for the remote side to close the connection at any time.
  } finally {
    IOUtils.closeStream(out);
    IOUtils.closeStream(blockSender);
  }
}
BlockSender.sendBlock()
/**
 * Reads a block and its checksum metadata and streams both to the given output,
 * which may belong to a client or to another datanode.
 *
 * <p>Wire order produced here: checksum header, optionally the starting offset,
 * a sequence of packets (each built by {@code sendChunks}), and finally an int 0
 * marking the end of the block.
 *
 * @param out        stream the block is written to
 * @param baseStream optional; if non-null, {@code out} is assumed to be a wrapper over
 *                   this stream. This enables optimizations for sending the data, e.g.
 *                   SocketOutputStream#transferToFully(FileChannel, long, int).
 * @param throttler  throttler for sending data; stored on this sender
 * @return total bytes read, including checksum bytes
 * @throws IOException on any I/O failure; a RuntimeException raised while sending is
 *                     rethrown wrapped in an IOException
 */
long sendBlock(DataOutputStream out, OutputStream baseStream, BlockTransferThrottler throttler) throws IOException {
  this.throttler = throttler;
  initialOffset = offset;
  lastCacheDropOffset = initialOffset;

  if (isLongRead() && blockInFd != null) {
    // Advise that this file descriptor will be accessed sequentially.
    NativeIO.posixFadviseIfPossible(blockInFd, 0, 0, NativeIO.POSIX_FADV_SEQUENTIAL);
  }
  // Trigger readahead of beginning of file if configured.
  manageOsCache();

  long bytesSent = 0;
  try {
    // Step 1: checksum header, plus the starting offset when the receiver expects it.
    try {
      checksum.writeHeader(out);
      if (chunkOffsetOK) {
        out.writeLong(offset);
      }
      out.flush();
    } catch (IOException e) {
      // Failures while writing to the peer are reported as socket errors.
      throw ioeToSocketException(e);
    }

    // Step 2: choose the transfer mode and size the packet buffer accordingly.
    final boolean useTransferTo = transferToAllowed
        && !verifyChecksum
        && baseStream instanceof SocketOutputStream
        && blockIn instanceof FileInputStream;

    final OutputStream chunkSink;
    final int chunksPerPacket;
    if (useTransferTo) {
      // A non-negative blockInPosition also tells sendChunks() to use transferTo.
      blockInPosition = ((FileInputStream) blockIn).getChannel().position();
      chunkSink = baseStream;
      // Assure a minimum buffer size.
      chunksPerPacket =
          (Math.max(BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO) + bytesPerChecksum - 1) / bytesPerChecksum;
    } else {
      chunkSink = out;
      chunksPerPacket = Math.max(1, (BUFFER_SIZE + bytesPerChecksum - 1) / bytesPerChecksum);
    }

    // Header + length field + (data + checksum) per chunk. The buffer has to be able to
    // hold a normal transfer in the case of recomputing checksums.
    int packetCapacity = DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER
        + (bytesPerChecksum + checksumSize) * chunksPerPacket;
    ByteBuffer packet = ByteBuffer.allocate(packetCapacity);

    // Step 3: emit packets until the requested range is exhausted.
    while (offset < endOffset) {
      manageOsCache();
      long dataBytes = sendChunks(packet, chunksPerPacket, chunkSink);
      offset += dataBytes;
      long chunksInPacket = (dataBytes + bytesPerChecksum - 1) / bytesPerChecksum;
      bytesSent += dataBytes + chunksInPacket * checksumSize;
      seqno++;
    }

    // Step 4: a zero int marks the end of the block.
    out.writeInt(0);
    out.flush();
  } catch (RuntimeException e) {
    throw new IOException("unexpected runtime exception", e);
  } finally {
    close();
  }

  blockReadFully = (initialOffset == 0 && offset >= blockLength);
  return bytesSent;
}
BlockSender.sendChunks()
/**
 * Sends up to maxChunks chunks of data as a single packet.
 *
 * Packet layout assembled in pkt: header (packet length, offset, sequence number,
 * last-packet flag, data length), then the checksums, then the data.
 *
 * @param pkt       reusable packet buffer allocated by sendBlock(); cleared on every call
 * @param maxChunks upper bound on the number of checksum chunks placed in this packet
 * @param out       destination stream; cast to SocketOutputStream on the transferTo path
 * @return number of data bytes placed in the packet (checksum bytes not included),
 *         or 0 when nothing remains between offset and endOffset
 * @throws IOException on read or write failure; failures on the transferTo path are
 *                     converted with ioeToSocketException
 */
private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out) throws IOException {
// 1. Compute the number of data bytes for this packet.
// Sends multiple chunks in one packet with a single write().
int len = (int) Math.min(endOffset - offset, (((long) bytesPerChecksum) * ((long) maxChunks)));
// truncate len so that any partial chunks will be sent as a final packet. this is not necessary for correctness,
// but partial chunks are ones that may be recomputed and sent via buffer copy, so try to minimize those bytes
if (len > bytesPerChecksum && len % bytesPerChecksum != 0) {
len -= len % bytesPerChecksum;
}
if (len == 0) return 0;
int numChunks = (len + bytesPerChecksum - 1)/bytesPerChecksum; // number of checksum chunks covering len bytes (rounded up)
int packetLen = len + numChunks*checksumSize + 4; // data bytes + checksum bytes + 4 for the data-length int written as field (5) below
pkt.clear(); // pkt is reused across calls, so reset position/limit before filling it
// packetLen counts from the data-length field onward: that 4-byte field, the checksums and the data.
// The header fields (1)-(5) written below occupy 4 + 8 + 8 + 1 + 4 = 25 bytes of the buffer.
// 2. Write the packet header into the buffer.
pkt.putInt(packetLen); // (1) packet length
pkt.putLong(offset); // (2) offset within the block where this packet's data starts
pkt.putLong(seqno); // (3) packet sequence number
pkt.put((byte)((offset + len >= endOffset) ? 1 : 0)); // (4) flag: 1 when this packet reaches endOffset (last packet), else 0
pkt.putInt(len); // (5) number of data bytes in this packet
// 3. Place the checksums in the buffer. Nothing has been sent yet; bytes only reach the peer when buf is written to out.
int checksumOff = pkt.position(); // checksums start right after the header, so they precede the data on the wire
int checksumLen = numChunks * checksumSize; // total checksum bytes for this packet
byte[] buf = pkt.array(); // backing array of pkt; checksums and data are staged directly in it
if (checksumSize > 0 && checksumIn != null) { // (6) read this packet's checksums from the checksum stream
checksumIn.readFully(buf, checksumOff, checksumLen); // fills buf[checksumOff, checksumOff + checksumLen)
} // when this is skipped the checksum region keeps whatever bytes buf already held
int dataOff = checksumOff + checksumLen; // data starts immediately after the checksum region
if (blockInPosition < 0) { // negative means normal buffered transfer; sendBlock() sets it to the channel position when transferTo is used
// 4. Normal transfer: read len data bytes from blockIn into buf starting at dataOff.
IOUtils.readFully(blockIn, buf, dataOff, len); // data region directly follows the checksum region
// 5. Optionally verify the data just read against the checksums read above.
if (verifyChecksum) {
int dOff = dataOff; // start of the chunk currently being verified
int cOff = checksumOff; // start of the stored checksum for that chunk
int dLeft = len; // data bytes not yet verified; the final chunk may be shorter than bytesPerChecksum
for (int i=0; i<numChunks; i++) { // verify each chunk in turn
checksum.reset();
int dLen = Math.min(dLeft, bytesPerChecksum); // length of this chunk (short for a trailing partial chunk)
checksum.update(buf, dOff, dLen); // compute the checksum over the chunk's data
if (!checksum.compare(buf, cOff)) { // compare with the stored checksum at cOff
throw new ChecksumException("Checksum failed at " + (offset + len - dLeft), len);
}
dLeft -= dLen; // remaining unverified bytes
dOff += dLen; // advance to the next chunk's data
cOff += checksumSize; // advance to the next chunk's stored checksum
}
}
// only recompute checksum if we can't trust the meta data due to concurrent writes
if (memoizedBlock.hasBlockChanged(len)) {ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len, checksum);}
// 6. Write header + checksums + data to the output stream in one call.
out.write(buf, 0, dataOff + len); // (7) bytes [0, dataOff + len) of buf form the whole packet
} else { // transferTo path, selected by sendBlock() via a non-negative blockInPosition
try { // use transferTo(). Checks on out and blockIn are already done.
// 4. Only the data region is handed to transferTo; header and checksums are still written from buf.
SocketOutputStream sockOut = (SocketOutputStream) out; // type was checked in sendBlock() (via instanceof on baseStream)
FileChannel fileChannel = ((FileInputStream) blockIn).getChannel(); // type was checked in sendBlock() (via instanceof on blockIn)
if (memoizedBlock.hasBlockChanged(len)) {
// Block changed underneath us: fall back to a buffered read so checksums can be recomputed.
fileChannel.position(blockInPosition);
IOUtils.readFileChannelFully(fileChannel, buf, dataOff, len);
ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len, checksum);
sockOut.write(buf, 0, dataOff + len); // (7) whole packet from buf
} else {
// 5. First write the header and checksums staged in buf to the socket stream.
sockOut.write(buf, 0, dataOff); // (7) first write the packet header and checksums
// 6. Then send len data bytes from the file channel starting at blockInPosition,
// without staging them in buf first.
sockOut.transferToFully(fileChannel, blockInPosition, len); // no need to flush. since we know out is not a buffered stream.
}
blockInPosition += len;
} catch (IOException e) { // exception while writing to the client (well, with transferTo(), it could also be while reading from the local file).
throw ioeToSocketException(e);
}
}
if (throttler != null) { throttler.throttle(packetLen); } // rebalancing so throttle
return len;
}
全文請期待 http://zqhxuyuan.github.com 的相關博文