zookeeper源碼分析之FileTxnSnapLog

zookeeper維護的數據有兩類：一類是樹形結構的DataTree，它的每一個節點是DataNode；另一類是TxnLog事務日誌，它包括事務頭TxnHeader和事務數據Record。它們存在zookeeper服務器的內存中，zookeeper也會將它們序列化至文件中。FileSnap文件快照保證內存中的DataTree和session與數據文件的一致性，FileTxnLog則保證內存中的事務數據Record與事務文件的一致性。

一、SnapShot

SnapShot是持久層的快照接口，實現此接口的類負責DataTree和session的持久化（序列化與反序列化）。

/**
 * Persistence-layer contract for snapshots: implementations store and load a
 * DataTree together with the session map.
 */
public interface SnapShot {
    
    /**
     * Deserializes persisted state into the given DataTree and session map.
     *
     * @param dt       the tree to populate
     * @param sessions the session map to populate
     * @return the last zxid contained in the state that was deserialized
     * @throws IOException if the persisted state cannot be read
     */
    long deserialize(DataTree dt, Map<Long, Integer> sessions) 
        throws IOException;
    
    /**
     * Serializes the DataTree and session map into the given file.
     *
     * @param dt       the tree to persist
     * @param sessions the session map to persist
     * @param name     the file to write to
     * @param fsync    presumably requests syncing the file to disk — confirm
     *                 against the implementation
     * @throws IOException if the file cannot be written
     */
    void serialize(DataTree dt, Map<Long, Integer> sessions,
                   File name, boolean fsync)
        throws IOException;
    
    /**
     * Finds the most recent persisted snapshot file.
     *
     * @return the most recent snapshot file
     * @throws IOException if the lookup fails
     */
    File findMostRecentSnapshot() throws IOException;

    /**
     * Returns information about the most recent snapshot.
     *
     * @return info on the last snapshot
     */
    SnapshotInfo getLastSnapshotInfo();

    /**
     * Releases the resources held by this snapshot store.
     *
     * @throws IOException if releasing the resources fails
     */
    void close() throws IOException;
}

二、FileSnap

FileSnap實現SnapShot接口，提供默認的快照實現。構造方法的參數爲快照文件目錄；每一次持久化都會在該目錄下生成一個二進制文件，文件名由前綴snapshot加上zxid的16進制表示組成。

/**
 * File-based {@link SnapShot} implementation.
 *
 * <p>Snapshots are read from and written to files under {@code snapDir}. The zxid a
 * snapshot belongs to is recovered from its file name through
 * {@code Util.getZxidFromName(name, SNAPSHOT_FILE_PREFIX)}.
 */
public class FileSnap implements SnapShot {
    File snapDir;
    SnapshotInfo lastSnapshotInfo = null;
    // Set by close(); once true, serialize(...) silently does nothing.
    private volatile boolean close = false;
    private static final int VERSION = 2;
    private static final long dbId = -1;
    // Magic number stored in the file header and verified when a snapshot is read.
    // The charset is pinned so the value does not depend on the platform default encoding.
    public final static int SNAP_MAGIC = ByteBuffer.wrap(
            "ZKSN".getBytes(java.nio.charset.StandardCharsets.US_ASCII)).getInt();

    public static final String SNAPSHOT_FILE_PREFIX = "snapshot";

    /**
     * Creates a snapshot store backed by the given directory.
     *
     * @param snapDir directory that holds the snapshot files
     */
    public FileSnap(File snapDir) {
        this.snapDir = snapDir;
    }

    /**
     * Returns the info recorded by the last successful deserialize or serialize call.
     *
     * @return the last snapshot info, or {@code null} if none has been recorded yet
     */
    @Override
    public SnapshotInfo getLastSnapshotInfo() {
        return this.lastSnapshotInfo;
    }

    /**
     * Loads the first readable snapshot among up to 100 valid candidates into
     * {@code dt} and {@code sessions}.
     *
     * @param dt       the tree to populate; its {@code lastProcessedZxid} is set from the
     *                 name of the snapshot file that was loaded
     * @param sessions the session map to populate
     * @return the zxid parsed from the loaded snapshot's file name, or {@code -1} when
     *         there is no candidate snapshot at all
     * @throws IOException if candidates exist but none of them could be read
     */
    @Override
    public long deserialize(DataTree dt, Map<Long, Integer> sessions)
            throws IOException {
        // Candidates come back in the order produced by findNValidSnapshots
        // (requested as descending zxid, i.e. newest first).
        List<File> snapList = findNValidSnapshots(100);
        if (snapList.isEmpty()) {
            return -1L;
        }
        File snap = null;
        boolean foundValid = false;
        for (File candidate : snapList) {
            snap = candidate;
            LOG.info("Reading snapshot " + snap);
            try (CheckedInputStream snapIS = SnapStream.getInputStream(snap)) {
                InputArchive ia = BinaryInputArchive.getArchive(snapIS);
                deserialize(dt, sessions, ia);
                // Verify the integrity seal that follows the snapshot payload.
                SnapStream.checkSealIntegrity(snapIS, ia);
                foundValid = true;
                // The first snapshot that loads cleanly wins.
                break;
            } catch (IOException e) {
                // Best effort: log and fall through to the next candidate.
                LOG.warn("problem reading snap file " + snap, e);
            }
        }
        if (!foundValid) {
            throw new IOException("Not able to find valid snapshots in " + snapDir);
        }
        dt.lastProcessedZxid = Util.getZxidFromName(snap.getName(), SNAPSHOT_FILE_PREFIX);
        // File.lastModified() is in milliseconds; SnapshotInfo is given seconds.
        lastSnapshotInfo = new SnapshotInfo(dt.lastProcessedZxid, snap.lastModified() / 1000);
        return dt.lastProcessedZxid;
    }

    /**
     * Reads the file header from {@code ia}, checks its magic number, and then
     * deserializes the tree and sessions.
     *
     * @param dt       the tree to populate
     * @param sessions the session map to populate
     * @param ia       the archive to read from
     * @throws IOException if the magic number does not match or reading fails
     */
    public void deserialize(DataTree dt, Map<Long, Integer> sessions, InputArchive ia) throws IOException {
        FileHeader header = new FileHeader();
        header.deserialize(ia, "fileheader");
        if (header.getMagic() != SNAP_MAGIC) {
            throw new IOException("mismatching magic headers "
                    + header.getMagic() +
                    " !=  " + FileSnap.SNAP_MAGIC);
        }
        SerializeUtils.deserializeSnapshot(dt, ia, sessions);
    }

    /**
     * Finds the most recent valid snapshot file.
     *
     * @return the first file returned by {@code findNValidSnapshots(1)}, or
     *         {@code null} if there is none
     * @throws IOException declared for the interface; lookup errors on individual
     *         files are logged and skipped
     */
    @Override
    public File findMostRecentSnapshot() throws IOException {
        List<File> files = findNValidSnapshots(1);
        if (files.isEmpty()) {
            return null;
        }
        return files.get(0);
    }

    /**
     * Collects up to {@code n} snapshot files from {@code snapDir} that pass
     * {@code SnapStream.isValidSnapshot}, in the order returned by
     * {@code Util.sortDataDir} (requested as descending zxid).
     *
     * @param n maximum number of files to return
     * @return the valid snapshot files found, possibly empty
     */
    private List<File> findNValidSnapshots(int n) throws IOException {
        List<File> files = Util.sortDataDir(snapDir.listFiles(), SNAPSHOT_FILE_PREFIX, false);
        int count = 0;
        List<File> list = new ArrayList<>();
        for (File f : files) {
            try {
                if (SnapStream.isValidSnapshot(f)) {
                    list.add(f);
                    count++;
                    if (count == n) {
                        break;
                    }
                }
            } catch (IOException e) {
                // An unreadable file is skipped rather than aborting the scan.
                LOG.info("invalid snapshot " + f, e);
            }
        }
        return list;
    }

    /**
     * Writes the tree and sessions to {@code snapShot} and records the resulting
     * snapshot info. Does nothing once {@link #close()} has been called.
     *
     * @param dt       the tree to persist
     * @param sessions the session map to persist
     * @param snapShot the file to write to; its name supplies the recorded zxid
     * @param fsync    NOTE(review): not read anywhere in this method as shown —
     *                 confirm whether it should be passed to the output stream
     * @throws IOException if writing fails
     */
    @Override
    public synchronized void serialize(DataTree dt, Map<Long, Integer> sessions, File snapShot, boolean fsync)
            throws IOException {
        if (!close) {
            try (CheckedOutputStream snapOS = SnapStream.getOutputStream(snapShot)) {
                OutputArchive oa = BinaryOutputArchive.getArchive(snapOS);
                // The header carries the magic number, the format version and the db id.
                FileHeader header = new FileHeader(SNAP_MAGIC, VERSION, dbId);
                serialize(dt, sessions, oa, header);
                SnapStream.sealStream(snapOS, oa);
                lastSnapshotInfo = new SnapshotInfo(
                        Util.getZxidFromName(snapShot.getName(), SNAPSHOT_FILE_PREFIX),
                        snapShot.lastModified() / 1000);
            }
        }
    }


    /**
     * Writes the file header followed by the tree and sessions to {@code oa}.
     *
     * @param dt       the tree to persist
     * @param sessions the session map to persist
     * @param oa       the archive to write to
     * @param header   the file header to write first; must not be {@code null}
     * @throws IllegalStateException if {@code header} is {@code null}
     * @throws IOException if writing fails
     */
    protected void serialize(DataTree dt, Map<Long, Integer> sessions,
            OutputArchive oa, FileHeader header) throws IOException {
        // this is really a programmatic error and not something that can
        // happen at runtime
        if (header == null) {
            throw new IllegalStateException(
                    "Snapshot's not open for writing: uninitialized header");
        }
        header.serialize(oa, "fileheader");
        SerializeUtils.serializeSnapshot(dt, oa, sessions);
    }

    /**
     * Marks this store as closed so that later serialize calls become no-ops.
     */
    @Override
    public synchronized void close() throws IOException {
        close = true;
    }
}

三、TxnLog

/**
 * Interface for reading and writing the transaction log.
 */
public interface TxnLog extends Closeable {

    /**
     * Sets the ServerStats instance used by this log.
     *
     * @param serverStats the server statistics holder
     */
    void setServerStats(ServerStats serverStats);
    
    /**
     * Rolls over the log currently being appended to, so that later appends go
     * to a new log (this is a roll-over, not a rollback).
     *
     * @throws IOException if rolling the log fails
     */
    void rollLog() throws IOException;
    
    /**
     * Appends a transaction to the log.
     *
     * @param hdr the transaction header
     * @param r   the transaction record
     * @return {@code true} if the transaction was appended
     * @throws IOException if writing fails
     */
    boolean append(TxnHeader hdr, Record r) throws IOException;

    /**
     * Starts reading the log from the given zxid.
     *
     * @param zxid the zxid to start reading from
     * @return an iterator over the logged transactions
     * @throws IOException if the log cannot be read
     */
    TxnIterator read(long zxid) throws IOException;
    
    /**
     * Returns the zxid of the last logged transaction.
     *
     * @return the last logged zxid
     * @throws IOException if the log cannot be read
     */
    long getLastLoggedZxid() throws IOException;
    
    /**
     * Truncates the log at the given zxid.
     *
     * @param zxid the zxid to truncate at
     * @return {@code true} if the truncation succeeded
     * @throws IOException if truncating fails
     */
    boolean truncate(long zxid) throws IOException;
    
    /**
     * Returns the dbId of this transaction log.
     *
     * @return the database id
     * @throws IOException if the id cannot be read
     */
    long getDbId() throws IOException;
    
    /**
     * Commits the appended transactions, making sure they are persisted.
     *
     * @throws IOException if flushing to storage fails
     */
    void commit() throws IOException;

    /**
     * Returns the elapsed time of the transaction log sync.
     *
     * @return elapsed sync time (presumably in milliseconds — confirm against
     *         the implementation)
     */
    long getTxnLogSyncElapsedTime();
   
    /**
     * Sets the total log size.
     *
     * @param size the total size (presumably in bytes — confirm)
     */
    void setTotalLogSize(long size);

    /**
     * Returns the total log size.
     *
     * @return the total size (presumably in bytes — confirm)
     */
    long getTotalLogSize();

    /**
     * Iterator for reading the transaction log.
     */
    public interface TxnIterator extends Closeable {

        /**
         * @return the header of the current transaction
         */
        TxnHeader getHeader();

        /**
         * @return the record of the current transaction
         */
        Record getTxn();

        /**
         * Advances to the next transaction.
         *
         * @return {@code true} if another transaction is available
         * @throws IOException if reading fails
         */
        boolean next() throws IOException;

        /**
         * @return the storage size this iterator covers (presumably the size of
         *         the underlying log files — confirm against the implementation)
         * @throws IOException if the size cannot be determined
         */
        long getStorageSize() throws IOException;
    }
}

四、FileTxnLog

FileTxnLog實現TxnLog接口，提供操作txnlogs的公共api。事務日誌的格式如下：
LogFile: FileHeader TxnList ZeroPad
FileHeader: {
magic 4bytes (ZKLG)
version 4bytes
dbid 8bytes
}

TxnList: Txn || Txn TxnList
Txn: checksum Txnlen TxnHeader Record 0x42
checksum: 8bytes Adler32（通過Txnlen, TxnHeader, Record和0x42計算得出）
Txnlen: len 4bytes
TxnHeader: {
sessionid 8bytes
cxid 4bytes
zxid 8bytes
time 8bytes
type 4bytes
}

public class FileTxnLog implements TxnLog {
    private static final Logger LOG;

    public final static int TXNLOG_MAGIC = ByteBuffer.wrap("ZKLG".getBytes()).getInt();

    public final static int VERSION = 2;

    long lastZxidSeen;
    volatile BufferedOutputStream logStream = null;
    volatile OutputArchive oa;
    volatile FileOutputStream fos = null;

    File logDir;
    private final boolean forceSync = !System.getProperty("zookeeper.forceSync", "yes").equals("no");
    long dbId;
    private final Queue<FileOutputStream> streamsToFlush = new ArrayDeque<>();
    File logFileWrite = null;
    private FilePadding filePadding = new FilePadding();

    private ServerStats serverStats;

    private volatile long syncElapsedMS = -1L;

    // 構造方法，傳入日誌目錄
    public FileTxnLog(File logDir) {
        this.logDir = logDir;
    }

    // 回滾日誌方法
    public synchronized void rollLog() throws IOException {
    // 當前日誌流不爲空，則刷新
        if (logStream != null) {
            this.logStream.flush();
            prevLogsRunningTotal += getCurrentLogSize();
            this.logStream = null;
            oa = null;
	    // 回滾當前日誌文件數據添加至prevLogsRunningTotal中即可
        }
    }

    // 關閉已經打開的文件流
    public synchronized void close() throws IOException {
        if (logStream != null) {
            logStream.close();
        }
        for (FileOutputStream log : streamsToFlush) {
            log.close();
        }
    }

    // 添加事務日誌數據
    public synchronized boolean append(TxnHeader hdr, Record txn)
        throws IOException {
	// 校驗駛入頭消息
        if (hdr == null) {
            return false;
        }
	// 校驗事務的zxid是否比最後的大
        if (hdr.getZxid() <= lastZxidSeen) {
            LOG.warn("Current zxid " + hdr.getZxid()
                    + " is <= " + lastZxidSeen + " for "
                    + hdr.getType());
        } else {
            lastZxidSeen = hdr.getZxid();
        }
        if (logStream==null) {
           if(LOG.isInfoEnabled()){
                LOG.info("Creating new log file: " + Util.makeLogName(hdr.getZxid()));
           }
	   // 創建文件
           logFileWrite = new File(logDir, Util.makeLogName(hdr.getZxid()));
	   // 打開日誌文件流
           fos = new FileOutputStream(logFileWrite);
	   // 打開日誌內存緩衝流
           logStream=new BufferedOutputStream(fos);
	   // 構造BinaryOutputArchive
           oa = BinaryOutputArchive.getArchive(logStream);
	   // 構造FileHeader信息
           FileHeader fhdr = new FileHeader(TXNLOG_MAGIC,VERSION, dbId);
	   // 先序列化文件頭
           fhdr.serialize(oa, "fileheader");
           // 刷新數據至文件中，確保魔鬼數據比filePadding先至文件
           logStream.flush();
           filePadding.setCurrentSize(fos.getChannel().position());
           streamsToFlush.add(fos);
        }
        filePadding.padFile(fos.getChannel());
	// 調用util，序列化hdr和txn
        byte[] buf = Util.marshallTxnEntry(hdr, txn);
        if (buf == null || buf.length == 0) {
            throw new IOException("Faulty serialization for header " +
                    "and txn");
        }
	// 構造Checksum
        Checksum crc = makeChecksumAlgorithm();
        crc.update(buf, 0, buf.length);
	// 先寫入長度校驗數據
        oa.writeLong(crc.getValue(), "txnEntryCRC");
	// 再寫buf數據
        Util.writeTxnBytes(oa, buf);
        return true;
    }

    // 獲取最後一條事務日誌的zxid
    public long getLastLoggedZxid() {
        // 獲取目錄下zxid大於0的所有文件
        File[] files = getLogFiles(logDir.listFiles(), 0);
        long maxLog=files.length>0?
                Util.getZxidFromName(files[files.length-1].getName(),LOG_FILE_PREFIX):-1;
        long zxid = maxLog;
        TxnIterator itr = null;
        try {
	    // 新建 TxnIterator遍歷器，讀取大於maxLog所有的文件，找到zxid的最大值
            FileTxnLog txn = new FileTxnLog(logDir);
            itr = txn.read(maxLog);
            while (true) {
                if(!itr.next())
                    break;
                TxnHeader hdr = itr.getHeader();
                zxid = hdr.getZxid();
            }
        } catch (IOException e) {
            LOG.warn("Unexpected exception", e);
        } finally {
            close(itr);
        }
        return zxid;
    }
    // 安靜的關閉事務日誌TxnIterator遍歷器
    private void close(TxnIterator itr) {
        if (itr != null) {
            try {
                itr.close();
            } catch (IOException ioe) {
                LOG.warn("Error closing file iterator", ioe);
            }
        }
    }

    // 提交日誌，確保所有數據都刷新至文件中 
    public synchronized void commit() throws IOException {
        // 刷新當前logStream
        if (logStream != null) {
            logStream.flush();
        }
	// 刷新所有log
        for (FileOutputStream log : streamsToFlush) {
            log.flush();
            if (forceSync) {
                long startSyncNS = System.nanoTime();

                FileChannel channel = log.getChannel();
                channel.force(false);

                syncElapsedMS = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startSyncNS);
                if (syncElapsedMS > fsyncWarningThresholdMS) {
                    if(serverStats != null) {
                        serverStats.incrementFsyncThresholdExceedCount();
                    }
                }
                ServerMetrics.getMetrics().FSYNC_TIME.add(syncElapsedMS);
            }
        }
        while (streamsToFlush.size() > 1) {
            streamsToFlush.poll().close();
        }
        // Roll the log file if we exceed the size limit
        if(txnLogSizeLimit > 0) {
            long logSize = getCurrentLogSize();
            if (logSize > txnLogSizeLimit) {
                LOG.debug("Log size limit reached: {}", logSize);
                rollLog();
            }
        }
    }

    // truncate the current transaction logs
    public boolean truncate(long zxid) throws IOException {
        FileTxnIterator itr = null;
        try {
	    // 構造logDir目錄下，大於zxid的文件遍歷器
            itr = new FileTxnIterator(this.logDir, zxid);
            PositionInputStream input = itr.inputStream;
            if(input == null) {
                throw new IOException("No log files found to truncate! This could " +
                        "happen if you still have snapshots from an old setup or " +
                        "log files were deleted accidentally or dataLogDir was changed in zoo.cfg.");
            }
            long pos = input.getPosition();
            //  通過RandomAccessFile設置文件長度至當前位置
            RandomAccessFile raf=new RandomAccessFile(itr.logFile,"rw");
            raf.setLength(pos);
            raf.close();
            while(itr.goToNextLog()) {
	         // 刪除文件
                if (!itr.logFile.delete()) {
                    LOG.warn("Unable to truncate {}", itr.logFile);
                }
            }
        } finally {
	    // 關閉遍歷器
            close(itr);
        }
        return true;
    }


    // 獲取dbId，讀取目錄下第一個文件，獲取文件頭中dbId
    public long getDbId() throws IOException {
        FileTxnIterator itr = new FileTxnIterator(logDir, 0);
        FileHeader fh=readHeader(itr.logFile);
        itr.close();
        if(fh==null)
            throw new IOException("Unsupported Format.");
        return fh.getDbid();
    }
}

五、FileTxnIterator

FileTxnIterator文件遍歷器是FileTxnLog的內部類，它實現TxnLog.TxnIterator接口，提供文件遍歷功能。

    /**
     * Iterator over the transactions stored in the log files of a directory,
     * moving on to the next file automatically when the current one is exhausted.
     */
    public static class FileTxnIterator implements TxnLog.TxnIterator {
        // Directory that holds the log files.
        File logDir;
        // Transactions from this zxid onwards are of interest.
        long zxid;
        // Header of the transaction the iterator is currently positioned on.
        TxnHeader hdr;
        // Record of the transaction the iterator is currently positioned on.
        Record record;
        // Log file currently being read.
        File logFile;
        // Archive built on top of the current file's input stream.
        InputArchive ia;
        PositionInputStream inputStream=null;
        // Log files still to be visited; consumed from the end of the list.
        private ArrayList<File> storedFiles;

        /**
         * Opens the first relevant log file and reads its first transaction.
         *
         * @param logDir      directory that holds the log files
         * @param zxid        zxid to start from
         * @param fastForward when {@code true}, skip ahead until the current
         *                    transaction's zxid is no longer below {@code zxid}
         *                    (or the logs are exhausted)
         * @throws IOException if a log file cannot be read
         */
        public FileTxnIterator(File logDir, long zxid, boolean fastForward)
                throws IOException {
            this.logDir = logDir;
            this.zxid = zxid;
            init();
            if (!fastForward || hdr == null) {
                return;
            }
            while (hdr.getZxid() < zxid && next()) {
                // next() has already moved the cursor; nothing else to do here
            }
        }

        /**
         * Same as the three-argument constructor with {@code fastForward} enabled.
         *
         * @param logDir directory that holds the log files
         * @param zxid   zxid to start from
         * @throws IOException if a log file cannot be read
         */
        public FileTxnIterator(File logDir, long zxid) throws IOException {
            this(logDir, zxid, true);
        }

        /**
         * Selects the log files to visit, opens the first one and reads its
         * first transaction.
         *
         * <p>Walking the files in the order returned by {@code Util.sortDataDir}
         * (requested as descending zxid), every file is kept until the first one
         * whose name zxid is below {@code zxid}; that file is kept too and the
         * walk stops.
         */
        void init() throws IOException {
            storedFiles = new ArrayList<File>();
            List<File> sortedLogs = Util.sortDataDir(FileTxnLog.getLogFiles(logDir.listFiles(), 0), LOG_FILE_PREFIX, false);
            for (File candidate : sortedLogs) {
                long fileZxid = Util.getZxidFromName(candidate.getName(), LOG_FILE_PREFIX);
                storedFiles.add(candidate);
                if (fileZxid < zxid) {
                    // this file may still contain the requested zxid; older ones cannot matter
                    break;
                }
            }
            goToNextLog();
            next();
        }

        /**
         * Moves to the next pending log file, if any, and sets up its archive.
         *
         * @return {@code true} if another file was taken, {@code false} if none is left
         */
        private boolean goToNextLog() throws IOException {
            if (storedFiles.isEmpty()) {
                return false;
            }
            int lastIndex = storedFiles.size() - 1;
            this.logFile = storedFiles.remove(lastIndex);
            ia = createInputArchive(this.logFile);
            return true;
        }

        /**
         * Creates the input stream and archive for {@code logFile} unless an
         * input stream is already open, in which case the existing archive is
         * returned unchanged.
         *
         * @param logFile the log file to open
         * @return the archive to read from
         */
        protected InputArchive createInputArchive(File logFile) throws IOException {
            if (inputStream != null) {
                return ia;
            }
            inputStream = new PositionInputStream(new BufferedInputStream(new FileInputStream(logFile)));
            LOG.debug("Created new input stream " + logFile);
            ia = BinaryInputArchive.getArchive(inputStream);
            inStreamCreated(ia,inputStream);
            LOG.debug("Created new input archive " + logFile);
            return ia;
        }

        /**
         * @return a fresh Adler32 checksum instance
         */
        protected Checksum makeChecksumAlgorithm(){
            return new Adler32();
        }

        /**
         * Reads the next transaction, verifying its checksum, and moves on to
         * the next log file when the current one ends.
         *
         * @return {@code true} if a transaction was read, {@code false} if there
         *         is nothing left to read
         * @throws IOException on a checksum mismatch or any other read failure
         */
        public boolean next() throws IOException {
            if (ia == null) {
                return false;
            }
            try {
                // each entry is stored as: checksum, then the serialized bytes
                long storedChecksum = ia.readLong("crcvalue");
                byte[] payload = Util.readTxnBytes(ia);
                // Since we preallocate, we define EOF to be an empty transaction
                if (payload == null || payload.length == 0) {
                    throw new EOFException("Failed to read " + logFile);
                }
                Checksum computed = makeChecksumAlgorithm();
                computed.update(payload, 0, payload.length);
                if (storedChecksum != computed.getValue()) {
                    throw new IOException(CRC_ERROR);
                }
                hdr = new TxnHeader();
                record = SerializeUtils.deserializeTxn(payload, hdr);
                return true;
            } catch (EOFException e) {
                LOG.debug("EOF exception " + e);
                inputStream.close();
                inputStream = null;
                ia = null;
                hdr = null;
                // this means that the file has ended
                // we should go to the next file and keep reading from there
                return goToNextLog() && next();
            } catch (IOException e) {
                inputStream.close();
                throw e;
            }
        }
    }

六、FileTxnSnapLog

FileTxnSnapLog是融合TxnLog和SnapShot的工具類

/**
 * Helper that combines a {@link TxnLog} and a {@link SnapShot} so callers can
 * restore, save and truncate persisted state through one object.
 */
public class FileTxnSnapLog {

    // the directory containing the transaction logs
    final File dataDir;
    // the directory containing the snapshots
    final File snapDir;
    TxnLog txnLog;
    SnapShot snapLog;

    /**
     * Creates the helper on top of versioned subdirectories of the given directories.
     *
     * @param dataDir parent directory of the transaction log directory
     * @param snapDir parent directory of the snapshot directory
     * @throws IOException if a directory check fails
     */
    public FileTxnSnapLog(File dataDir, File snapDir) throws IOException {

        this.dataDir = new File(dataDir, version + VERSION);
        this.snapDir = new File(snapDir, version + VERSION);
        // The directory checks only run when logs and snapshots live in different directories.
        if (!this.dataDir.getPath().equals(this.snapDir.getPath())) {
            checkLogDir();
            checkSnapDir();
        }
        txnLog = new FileTxnLog(this.dataDir);
        snapLog = new FileSnap(this.snapDir);
    }

    /**
     * Restores state by loading a snapshot into {@code dt} and {@code sessions}
     * and then delegating to {@code fastForwardFromEdits}.
     *
     * <p>Side effect: an "initialize" file next to the data directory is deleted
     * if it exists.
     *
     * @param dt       the tree to populate
     * @param sessions the session map to populate
     * @param listener the playback listener passed on to {@code fastForwardFromEdits}
     * @return the value returned by {@code fastForwardFromEdits}
     * @throws IOException if the snapshot or the logs cannot be read
     */
    public long restore(DataTree dt, Map<Long, Integer> sessions,
                        PlayBackListener listener) throws IOException {
        long snapLoadingStartTime = Time.currentElapsedTime();
        // NOTE(review): deserializeResult is never read below, so the "no snapshot
        // found" result (-1) gets no special handling here — confirm this is intended.
        long deserializeResult = snapLog.deserialize(dt, sessions);
        ServerMetrics.getMetrics().STARTUP_SNAP_LOAD_TIME.add(
                Time.currentElapsedTime() - snapLoadingStartTime);
        // NOTE(review): this local shadows the txnLog field and is never used or closed.
        FileTxnLog txnLog = new FileTxnLog(dataDir);
        // NOTE(review): trustEmptyDB is assigned but never read in this method.
        boolean trustEmptyDB;
        File initFile = new File(dataDir.getParent(), "initialize");
        if (Files.deleteIfExists(initFile.toPath())) {
            LOG.info("Initialize file found, an empty database will not block voting participation");
            trustEmptyDB = true;
        } else {
            trustEmptyDB = autoCreateDB;
        }
        return fastForwardFromEdits(dt, sessions, listener);
    }

    /**
     * Writes a snapshot of the tree and sessions to a file named after the
     * tree's last processed zxid.
     *
     * @param dataTree             the tree to persist
     * @param sessionsWithTimeouts the session map to persist
     * @param syncSnap             passed to the snapshot store as its fsync flag
     * @throws IOException if writing fails; an empty snapshot file left behind
     *         is deleted before the exception is rethrown
     */
    public void save(DataTree dataTree,
                     ConcurrentHashMap<Long, Integer> sessionsWithTimeouts,
                     boolean syncSnap)
        throws IOException {
        long lastZxid = dataTree.lastProcessedZxid;
        File snapshotFile = new File(snapDir, Util.makeSnapshotName(lastZxid));
        LOG.info("Snapshotting: 0x{} to {}", Long.toHexString(lastZxid),
                snapshotFile);
        try {
            snapLog.serialize(dataTree, sessionsWithTimeouts, snapshotFile, syncSnap);
        } catch (IOException e) {
            // Only a zero-length file is removed; a non-empty file is left in place.
            if (snapshotFile.length() == 0) {
                if (snapshotFile.delete()) {
                    LOG.info("Deleted empty snapshot file: " +
                             snapshotFile.getAbsolutePath());
                } else {
                    LOG.warn("Could not delete empty snapshot file: " +
                             snapshotFile.getAbsolutePath());
                }
            }
            throw e;
        }
    }

    /**
     * Truncates the transaction log at {@code zxid} and then re-creates the log
     * and snapshot handles.
     *
     * @param zxid the zxid to truncate at
     * @return the result of {@code FileTxnLog.truncate}
     * @throws IOException if closing or truncating fails
     */
    public boolean truncateLog(long zxid) throws IOException {
        // close the existing txnLog and snapLog
        close();

        // truncate it; try-with-resources closes truncLog even when truncate throws
        boolean truncated;
        try (FileTxnLog truncLog = new FileTxnLog(dataDir)) {
            truncated = truncLog.truncate(zxid);
        }
        txnLog = new FileTxnLog(dataDir);
        snapLog = new FileSnap(snapDir);

        return truncated;
    }

    /**
     * Finds the most recent snapshot file using a fresh {@code FileSnap} on the
     * snapshot directory.
     *
     * @return the most recent snapshot file, as returned by
     *         {@code FileSnap.findMostRecentSnapshot}
     * @throws IOException if the lookup fails
     */
    public File findMostRecentSnapshot() throws IOException {
        FileSnap snaplog = new FileSnap(snapDir);
        return snaplog.findMostRecentSnapshot();
    }
}

青楓綠嶼

發佈了40 篇原創文章 · 獲贊 43 · 訪問量 6萬+

私信關注

zookeeper源碼分析之FileTxnSnapLog

目錄

一、SnapShot

二、FileSnap

三、TxnLog

四、FileTxnLog

五、FileTxnIterator

六、FileTxnSnapLog

Dubbo源碼分析之SPI(一) | serviceLoader

Dubbo源碼分析之Netty網絡服務（二）

Dubbo源碼分析之SPI(二) | ExtensionLoader

zookeeper的windows和Linux的安裝與啓動

zookeeper java api 操作（三） | zkclient

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結