Analysis of MySQL streaming reads for large data volumes and batch inserts

1. Streaming reads
When Java reads a large volume of data from MySQL, the application can process each row as soon as the MySQL server returns it, so it does not need a large amount of memory to hold the whole result set. In that case a streaming read should be used.

PreparedStatement ps = connection.prepareStatement("select .. from ..", 
            ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY); 

/*
TYPE_FORWARD_ONLY and CONCUR_READ_ONLY are the MySQL driver's defaults, so they can be omitted, e.g.: PreparedStatement ps = connection.prepareStatement("select .. from ..");
*/

//this can also be configured with the defaultFetchSize parameter in the JDBC URL, so that by default all results are returned in streaming mode
ps.setFetchSize(Integer.MIN_VALUE); 
ResultSet rs = ps.executeQuery(); 

while (rs.next()) { 
  System.out.println(rs.getString("fieldName")); 
}

The MySQL driver streams a result set only when three conditions are met: the result set is forward-only, read-only, and the fetch size is Integer.MIN_VALUE. We can see this in the driver source (createStreamingResultSet()):
/**
 * We only stream result sets when they are forward-only, read-only, and the
 * fetch size has been set to Integer.MIN_VALUE
 *
 * @return true if this result set should be streamed row at-a-time, rather
 * than read all at once.
 */
protected boolean createStreamingResultSet() {
    try {
        synchronized(checkClosed().getConnectionMutex()) {
            return ((this.resultSetType == java.sql.ResultSet.TYPE_FORWARD_ONLY)
                 && (this.resultSetConcurrency == java.sql.ResultSet.CONCUR_READ_ONLY) 
                 && (this.fetchSize == Integer.MIN_VALUE));
        }
    } catch (SQLException e) {
        // we can't break the interface, having this be no-op in case of error is ok

        return false;
    }
}
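Putting this together, here is a minimal self-contained sketch of a streaming read. It is only an illustration: it assumes the Test table used in section 3 below, uses placeholder connection details, and obtains a plain DriverManager connection instead of a connection pool.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class StreamingReadDemo {

    public static void main(String[] args) throws SQLException {
        // placeholder URL and credentials; adjust to your environment
        String url = "jdbc:mysql://ip:3306/test";

        try (Connection connection = DriverManager.getConnection(url, "name", "password");
             PreparedStatement ps = connection.prepareStatement(
                     "select name from Test",
                     ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)) {

            // Integer.MIN_VALUE makes the MySQL driver stream rows one at a time
            // instead of buffering the whole result set in memory
            ps.setFetchSize(Integer.MIN_VALUE);

            try (ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    // process each row as soon as it arrives
                    System.out.println(rs.getString("name"));
                }
            }
        }
    }
}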

2. Batch writes
If the application inserts data one row at a time, writes are very slow. The first reason is that single-row inserts require a large number of request/response round trips between the application and the database, and each request is committed as an independent transaction; when network latency is high, many small requests means a lot of time lost to the network. The second reason is that for every transaction the database flushes the transaction log to disk to guarantee durability. Because each transaction writes only one row, disk I/O is used inefficiently: disks are written block by block, so writing a large amount of data contiguously is much more efficient. The writes should therefore be converted to batches, reducing both the number of requests and the number of transactions. Below is a batch-insert example:

int batchSize = 1000;
PreparedStatement ps = connection.prepareStatement("insert into tb1 (c1,c2,c3...) values (?,?,?...)");

for (int i = 0; i < list.size(); i++) {

    ps.setObject(1, list.get(i).getC1());
    ps.setObject(2, list.get(i).getC2());
    ps.setObject(3, list.get(i).getC3());

    ps.addBatch();

    if ((i + 1) % batchSize == 0) {
        ps.executeBatch();
    }
}

if (list.size() % batchSize != 0) {
    ps.executeBatch();
}
//Note: the JDBC connection URL must include rewriteBatchedStatements=true
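As noted above, part of the cost of row-by-row inserts is that every statement is its own transaction and forces a transaction-log flush. A common complement to batching is therefore to disable auto-commit and commit explicitly once per batch. The following is only a sketch of that variation, reusing the hypothetical tb1 statement, list and connection from the example above:

connection.setAutoCommit(false);
try (PreparedStatement ps = connection.prepareStatement(
        "insert into tb1 (c1,c2,c3) values (?,?,?)")) {

    int batchSize = 1000;
    for (int i = 0; i < list.size(); i++) {
        ps.setObject(1, list.get(i).getC1());
        ps.setObject(2, list.get(i).getC2());
        ps.setObject(3, list.get(i).getC3());
        ps.addBatch();

        if ((i + 1) % batchSize == 0) {
            ps.executeBatch();
            connection.commit(); // one transaction per batch instead of one per row
        }
    }

    ps.executeBatch();   // flush the remaining rows
    connection.commit();
} catch (SQLException e) {
    connection.rollback();
    throw e;
} finally {
    connection.setAutoCommit(true);
}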

The code above sends one request per 1000 rows. With rewriteBatchedStatements=true, the MySQL driver merges the parameters accumulated by the addBatch() calls on the client side into a single multi-value INSERT statement and sends that to the database, for example insert into tb1(c1,c2,c3) values (v1,v2,v3),(v4,v5,v6),(v7,v8,v9)…. This issues far fewer requests than one INSERT per row, cutting the time spent on network round trips and disk I/O and therefore raising TPS.

3. Full code example

import com.alibaba.druid.pool.DruidDataSource;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class TestInsert {

    public static void main(String[] args) throws SQLException {

        int batchSize = 1000;
        int insertCount = 1000;

        testDefault(batchSize, insertCount);

        testRewriteBatchedStatements(batchSize,insertCount);

    }

    //insert using the default connection settings (no extra JDBC properties)
    private static void testDefault(int batchSize, int insertCount) throws SQLException{  

        long start = System.currentTimeMillis();

        doBatchedInsert(batchSize, insertCount,"");

        long end = System.currentTimeMillis();
        System.out.println("default:" + (end -start) + "ms");
    }


    //insert with rewriteBatchedStatements=true
    private static void testRewriteBatchedStatements(int batchSize, int insertCount) throws SQLException {

        long start = System.currentTimeMillis();

        doBatchedInsert(batchSize, insertCount, "rewriteBatchedStatements=true");

        long end = System.currentTimeMillis();
        System.out.println("rewriteBatchedStatements:" + (end -start) + "ms");
    }


    private static void doBatchedInsert(int batchSize, int insertCount, String mysqlProperties) throws SQLException {
        DruidDataSource dataSource = new DruidDataSource();
        dataSource.setUrl("jdbc:mysql://ip:3306/test?" + mysqlProperties);
        dataSource.setUsername("name");
        dataSource.setPassword("password");

        dataSource.init();

        Connection connection = dataSource.getConnection();

        PreparedStatement preparedStatement = connection.prepareStatement("insert into Test (name,gmt_created,gmt_modified) values (?,now(),now())");

        for (int i = 0; i < insertCount; i++) {
            preparedStatement.setString(1, i+" ");
            preparedStatement.addBatch();
            if((i+1) % batchSize == 0) {
                preparedStatement.executeBatch();
            }
        }
        preparedStatement.executeBatch(); // flush any remaining rows

        preparedStatement.close();
        connection.close();
        dataSource.close();
    }

}