一星期沒有寫博文了,只是因爲最近領導交代了一項艱鉅的任務,讓我無暇顧及其他。將mysql數據庫中的數據搬到我們的ES集羣中。mysql數據是我們從阿里那邊拉過來臨時存放數據的地方,現在要將其存儲到我們的集羣中。說道這裏,可能讀者會覺得,這有什麼難度。是的,如果數據量很少,幾百萬或者上千萬確實多花點時間傳送,就可以了。可是,如果是二十幾億的數據量呢,怎麼辦?領導給了我一星期時間,包括我寫代碼,調試速度,開始拉數據。爲此,我被這二十幾億數據折磨得體無完膚。
接下來就是我寫代碼,我用了半天時間寫好了代碼,進行了測試,調試了速度,開始部署,程序是跑起來了,我鬆了一口氣。緊接着,我將其部署到正式環境,在ES中建好index,在mysql中建了一個測試表,插入了幾十萬數據,將mysql數據中的數據開始導入到集羣,似乎還可以的樣子。這時候一天時間已經過去了。
然後我就將其開始正式操作。這時候問題來了,首先拋開速度不說,mysql中每張表一億多條數據,數據裏面未知的東西太多太多,數據格式、空、各種奇葩字符、特殊字符等。我之前沒有想到的異常處理,改善了代碼。然後程序可以跑通了。我從mysql中找了一張表,決定先插入100萬數據,試試。問題又來了,從mysql中查找了100萬數據,但是最終插入到集羣中的卻只有99萬多,還有幾千條數據去哪了,縱使我考慮了各種異常情況,還是有這麼多數據量的丟失。老大給我的要求是數據量丟失控制在百萬分之一啊。無奈,我再次改善代碼,我打印了更多的log信息,記錄下插入到集羣中失敗的數據。然後分析了這些數據,再次改進了代碼,這次100萬數據全部插入進去了。我終於緩了一緩。
最後,就是調試速度了,採用多線程,如何從mysql中查詢,查詢出來的數據是放到內存中然後直接讀取插入到集羣中還是寫入文件後在讀取文件插入集羣中,取決於自己了。個人推薦第一種。當然這其中還有很多問題需要考慮,比如連接如果斷掉,失敗數據記錄,磁盤負載等之類,對於大數據量的導入,需要注意的問題很多。下面我將貼上我最開始的大概代碼,供大家參考。
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.PropertyNamingStrategy;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.elasticsearch.action.bulk.*;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import java.io.*;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.sql.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Date;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
public class FileToEsOrderTest {
static ConcurrentLinkedQueue<String> queues = new ConcurrentLinkedQueue<String>();
static AtomicBoolean isInsert = new AtomicBoolean(true);
static TransportClient client = null;
public static void main(String[] agrs) throws Exception {
Settings settings = Settings.settingsBuilder()
.put("cluster.name", "elasticsearch-cluster").build();
client = TransportClient.builder().settings(settings).build();
try {
client.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName("xxxxx"), 9500));
} catch (UnknownHostException error) {
System.out.print(error.getMessage());
}
final long aa = System.currentTimeMillis();
final ConcurrentHashMap<String, Boolean> hashMap = new ConcurrentHashMap();
for (int i = 0; i < 20; i++) {
new Thread(new Runnable() {
Integer num = 1;
public void run() {
//Add transport addresses and do something with the client...
hashMap.put(Thread.currentThread().getName(), Boolean.FALSE);
final BulkProcessor bulkProcessor = BulkProcessor.builder(
client,
new BulkProcessor.Listener() {
//批量成功後執行
public void afterBulk(long l, BulkRequest bulkRequest,
BulkResponse bulkResponse) {
System.out.println("請求數量是:" +
bulkRequest.numberOfActions());
if (bulkResponse.hasFailures()) {
for (BulkItemResponse item :
bulkResponse.getItems()) {
if (item.isFailed()) {
System.out.println("失敗信息:--------" +
item.getFailureMessage());
}
}
}
}
//批量提交之前執行
public void beforeBulk(long executionId,
BulkRequest request) {
}
//批量失敗後執行
public void afterBulk(long executionId,
BulkRequest request,
Throwable failure) {
System.out.println("happen fail = " +
failure.getMessage() + " ,
cause = " + failure.getCause());
}
})
.setBulkActions(10000)
.setBulkSize(new ByteSizeValue(100, ByteSizeUnit.MB))
.setBackoffPolicy(
BackoffPolicy.exponentialBackoff(
TimeValue.timeValueMillis(100), 3))
.setConcurrentRequests(1)
.build();
while (true) {
if (!queues.isEmpty()) {
try {
String json = queues.poll();
if (json == null) continue;
int index1 = json.indexOf("checksum");
int index2 = json.indexOf("}", index1);
index1 += 10;
String id = json.substring(index1 + 1, index2 - 1);
int index3 = json.indexOf("dp_id");
int index4 = json.indexOf(",", index3);
index3 += 7;
String routing = json.substring(index3 + 1, index4 - 1);
count++;
bulkProcessor.add(new IndexRequest("xxxx",
"xxxxx").id(id).routing(routing).source(json));
} catch (Exception e) {
System.out.print(e.getMessage());
}
}
if (queues.isEmpty() && !isInsert.get()) {
bulkProcessor.flush();
long jjj = System.currentTimeMillis() - aa;
System.out.print(" " + Thread.currentThread().getName()
+ ":" + jjj + " ");
hashMap.put(Thread.currentThread().getName(), Boolean.TRUE);
while (hashMap.values().contains(Boolean.FALSE)) {
try {
Thread.currentThread().sleep(1 * 1000);
} catch (Exception e) {
e.printStackTrace(System.out);
}
}
bulkProcessor.close();
break;
}
}
}
}).start();
}
// File file = new File("/test/rfm/rfm_data.txt");
// FileOutputStream fileOutputStream = new FileOutputStream((file));
// OutputStreamWriter outputStreamWriter =
// new OutputStreamWriter(fileOutputStream);
// bufferedWriter = new BufferedWriter(outputStreamWriter);
for(int i = 2; i <= 23; i++){
WriteData("xxx" + i);
}
// WriteData("rfm_1");
// bufferedWriter.close();
// outputStreamWriter.close();
// fileOutputStream.close();
System.out.println("數據寫入完畢");
}
// 寫數據
public static void WriteData(String tableName) throws IOException {
Connection conn = null;
PreparedStatement ps = null;
ResultSet rs = null;
Integer count = 1;
List<String> columnName = Arrays.asList("trade_last_interval","trade_first_interval");
List<String> columnDateName = Arrays.asList("modify","trade_first_time","trade_last_time");
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
try {
Class.forName("com.mysql.jdbc.Driver");
String url = "jdbc:mysql://xxxxxxxxx";
conn = DriverManager.getConnection(url, "xxxx", "xxxx");
System.out.println("寫入數據開始,成功連接MySQL:" + tableName);
String sql = "select * from " + tableName;
ps = conn.prepareStatement(sql,
ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
ps.setFetchSize(Integer.MIN_VALUE);
rs = ps.executeQuery();
ResultSetMetaData rsmd = rs.getMetaData();
int colCount = rsmd.getColumnCount();
ObjectMapper objectMapper = new ObjectMapper()
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
.setDateFormat(new SimpleDateFormat("yyyy-MM-dd"))
.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS)
.setPropertyNamingStrategy(
PropertyNamingStrategy.CAMEL_CASE_TO_LOWER_CASE_WITH_UNDERSCORES);
while(rs.next()) { //while控制行數
Map<String, String> map = new LinkedHashMap<>();
//StringBuilder buffer = new StringBuilder();
for(int i = 1; i <= colCount; i++ ) {
String name = rsmd.getColumnName(i);
if(!columnName.contains(name)) {
String value = rs.getString(i);
boolean flag = true;
if(columnDateName.contains(name)){
try {
dateFormat.parse(value);
} catch (Exception e){
flag = false;
}
} else if("buyer_nick".equalsIgnoreCase(name)){
value = encrypt(value);
}
if (flag && value != null && !"".equals(value.trim()) && value.trim().length() > 0) {
//buffer.append("\"" + name + "\":\"" + value + "\"");
//buffer.append(",");
map.put(name, value);
}
}
}
count++;
if(map != null && map.size() > 0){
queues.add(objectMapper.writeValueAsString(map));
}
if(count % 200000 == 0){
int number = queues.size();
int jj = number/200000;
System.out.println("index: " + count + ",
jj: " + jj + ", number: " + number);
while(jj > 0){
try {
Thread.sleep(2000*jj);
} catch (InterruptedException e) {
e.printStackTrace();
}
int number2 = queues.size();
jj = number2 / 200000;
System.out.println("index2: " + count + ",
jj: " + jj + ", number2: " + number2);
}
}
}
isInsert = new AtomicBoolean(false);
} catch (ClassNotFoundException e) {
e.printStackTrace();
}catch (SQLException e) {
e.printStackTrace();
} finally {
try {
if(rs != null) {
rs.close();
}
} catch (SQLException e) {
e.printStackTrace();
}
try {
if(ps != null) {
ps.close();
}
} catch (SQLException e) {
e.printStackTrace();
}
try {
if(conn != null) {
conn.close();
}
} catch (SQLException e) {
e.printStackTrace();
}
}
System.out.println(tableName + "數據寫入完畢,共有數據:" + count);
try {
Thread.sleep(100000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}