背景
在大數據領域,經常會遇到需要將各種數據類型序列化成字節數組,或者從字節數組反序列化回常用數據類型的場景。比如,Spark中推薦使用Kryo;HBase中使用HBase提供的工具來進行序列化以及反序列化,而HBase內部則使用Google的protobuf來序列化以進行網絡通訊;Hadoop則使用Apache avro來序列化。當然,各種序列化方式在性能和效率上各有優缺點(本文不做對比)。
以上序列化都是框架內部已經給我們做好了序列化以及反序列化操作。如果我們在實際工作中遇到需要自己手動序列化的場景,比如,將Java中的一個對象序列化成一個字節數組,然後存儲到HBase中,或者從HBase中讀取出對應的字節數組反序列化成一個對象,面對這樣的需求,我們就可以借助avro來幫我們處理。
下面,給出Apache avro對各種類型的序列化以及反序列化操作。
1. Integer
@Test
public void serdesIntType() throws IOException {
    // Schema describing a bare Avro int.
    Schema schema = SchemaBuilder.builder().intType();
    SpecificData model = SpecificData.get();

    // Serialize: Integer -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write(1, encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: byte[] -> Integer.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Object decoded = datumReader.read(null, decoder);

    Assert.assertEquals(1, decoded);
}
2. Long
@Test
public void serdesLongType() throws IOException {
    // Schema describing a bare Avro long.
    Schema schema = SchemaBuilder.builder().longType();
    SpecificData model = SpecificData.get();

    // Serialize: Long -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write(1L, encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: byte[] -> Long.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Object decoded = datumReader.read(null, decoder);

    Assert.assertEquals(1L, decoded);
}
3. Float
@Test
public void serdesFloatType() throws IOException {
    // Schema describing a bare Avro float.
    Schema schema = SchemaBuilder.builder().floatType();
    SpecificData model = SpecificData.get();

    // Serialize: Float -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write(1.0f, encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: byte[] -> Float.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Object decoded = datumReader.read(null, decoder);

    Assert.assertEquals(1.0f, decoded);
}
4. Double
@Test
public void serdesDoubleType() throws IOException {
    // Schema describing a bare Avro double.
    Schema schema = SchemaBuilder.builder().doubleType();
    SpecificData model = SpecificData.get();

    // Serialize: Double -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write(1.0d, encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: byte[] -> Double.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Object decoded = datumReader.read(null, decoder);

    Assert.assertEquals(1.0d, decoded);
}
5. Boolean
@Test
public void serdesBooleanType() throws IOException {
    // Schema describing a bare Avro boolean.
    Schema schema = SchemaBuilder.builder().booleanType();
    SpecificData model = SpecificData.get();

    // Serialize: Boolean -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write(true, encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: byte[] -> Boolean.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Object decoded = datumReader.read(null, decoder);

    Assert.assertEquals(true, decoded);
}
6. String
@Test
public void serdesStringType() throws IOException {
    // Schema describing a bare Avro string.
    Schema schema = SchemaBuilder.builder().stringType();
    SpecificData model = SpecificData.get();

    // Serialize: String -> byte[].
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(sink, null);
    DatumWriter datumWriter = model.createDatumWriter(schema);
    datumWriter.write("hello", encoder);
    byte[] payload = sink.toByteArray();

    // Deserialize: the value is cast to Utf8 here, so it is converted
    // to a String before comparing.
    ByteArrayInputStream source = new ByteArrayInputStream(payload);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(source, null);
    DatumReader datumReader = model.createDatumReader(schema);
    Utf8 decoded = (Utf8) datumReader.read(null, decoder);

    Assert.assertEquals("hello", decoded.toString());
}
7. byte[]
@Test
public void serdesBytesType() throws IOException {
    // Use an explicit charset so the expected bytes do not depend on the
    // platform default encoding.
    byte[] expected = "hello".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    Schema bytesSchema = SchemaBuilder.builder().bytesType();
    SpecificData specificData = SpecificData.get();

    // Serialize: ByteBuffer -> byte[].
    DatumWriter writer = specificData.createDatumWriter(bytesSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(ByteBuffer.wrap(expected), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    // Deserialize: byte[] -> ByteBuffer.
    DatumReader reader = specificData.createDatumReader(bytesSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    ByteBuffer deResult = (ByteBuffer) reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));

    // Copy only the readable region; array() would expose the whole backing
    // array regardless of the buffer's position and limit.
    byte[] actual = new byte[deResult.remaining()];
    deResult.duplicate().get(actual);
    Assert.assertArrayEquals(expected, actual);
}
8. java.sql.Date
注意,apache avro在反序列化date時使用的時區是UTC,且沒法更換。如果我們序列化時使用的時區不是UTC,可能會導致結果不正確,所以需要我們自定義一個轉換類,提供給avro內部轉化時使用。
自定義SqlDateConversion,繼承avro內置的Conversion基類。
/**
 * Avro {@link Conversion} between {@link Date} (java.sql) and the int
 * representation of the {@code date} logical type, i.e. the number of days
 * since 1970-01-01.
 */
public class SqlDateConversion extends Conversion<Date> {

    /** Avro's logical type name for dates; it must be exactly this value. */
    private static final String LOGICAL_TYPE_NAME = "date";

    @Override
    public Class<Date> getConvertedType() {
        return Date.class;
    }

    @Override
    public String getLogicalTypeName() {
        return LOGICAL_TYPE_NAME;
    }

    /**
     * Turns a day count relative to 1970-01-01 into a {@link Date}.
     */
    @Override
    public Date fromInt(Integer value, Schema schema, LogicalType type) {
        return Date.valueOf(LocalDate.ofEpochDay(value));
    }

    /**
     * Turns a {@link Date} into its day count relative to 1970-01-01.
     *
     * @throws ArithmeticException if the day count does not fit in an int
     */
    @Override
    public Integer toInt(Date value, Schema schema, LogicalType type) {
        long epochDay = value.toLocalDate().toEpochDay();
        return Math.toIntExact(epochDay);
    }
}
序列化與反序列化代碼如下:
@Test
public void serdesDateType() throws IOException {
    Date expected = Date.valueOf("2020-01-26");

    // An int schema annotated with the "date" logical type.
    Schema dateSchema = SchemaBuilder.builder().intType();
    LogicalTypes.date().addToSchema(dateSchema);

    // Use a private SpecificData instance: registering the conversion on the
    // shared SpecificData.get() singleton would leak into every other user
    // of that singleton in the same JVM.
    SpecificData specificData = new SpecificData();
    // Let Avro use our custom conversion for the date logical type.
    specificData.addLogicalTypeConversion(new SqlDateConversion());

    // Serialize: java.sql.Date -> byte[].
    DatumWriter writer = specificData.createDatumWriter(dateSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(expected, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    // Deserialize: byte[] -> java.sql.Date.
    DatumReader reader = specificData.createDatumReader(dateSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));

    // JUnit's contract is assertEquals(expected, actual).
    Assert.assertEquals(expected, deResult);
}
9. java.sql.Timestamp
同Date一樣,爲了避免avro內部反序列化時使用的時區與序列化時我們給的時區不一致導致結果不正確,我們需要自定義轉化類。
/**
 * Avro {@link Conversion} between {@link Timestamp} (java.sql) and the long
 * representation of a timestamp logical type, using epoch milliseconds.
 */
public class SqlTimestampConversion extends Conversion<Timestamp> {

    /**
     * Avro offers both "timestamp-millis" and "timestamp-micros" for
     * timestamps; this conversion handles the millisecond variant.
     */
    private static final String LOGICAL_TYPE_NAME = "timestamp-millis";

    @Override
    public Class<Timestamp> getConvertedType() {
        return Timestamp.class;
    }

    @Override
    public String getLogicalTypeName() {
        return LOGICAL_TYPE_NAME;
    }

    /**
     * Builds a {@link Timestamp} from the given epoch-millisecond value.
     */
    @Override
    public Timestamp fromLong(Long value, Schema schema, LogicalType type) {
        long epochMillis = value;
        return new Timestamp(epochMillis);
    }

    /**
     * Returns the epoch-millisecond value of the given {@link Timestamp}.
     */
    @Override
    public Long toLong(Timestamp value, Schema schema, LogicalType type) {
        long epochMillis = value.getTime();
        return epochMillis;
    }
}
序列化與反序列化如下:
@Test
public void serdesTimestampType() throws IOException {
    Timestamp expected = Timestamp.valueOf("2021-01-25 17:39:46");

    // A long schema annotated with the "timestamp-millis" logical type.
    Schema timestampSchema = SchemaBuilder.builder().longType();
    LogicalTypes.timestampMillis().addToSchema(timestampSchema);

    // Use a private SpecificData instance so the custom conversion is not
    // registered on the JVM-wide SpecificData.get() singleton.
    SpecificData specificData = new SpecificData();
    specificData.addLogicalTypeConversion(new SqlTimestampConversion());

    // Serialize: java.sql.Timestamp -> byte[].
    DatumWriter writer = specificData.createDatumWriter(timestampSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(expected, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    // Deserialize: byte[] -> java.sql.Timestamp.
    DatumReader reader = specificData.createDatumReader(timestampSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));

    // JUnit's contract is assertEquals(expected, actual).
    Assert.assertEquals(expected, deResult);
}
10. BigDecimal
@Test
public void serdesDecimalType() throws IOException {
    // Build the decimal from its textual form rather than a double literal;
    // the scale (2) matches the schema's declared scale.
    BigDecimal expected = new BigDecimal("18.67");

    // A bytes schema annotated with decimal(precision = 10, scale = 2).
    Schema decimalSchema = SchemaBuilder.builder().bytesType();
    LogicalTypes.decimal(10, 2).addToSchema(decimalSchema);

    // Use a private SpecificData instance so the conversion is not registered
    // on the JVM-wide SpecificData.get() singleton.
    SpecificData specificData = new SpecificData();
    // A decimal conversion must be registered, otherwise serialization fails
    // with a conversion error.
    specificData.addLogicalTypeConversion(new Conversions.DecimalConversion());

    // Serialize: BigDecimal -> byte[].
    DatumWriter writer = specificData.createDatumWriter(decimalSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(expected, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    // Deserialize: byte[] -> BigDecimal.
    DatumReader reader = specificData.createDatumReader(decimalSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));

    // JUnit's contract is assertEquals(expected, actual).
    Assert.assertEquals(expected, deResult);
}
11. 對於複雜數據類型record、array、map的序列化與反序列化綜合案例
@Test
public void serdesRecordType() throws IOException {
    SchemaBuilder.TypeBuilder<Schema> builder = SchemaBuilder.builder();
    SchemaBuilder.FieldAssembler<Schema> fieldAssembler = builder.record("Record").namespace("com.bugboy")
            .fields();

    // Leaf schemas carrying logical types.
    Schema f1Schema = builder.bytesType();
    LogicalTypes.decimal(10, 2).addToSchema(f1Schema);
    Schema f2Schema = builder.intType();
    LogicalTypes.date().addToSchema(f2Schema);
    Schema f3Schema = builder.longType();
    LogicalTypes.timestampMillis().addToSchema(f3Schema);

    // Nested record: decimal, date and timestamp fields.
    Schema subRecordSchema = builder.record("SubRecord")
            .namespace("com.bugboy")
            .fields()
            .name("f1")
            .type(f1Schema)
            .noDefault()
            .name("f2")
            .type(f2Schema)
            .noDefault()
            .name("f3")
            .type(f3Schema)
            .noDefault()
            .endRecord();

    // array<decimal> and map<string, date>.
    Schema itemsSchema = builder.array().items(f1Schema);
    Schema eleSchema = builder.intType();
    LogicalTypes.date().addToSchema(eleSchema);
    Schema valueSchema = builder.map().values(eleSchema);

    // Build the top-level schema.
    Schema recordSchema = fieldAssembler.name("id")
            .type(builder.stringType())
            .noDefault()
            .name("subRecord")
            .type(subRecordSchema)
            .noDefault()
            .name("array")
            .type(itemsSchema)
            .noDefault()
            .name("map")
            .type(valueSchema)
            .noDefault()
            .endRecord();

    // Build the record.
    GenericData.Record record = new GenericData.Record(recordSchema);
    record.put("id", "0001");

    GenericData.Record subRecord = new GenericData.Record(subRecordSchema);
    subRecord.put("f1", BigDecimal.valueOf(6867.68));
    subRecord.put("f2", Date.valueOf("2021-02-01"));
    subRecord.put("f3", Timestamp.valueOf("2021-02-01 10:47:56"));
    record.put("subRecord", subRecord);

    GenericData.Array<BigDecimal> array = new GenericData.Array<>(itemsSchema,
            Lists.newArrayList(BigDecimal.valueOf(6573.89),
                    BigDecimal.valueOf(2347.56), BigDecimal.valueOf(6543.12)));
    record.put("array", array);

    // Populate the map before attaching it to the record.
    Map<String, Date> map = new HashMap<>();
    map.put("today", Date.valueOf("2021-02-01"));
    map.put("tomorrow", Date.valueOf("2021-02-02"));
    // "yesterday" relative to 2021-02-01 is 2021-01-31.
    map.put("yesterday", Date.valueOf("2021-01-31"));
    record.put("map", map);

    // Register the conversions on a private SpecificData instance that is
    // handed explicitly to both writer and reader, rather than mutating the
    // JVM-wide SpecificData.get() singleton.
    SpecificData specificData = new SpecificData();
    specificData.addLogicalTypeConversion(new SqlDateConversion());
    specificData.addLogicalTypeConversion(new SqlTimestampConversion());
    specificData.addLogicalTypeConversion(new Conversions.DecimalConversion());

    // Serialize the record.
    SpecificDatumWriter<GenericData.Record> writer = new SpecificDatumWriter<>(recordSchema, specificData);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(baos, null);
    writer.write(record, encoder);
    byte[] bytes = baos.toByteArray();

    // Deserialize back into a record.
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(bais, null);
    SpecificDatumReader<Object> reader = new SpecificDatumReader<>(recordSchema, recordSchema, specificData);
    Object read = reader.read(null, decoder);

    // Print the result for inspection.
    System.out.println(read);
}
最後
Apache avro還支持enum、fixed等上面沒有具體給出的類型的序列化,因爲我實際用不到,就沒有給出(手動滑稽)。所以如果有需要的同學,就自己摸索,或者聯繫我一起討論也行。
最後,快過年了,祝大家牛氣沖天。biu biu biu ~ ~ ~