在運行map和reduce任務時會需要從hdfs中讀取數據,從linux磁盤中讀取數據,這些數據往往存在於不同的節點上,這樣就會產生網絡IO消耗。Hadoop提供了org.apache.hadoop.io.Writable接口來實現序列化,相比於java的序列化,hadoop的序列化消耗資源更少。
writable接口中存在兩個方法
void write(DataOutput out) throws IOException;
void readFields(DataInput in) throws IOException;
write方法作用是序列化數據到DataOutput中,readFields方法的作用是從輸入流中反序列化數據,需要我們手動來完成。Java則不用,只需要實現java.io.Serializable接口。下面就是一個hadoop的序列化例子。
package mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import org.apache.hadoop.io.Writable;
import org.junit.Test;
public class TestSerExample {
    /**
     * Serializes a {@code peopleJava} instance using plain Java serialization
     * ({@link ObjectOutputStream}): the class only needs to implement
     * {@code Serializable}, the JDK handles the field encoding.
     */
    @Test
    public void testJava() throws Exception {
        peopleJava p1 = new peopleJava(1L, "張三");
        // try-with-resources closes both streams in reverse order even if
        // writeObject throws; the original leaked them on exception.
        try (FileOutputStream fs = new FileOutputStream(new File("G:/BigData/java1"));
                ObjectOutputStream os = new ObjectOutputStream(fs)) {
            os.writeObject(p1);
        }
    }

    /**
     * Serializes a {@code peopleHadoop} instance through the Hadoop
     * {@code Writable.write(DataOutput)} contract.
     */
    @Test
    public void testHadoop() throws Exception {
        peopleHadoop p1 = new peopleHadoop(1L, "張三");
        // Original code closed the underlying FileOutputStream BEFORE the
        // wrapping DataOutputStream, which can lose buffered data and writes
        // to an already-closed stream; try-with-resources closes them in the
        // correct (reverse) order.
        try (FileOutputStream fs = new FileOutputStream(new File("G:/BigData/hadoop1"));
                DataOutputStream dos = new DataOutputStream(fs)) {
            p1.write(dos);
        }
    }
}
/**
* java 序列化
* @author think
*
*/
class peopleJava implements java.io.Serializable
{
private Long id;
private String name;
public peopleJava(Long id, String name) {
super();
this.id = id;
this.name = name;
}
}
/**
 * Hadoop serialization example: a {@code Writable} must itself write and read
 * every field, in the same order, via {@link #write(DataOutput)} and
 * {@link #readFields(DataInput)}.
 *
 * @author think
 */
class peopleHadoop implements Writable
{
    private Long id;
    private String name;

    /**
     * No-arg constructor: the Hadoop framework instantiates Writables via
     * reflection and then populates them with {@link #readFields(DataInput)};
     * without this constructor deserialization fails at runtime.
     */
    public peopleHadoop() {
    }

    /**
     * @param id   numeric identifier to serialize
     * @param name display name to serialize
     */
    public peopleHadoop(Long id, String name) {
        super();
        this.id = id;
        this.name = name;
    }

    /**
     * Serializes this object's fields to {@code out}: id first, then name.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);
        out.writeUTF(name);
    }

    /**
     * Deserializes the fields from {@code in}, in exactly the order written by
     * {@link #write(DataOutput)}. The original left this method empty, so any
     * object read back would have null/default fields.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readLong();
        this.name = in.readUTF();
    }
}