使用HBase的javaAPI或者使用sqoop將數據寫入或者導入到HBase中,這些方式不是慢就是在導入的過程中佔用Region資源,導致效率低下。
而Bulkload方式通過MR的程序,將數據直接轉換成HBase的最終存儲格式HFile,然後直接load數據到HBase中即可。
bulkload優點:
1.導入過程不佔用Region資源
2.能快速導入海量的數據
3.節省內存
HFile
HBase中每張Table在根目錄(/HBase)下用一個文件夾存儲,Table名爲文件夾名,在Table文件夾下每個Region同樣用一個文件夾存儲,每個Region文件夾下的每個列族也用文件夾存儲,而每個列族下存儲的就是一些HFile文件,HFile就是HBase數據在HDFS下的存儲格式,所以HBase存儲文件最終在HDFS上面的表現形式就是HFile。如果我們可以直接將數據轉換爲HFile格式,那麼HBase就可以直接加載並讀取HFile格式的文件了。
普通讀寫與bulkload方式對比
HBase數據正常讀寫流程
使用bulkload的方式將我們的數據直接生成HFile格式,然後直接加載到HBase的表當中去
需求:將我們hdfs上面的這個路徑/hbase/input/user.txt的數據文件,轉換成HFile格式,然後load到myuser3這張表裏面去
創建myuser3表,並執行如下代碼
create 'myuser3','f1'
package com.czxy.demo04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import javax.management.ImmutableDescriptor;
import java.io.IOException;
/**
 * MapReduce driver that converts the tab-separated text file
 * hdfs://hadoop01:8020/hbase/input/user.txt (rowkey \t name \t age)
 * into HFiles under hdfs://hadoop01:8020/tmp/output, partitioned to
 * match the region boundaries of the pre-created table 'myuser3'.
 * The generated HFiles are bulk-loaded afterwards by {@code LoadDatas}.
 */
public class BulkLoadMR extends Configured implements Tool {

    /**
     * Mapper that turns each input line into a {@link Put} for column
     * family {@code f1} (qualifiers {@code name} and {@code age}),
     * keyed by the row key, as required by {@link HFileOutputFormat2}.
     */
    public static class createHFile extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            // Skip malformed lines instead of failing the task with
            // ArrayIndexOutOfBoundsException (original crashed here).
            if (fields.length < 3) {
                return;
            }
            // Explicit UTF-8 — the no-arg getBytes() depends on the
            // platform default charset.
            byte[] rowkey = fields[0].getBytes(java.nio.charset.StandardCharsets.UTF_8);
            Put put = new Put(rowkey);
            put.addColumn("f1".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                    "name".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                    fields[1].getBytes(java.nio.charset.StandardCharsets.UTF_8));
            put.addColumn("f1".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                    "age".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                    fields[2].getBytes(java.nio.charset.StandardCharsets.UTF_8));
            context.write(new ImmutableBytesWritable(rowkey), put);
        }
    }

    /**
     * Configures and runs the HFile-generation job.
     *
     * @return 0 on job success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Reuse the configuration injected by ToolRunner instead of
        // discarding it (the original created a fresh Configuration,
        // defeating the Configured/Tool pattern and -D overrides).
        Configuration conf = getConf();
        conf.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
        TableName tableName = TableName.valueOf("myuser3");
        // try-with-resources: the original leaked the Connection, Table
        // and RegionLocator.
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(tableName);
             RegionLocator regionLocator = connection.getRegionLocator(tableName)) {
            Job job = Job.getInstance(conf, "BulkLoad");
            job.setJarByClass(BulkLoadMR.class);
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,
                    new Path("hdfs://hadoop01:8020/hbase/input/user.txt"));
            job.setMapperClass(createHFile.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(Put.class);
            // configureIncrementalLoad sets the output format and a
            // total-order partitioner matching the table's current
            // region boundaries, so the explicit setOutputFormatClass
            // call in the original was redundant.
            HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
            HFileOutputFormat2.setOutputPath(job, new Path("hdfs://hadoop01:8020/tmp/output"));
            return job.waitForCompletion(true) ? 0 : 1;
        }
    }

    public static void main(String[] args) throws Exception {
        // Propagate the job's exit status to the shell — the original
        // discarded ToolRunner's return value, so failures looked like
        // success to callers.
        System.exit(ToolRunner.run(new Configuration(), new BulkLoadMR(), args));
    }
}
運行結果:生成了HFile文件
[root@hadoop01 ~]# hadoop fs -ls /tmp/output/f1
Found 1 items
-rw-r--r-- 3 Administrator supergroup 1308 2019-12-19 16:52 /tmp/output/f1/b7be1f99900842ee88090bb30cec8175
第四步:開發代碼,加載數據
將我們的輸出路徑下面的HFile文件,加載到我們的hbase表當中去
package com.czxy.demo04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
public class LoadDatas {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("hbase.zookeeper.property.clientPort","2181");
conf.set("hbase.zookeeper.quorum","hadoop01,hadoop02,hadoop03");
Connection connection = ConnectionFactory.createConnection(conf);
Admin admin = connection.getAdmin();
Table myuser3 = connection.getTable(TableName.valueOf("myuser3"));
LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
load.doBulkLoad(new Path("hdfs://hadoop01:8020/tmp/output"),admin,myuser3,connection.getRegionLocator(TableName.valueOf("myuser3")));
}
}
運行前:
hbase(main):001:0> scan 'myuser3'
ROW COLUMN+CELL
0 row(s) in 0.4670 seconds
運行後:
hbase(main):002:0> scan 'myuser3'
ROW COLUMN+CELL
0007 column=f1:age, timestamp=1576745579632, value=18
0007 column=f1:name, timestamp=1576745579632, value=zhangsan
0008 column=f1:age, timestamp=1576745579632, value=25
0008 column=f1:name, timestamp=1576745579632, value=lisi
0009 column=f1:age, timestamp=1576745579632, value=20
0009 column=f1:name, timestamp=1576745579632, value=wangwu
3 row(s) in 0.1310 seconds