1. Description
HBase can serve random reads and writes over massive amounts of data, but getting that much data into HBase in the first place is a challenge: for example, loading a Hive table into HBase as quickly as possible. There are three solutions:
- Write the data into HBase row by row through the client API (a minimal sketch follows this list).
- Use the HBaseIntegration approach.
- Use HBase's built-in bulk load:
  - generate HFiles with MapReduce;
  - import the HFiles into HBase with LoadIncrementalHFiles.doBulkLoad.
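For contrast, option 1 amounts to row-by-row Puts. A minimal sketch, assuming the same 0.9x-era HBase client API as the code below; the table, row key, and cell values here are made up:

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

// Row-by-row writes through the client API (option 1).
HTable table = new HTable(HBaseConfiguration.create(), "hbase_table"); // hypothetical table
Put put = new Put(Bytes.toBytes("row1"));
// Put.add(family, qualifier, value) is the 0.9x-era call (later renamed addColumn).
put.add(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes("value"));
table.put(put);
table.close();

Every cell written this way goes through the normal write path (WAL, MemStore, flushes), which is why it is by far the slowest route for large datasets. The rest of this post implements option 3. The Driver below generates HFiles with a MapReduce job and then imports them with LoadIncrementalHFiles.doBulkLoad: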
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner;
// Ships with hive-exec; the package may vary across Hive versions.
import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMapReduceInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Driver extends Configured implements Tool {

    private static Configuration conf = new Configuration();
    private static Configuration hconf = null;
    private static HBaseAdmin hadmin = null;

    // Point the client at the HBase cluster's ZooKeeper quorum.
    public static void connectHBase() {
        final String HBASE_CONFIG_ZOOKEEPER_CLIENT = "hbase.zookeeper.property.clientPort";
        final String HBASE_ZOOKEEPER_CLIENT_PORT = "2181";
        final String HBASE_CONFIG_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
        final String HBASE_ZOOKEEPER_SERVER = "hbase38,hbase43,hbase00";
        conf.set(HBASE_CONFIG_ZOOKEEPER_CLIENT, HBASE_ZOOKEEPER_CLIENT_PORT);
        conf.set(HBASE_CONFIG_ZOOKEEPER_QUORUM, HBASE_ZOOKEEPER_SERVER);
        hconf = HBaseConfiguration.create(conf);
        try {
            // Creating the admin verifies that HBase is reachable.
            hadmin = new HBaseAdmin(hconf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 4) {
            System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
            System.exit(1);
        }
        // Build the "schema" string ("col0:col1:...") from the Hive schema file.
        String path = System.getProperty("user.dir") + otherArgs[2];
        List<String> fieldNames = HiveTableUtils.getFieldName(path);
        StringBuilder sb = new StringBuilder(fieldNames.get(0));
        int size = fieldNames.size();
        for (int i = 1; i < size; i++) {
            sb.append(":").append(fieldNames.get(i));
        }
        conf.set("schema", sb.toString());
        if (ToolRunner.run(conf, new Driver(), otherArgs) == 0) {
            // Import the generated HFiles into the HBase table.
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(new Path(otherArgs[1]), new HTable(conf, otherArgs[3]));
            System.exit(0);
        } else {
            System.exit(1);
        }
    }

    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] strings) throws Exception {
        Configuration config = getConf();
        Driver.connectHBase();
        Job job = new Job(config, "RCFile to HFile");
        job.setJarByClass(Driver.class);
        job.setMapperClass(RCFileToHFile.ParseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setNumReduceTasks(0);
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
        // Note: configureIncrementalLoad overrides the reducer, partitioner,
        // and reduce-task count to match the regions of the target table, so
        // the two settings above are effectively replaced here.
        HTable table = new HTable(config, strings[3]);
        HFileOutputFormat.configureIncrementalLoad(job, table);
        RCFileMapReduceInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Alternatively, when the HFiles are generated on the Hive cluster and loaded into a separate HBase cluster, the bulk load splits into three steps:
- generate the HFiles with MapReduce;
- copy the generated HFiles from the Hive cluster to the HBase cluster;
- import the HFiles into HBase with the HBase command line (you can also do this in Java as in the code above, but that is more involved).
Step one, generate the HFiles. The Driver is the same as above except for main(), which now only submits the MapReduce job and leaves the actual load to the HBase command in step three:

public static void main(String[] args) throws Exception {
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 4) {
        System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
        System.exit(1);
    }
    // Build the "schema" string ("col0:col1:...") from the Hive schema file.
    String path = System.getProperty("user.dir") + otherArgs[2];
    List<String> fieldNames = HiveTableUtils.getFieldName(path);
    StringBuilder sb = new StringBuilder(fieldNames.get(0));
    int size = fieldNames.size();
    for (int i = 1; i < size; i++) {
        sb.append(":").append(fieldNames.get(i));
    }
    conf.set("schema", sb.toString());
    // Only generate the HFiles; the load happens later via distcp + CLI.
    System.exit(ToolRunner.run(conf, new Driver(), otherArgs));
}
Step two, copy the HFiles to the HBase cluster with distcp:
# Distributed copy HFile to mycluster-hbase.
hadoop distcp hdfs://mycluster-hive/hfile/hbase hdfs://mycluster-hbase/hbase/test
Step three, bulk load the HFiles:
# BulkLoad HFile into hbase table on mycluster-hbase.
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /hbase/test hbase_table
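Optionally, sanity-check the result from the HBase shell (the rows shown will be whatever your data produced):

# Scan a few rows of the freshly loaded table.
echo "scan 'hbase_table', {LIMIT => 3}" | hbase shell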
2. Generating the HFile
To generate the HFiles we need the Hive table's column names. There are several ways to obtain the table's metadata:
- parse a file that stores the Hive table's schema;
- fetch the metadata from the Hive metastore, either
  - by querying MySQL directly, or
  - by accessing the Hive table through HCatalog.
Personally, though, I find parsing a file more efficient, because our Hive tables typically have several thousand columns.
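For reference, the parser below expects a schema file in which each column sits on its own line, wrapped in backticks and followed by a COMMENT clause, as in saved SHOW CREATE TABLE output. A hypothetical excerpt (table and column names invented):

CREATE TABLE `user_info`(
  `id` string COMMENT 'row key',
  `name` string COMMENT 'user name',
  `age` string COMMENT 'user age')

Lines that do not contain both a backtick and the word COMMENT (such as the CREATE TABLE line) are ignored; on every other line, the text between the first and the last backtick is taken as the column name.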
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

public class HiveTableUtils {

    // Obtain the Hive table's column names by parsing the schema file:
    // every line wrapped in backticks and carrying a COMMENT clause is a column.
    public static List<String> getFieldName(String filePath) {
        File file = new File(filePath);
        List<String> fieldName = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            if (file.exists()) {
                reader = new BufferedReader(new FileReader(file));
                String tmp = null;
                while ((tmp = reader.readLine()) != null) {
                    if (tmp.contains("`") && tmp.contains("COMMENT")) {
                        // The column name sits between the first and last backtick.
                        int start = tmp.indexOf("`");
                        int end = tmp.lastIndexOf("`");
                        fieldName.add(tmp.substring(start + 1, end));
                    }
                }
            } else {
                System.err.println("The file doesn't exist!");
                System.exit(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the reader even if parsing fails part-way.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                }
            }
        }
        return fieldName;
    }
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RCFileToHFile {

    public static class ParseMapper extends Mapper<LongWritable, BytesRefArrayWritable, ImmutableBytesWritable, KeyValue> {

        private String[] fieldName = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // Recover the column names that the Driver packed into "schema".
            Configuration conf = context.getConfiguration();
            String schema = conf.get("schema");
            fieldName = schema.split(":");
        }

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable values, Context context)
                throws IOException, InterruptedException {
            // Decode the RCFile row into a list of string fields.
            Text line = new Text();
            List<String> fields = new ArrayList<String>();
            int size = values.size();
            for (int i = 0; i < size; i++) {
                BytesRefWritable value = values.get(i);
                line.set(value.getData(), value.getStart(), value.getLength());
                fields.add(line.toString());
            }
            // The first column serves as the row key; every other column
            // becomes one cell under the "cf" column family.
            String rowKey = fields.get(0);
            String columnFamily = "cf";
            int length = fieldName.length;
            ImmutableBytesWritable hKey = new ImmutableBytesWritable();
            hKey.set(rowKey.getBytes());
            KeyValue kv = null;
            for (int i = 1; i < length; i++) {
                kv = new KeyValue(hKey.get(), columnFamily.getBytes(), fieldName[i].getBytes(), fields.get(i).getBytes());
                context.write(hKey, kv);
            }
        }
    }
}
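Putting it together, a hypothetical invocation matching the Usage string above (the jar name is invented; the schema file path is resolved relative to the working directory):

# Generate the HFiles and, in the first variant, bulk load them as well.
hadoop jar rcfile-to-hfile.jar Driver /data/rcfile /hfile/hbase /schema.sql hbase_table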
Notes:
- The HBase table you connect to when generating the HFiles must already exist.
- That table does not have to be the table you ultimately bulk load into; the names may differ, but the structures must be identical (e.g., the same number of column families).
- The versions of Hadoop, HBase, and Hive must be compatible with one another.
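On the first note, a minimal sketch for pre-creating the table, assuming an HBase 0.96-era client API like the code above (the table name is hypothetical; "cf" must match what ParseMapper writes):

import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

HBaseAdmin admin = new HBaseAdmin(hconf);
if (!admin.tableExists("hbase_table")) {           // hypothetical table name
    HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("hbase_table"));
    desc.addFamily(new HColumnDescriptor("cf"));   // must match the mapper's column family
    admin.createTable(desc);
}
admin.close();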