1. Integrating HBase with MapReduce
Requirement: read the name and age columns of the f1 column family from the myuser table and write them into the f1 column family of the myuser2 table.
ImmutableBytesWritable: a serializable wrapper around HBase's byte[] storage type.
NullWritable: a writable placeholder that carries no data.
The context object acts as a bridge: it passes the data produced by the map phase on to the reduce phase.
(1) Add the HBase-MapReduce integration dependencies to the existing Maven project (e.g. the hbase-client and hbase-mapreduce artifacts).
(2) Code implementation (run locally)
1. Create the Mapper class: read the name and age columns from the myuser table and write them to the context.
package cn.it.hbase.demo;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.util.List;
/**
 * Reads the data in the myuser table.
 * If a mapper class needs to read HBase table data, it must extend TableMapper.
 */
public class HBaseSourceMapper extends TableMapper<Text, Put> {
/**
 * @param key the rowkey, wrapped in the serializable ImmutableBytesWritable type
 * @param value a Result object wrapping a single row of data
 * @param context the context object
 * @throws IOException
 * @throws InterruptedException
 *
 * Requirement: read the name and age columns under the f1 column family of the myuser table
 */
@Override
protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
//get the underlying byte array of the rowkey
byte[] bytes = key.get();
String rowkey = Bytes.toString(bytes);
Put put = new Put(bytes);
//get all of the cells in this row
List<Cell> cells = value.listCells();
for (Cell cell : cells) {
//get the cell's column family
byte[] familyBytes = CellUtil.cloneFamily(cell);
//get the cell's column qualifier
byte[] qualifierBytes = CellUtil.cloneQualifier(cell);
//keep only the name and age columns under the f1 column family
if (Bytes.toString(familyBytes).equals("f1") && (Bytes.toString(qualifierBytes).equals("name") || Bytes.toString(qualifierBytes).equals("age"))){
put.add(cell);
}
}
//write the data out
if (!put.isEmpty()){
//k2,v2
context.write(new Text(rowkey),put);
}
}
}
2. Create the Reducer class
package cn.it.hbase.demo;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.Text;
import java.io.IOException;
/**
 * Writes the data into the myuser2 table.
 */
public class HBaseSinkReducer extends TableReducer<Text, Put, ImmutableBytesWritable> {
@Override
protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
for (Put put : values) {
//k3,v3
context.write(new ImmutableBytesWritable(key.toString().getBytes()),put);
}
}
}
3. Create the program entry point (driver class)
package cn.it.hbase.demo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HBaseMain extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Job job = Job.getInstance(super.getConf(), "hbaseMR");
//when running from a packaged jar, the class containing the main method must be set
job.setJarByClass(HBaseMain.class);
Scan scan = new Scan();
//define the mapper class and the reducer class
/*
String table,
Scan scan,
Class<? extends TableMapper> mapper,
Class<?> outputKeyClass,
Class<?> outputValueClass, Job job,
boolean addDependencyJars
*/
//use the utility class to initialize the mapper
TableMapReduceUtil.initTableMapperJob("myuser",
scan,
HBaseSourceMapper.class,
Text.class,
Put.class,
job,
false);
//use the utility class to initialize the reducer
TableMapReduceUtil.initTableReducerJob("myuser2",
HBaseSinkReducer.class,
job);
boolean b = job.waitForCompletion(true);
return b ? 0 : 1;
}
//program entry point
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new HBaseMain(),args);
System.exit(run);
}
}
(3) Run on the cluster
1. Add a packaging plugin to the Maven build
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<minimizeJar>true</minimizeJar>
</configuration>
</execution>
</executions>
</plugin>
2. Package the jar and upload it to the cluster
Run the uploaded jar:
yarn jar My-hbase-1.0-SNAPSHOT.jar cn.it.hbase.demo.HBaseMain
2. Reading data from HDFS and writing it into HBase
(1) Prepare a data file (tab-separated fields: rowkey, name, age) and upload it to HDFS
cd /export/servers
vim user.txt
0007	zhangsan	18
0008	lisi	25
0009	wangwu	20
hdfs dfs -mkdir -p /hbase/input
hdfs dfs -put user.txt /hbase/input
(2) Develop an MR program that reads from HDFS and writes into HBase
1. Create the Mapper class
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * This mapper reads the file on HDFS and processes it.
 */
public class HDFSMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//no processing after reading; pass the line straight through to the reducer
context.write(value,NullWritable.get());
}
}
2. Create the Reducer class
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
/**
 * Input line format: rowkey \t name \t age, e.g. 0007	zhangsan	18
 */
public class HDFSReduce extends TableReducer<Text, NullWritable, ImmutableBytesWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
String[] split = key.toString().split("\t");
Put put = new Put(split[0].getBytes());//Put keyed by the rowkey
//on this Put, set the column family and the columns under it
put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
put.addColumn("f1".getBytes(),"age".getBytes(),split[2].getBytes());
context.write(new ImmutableBytesWritable(split[0].getBytes()),put);
}
}
3. Create the program entry point (driver class)
package cn.it.yuge.hdfsToHbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(),"Hdfs2HBase");
job.setJarByClass(JobMain.class);
//the standard eight MR steps
//1. specify the input format class and the input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
//2. specify the mapper class and the k2,v2 output types of the map phase
job.setMapperClass(HDFSMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
//partitioning, sorting, combining, grouping: the shuffle phase (defaults are used here)
//set the reducer class
TableMapReduceUtil.initTableReducerJob("myuser2",HDFSReduce.class,job);
boolean b = job.waitForCompletion(true);
return b ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new JobMain(), args);
System.exit(run);
}
}
After the job completes successfully, the rows from user.txt should have been written into the myuser2 table; a quick verification sketch follows below.
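A minimal sketch for checking the result (not part of the tutorial code; the class name VerifyMyuser2 is made up here, and the quorum and column names follow the examples above):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyMyuser2 {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        // scan myuser2 and print the f1:name and f1:age columns written by the job
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("myuser2"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                String rowkey = Bytes.toString(result.getRow());
                String name = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("name")));
                String age = Bytes.toString(result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("age")));
                System.out.println(rowkey + "\t" + name + "\t" + age);
            }
        }
    }
}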
4. Summary
- To read data from an HBase table, the mapper class must extend TableMapper.
- To read text file data from HDFS, the mapper class extends the plain Mapper.
- To save the data produced by the reduce phase into HBase, the reducer class must extend TableReducer.
A sketch of the three signatures follows below.
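To make the three rules concrete, here is a minimal sketch of the signatures used in the examples above (the class names are illustrative only):
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Reading from HBase: the input key/value are fixed to ImmutableBytesWritable/Result,
// so TableMapper only declares the mapper's output key/value types.
class ReadFromHBaseMapper extends TableMapper<Text, Put> { }

// Reading a text file from HDFS: all four key/value types are declared on the plain Mapper.
class ReadFromHdfsMapper extends Mapper<LongWritable, Text, Text, NullWritable> { }

// Writing to HBase: the output value is fixed to a Mutation (Put/Delete),
// so TableReducer declares the input key/value types and the output key type.
class WriteToHBaseReducer extends TableReducer<Text, Put, ImmutableBytesWritable> { }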
5. Bulk-loading data into HBase with BulkLoad
BulkLoad uses MapReduce to write the data out directly as HFiles and then loads those files into the table, bypassing the normal write path (WAL and MemStore), which makes large imports faster and puts less pressure on the RegionServers.
(1) Create the Mapper class
package cn.it.cn.it.BulkLoad;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class HDFSMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
Put put = new Put(split[0].getBytes());
put.addColumn("f1".getBytes(),"name".getBytes(),split[2].getBytes());
put.addColumn("f1".getBytes(),"age".getBytes(),split[2].getBytes());
ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable(split[0].getBytes());
context.write(immutableBytesWritable,put);
}
}
(2) Create the driver class
package cn.it.cn.it.BulkLoad;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class BulkLoadMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(),"BulkLoad");
//set the input format class and the input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
Connection connection = ConnectionFactory.createConnection(super.getConf());
Table table = connection.getTable(TableName.valueOf("myuser2"));
//set the mapper class
job.setMapperClass(HDFSMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
//output as HFiles instead of writing through the normal HBase write path
//configure the job for an incremental (bulk) load into the myuser2 table
HFileOutputFormat2.configureIncrementalLoad(job,table,connection.getRegionLocator(TableName.valueOf("myuser2")));
job.setOutputFormatClass(HFileOutputFormat2.class);
HFileOutputFormat2.setOutputPath(job,new Path("hdfs://node01:8020/hbase/hfile_out"));
//wait for the job to finish
boolean b = job.waitForCompletion(true);
return b ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
int run = ToolRunner.run(configuration, new BulkLoadMain(),args);
System.exit(run);
}
}
(3) Load the HFiles under the output path into the HBase table (run this after the BulkLoad job above has produced the HFiles)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;
//on HBase 1.x this class lives in org.apache.hadoop.hbase.mapreduce instead
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
public class LoadData {
public static void main(String[] args) throws Exception {
Configuration configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.property.clientPort", "2181");
configuration.set("hbase.zookeeper.quorum", "node01,node02,node03");
Connection connection = ConnectionFactory.createConnection(configuration);
Admin admin = connection.getAdmin();
Table table = connection.getTable(TableName.valueOf("myuser2"));
LoadIncrementalHFiles load = new LoadIncrementalHFiles(configuration);
//the path must match the HFile output directory written by BulkLoadMain above
load.doBulkLoad(new Path("hdfs://node01:8020/hbase/hfile_out"), admin,table,connection.getRegionLocator(TableName.valueOf("myuser2")));
}
}
3. Integrating Hive with HBase
…
4. HBase pre-splitting (pre-partitioning)
(1) Use cases
- When a region's data reaches the 10 GB threshold, the region (and its HFiles) is split in two, which reduces the size of any single HFile. But however the region splits, the resulting regions stay on the same single node (e.g. node01), so the data-skew problem is not solved: that node still holds just as much data.
- Pre-splitting therefore mainly addresses HBase hot-spotting and the single-region point of failure.
Beyond that, it also:
- improves read/write throughput
- balances the load and prevents data skew
- makes it easier to schedule regions across the cluster for disaster recovery
- optimizes the number of map tasks
(2) How do you pre-split?
Define the split rules up front when the table is created.
(3) How do you specify the pre-splits?
1. Manually specify the split points
create 'staff','info','partition1',SPLITS => ['1000','2000','3000','4000']
2. Generate the split points with the hexadecimal algorithm
create 'staff2','info','partition2',{NUMREGIONS => 15, SPLITALGO => 'HexStringSplit'}
NUMREGIONS: how many regions to create
SPLITALGO: which algorithm to use to compute the split points
HexStringSplit: split on hexadecimal string boundaries
3. Create pre-splits with the Java API
/**
 * Create an HBase table with pre-split regions through the Java API
 */
@Test
public void hbaseSplit() throws IOException {
//get a connection
Configuration configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
Connection connection = ConnectionFactory.createConnection(configuration);
Admin admin = connection.getAdmin();
//custom split points: a set of keys held in a two-dimensional byte array
byte[][] splitKeys = {{1,2,3,4,5},{'a','b','c','d','e'}};
//use HTableDescriptor to set the table parameters: table name, column families, and so on
HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf("staff3"));
//add a column family
hTableDescriptor.addFamily(new HColumnDescriptor("f1"));
//add another column family
hTableDescriptor.addFamily(new HColumnDescriptor("f2"));
admin.createTable(hTableDescriptor,splitKeys);
admin.close();
}
5. HBase rowkey design tips
- Keep the rowkey reasonably short.
- Spread rowkeys evenly, so that data lands across different regions instead of piling into a single region and creating a hot spot. For example, generate a few random digits for the leading positions of the rowkey to spread the load evenly.
5.1 Avoiding rowkey hot-spotting (see the sketch after this list):
- Salting: prepend a random prefix to the rowkey so that it starts differently from existing rowkeys. The number of distinct prefixes should match the number of regions you want the data spread across; salted rowkeys are then distributed across the regions according to their prefix, avoiding hot spots.
- Hashing: use a hash (e.g. the hashCode or an MD5 digest) of the rowkey.
- Reversing: reverse the rowkey so that the frequently changing part comes first.
- Reversed timestamp: append something like Long.MAX_VALUE - timestamp so that the newest records sort first.
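A minimal sketch of the salting, hashing, and reversing ideas (the helper names and the prefix scheme are illustrative, not part of the original notes; real salting often uses a purely random prefix):
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;

public class RowkeyDesign {
    // Salting: prepend one of N prefixes so rowkeys spread across N region ranges.
    // A hash of the original key is used here so the same key always gets the same prefix.
    public static String salt(String originalKey, int regionCount) {
        int prefix = Math.abs(originalKey.hashCode()) % regionCount;
        return String.format("%02d_%s", prefix, originalKey);
    }

    // Hashing: use a digest of the key as the rowkey; distribution is even,
    // but range scans on the original value are no longer possible.
    public static String hash(String originalKey) {
        return MD5Hash.getMD5AsHex(Bytes.toBytes(originalKey));
    }

    // Reversing: reverse the key so the frequently changing part comes first,
    // e.g. for phone numbers that share a common prefix.
    public static String reverse(String originalKey) {
        return new StringBuilder(originalKey).reverse().toString();
    }

    public static void main(String[] args) {
        System.out.println(salt("13800000001", 10));   // e.g. 03_13800000001
        System.out.println(hash("13800000001"));       // 32-character hex digest
        System.out.println(reverse("13800000001"));    // 10000000831
    }
}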
6. HBase coprocessors
The idea: write a piece of code, package it into a jar, and attach it to a particular table.
(1) Use a coprocessor so that, before data is inserted into the proc1 table, a copy is also saved into the proc2 table
1. In HBase, create the first table proc1 and the second table proc2
create 'proc1','info'
create 'proc2','info'
2. The coprocessor class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
public class MyProcessor implements RegionObserver,RegionCoprocessor {
static Connection connection = null;
static Table table = null;
//use a static block to create the connection object
static{
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum","node01:2181");
try {
connection = ConnectionFactory.createConnection(conf);
table = connection.getTable(TableName.valueOf("proc2"));
} catch (Exception e) {
e.printStackTrace();
}
}
private RegionCoprocessorEnvironment env = null;
//the column family name
private static final String FAMAILLY_NAME = "info";
//the column name
private static final String QUALIFIER_NAME = "name";
//this method is required in HBase 2.0+, otherwise the observer hooks are never invoked
@Override
public Optional<RegionObserver> getRegionObserver() {
// Extremely important to be sure that the coprocessor is invoked as a RegionObserver
return Optional.of(this);
}
//initialize the coprocessor environment
@Override
public void start(CoprocessorEnvironment e) throws IOException {
env = (RegionCoprocessorEnvironment) e;
}
//stop the environment; usually nothing needs to be done here
@Override
public void stop(CoprocessorEnvironment e) throws IOException {
// nothing to do here
}
/**
 * Override prePut to intercept the data before it is inserted.
 * @param e
 * @param put the Put object wrapping the data about to be inserted into the target table
 * @param edit
 * @param durability
 * @throws IOException
 */
@Override
public void prePut(final ObserverContext<RegionCoprocessorEnvironment> e,
final Put put, final WALEdit edit, final Durability durability)
throws IOException {
try {
//get the rowkey of the inserted data from the Put object
byte[] rowBytes = put.getRow();
String rowkey = Bytes.toString(rowBytes);
//get the value of the name column of the inserted data
List<Cell> list = put.get(Bytes.toBytes(FAMAILLY_NAME), Bytes.toBytes(QUALIFIER_NAME));
//if the info column family / name column is not present, simply return
if (list == null || list.size() == 0) {
return;
}
//get the cell for the info column family, name column
Cell cell2 = list.get(0);
//get the value from the cell
String nameValue = Bytes.toString(CellUtil.cloneValue(cell2));
//create a Put and insert the data into the proc2 table
Put put2 = new Put(rowkey.getBytes());
put2.addColumn(Bytes.toBytes(FAMAILLY_NAME), Bytes.toBytes(QUALIFIER_NAME), nameValue.getBytes());
table.put(put2);
//the table is a shared static instance that is reused across calls, so it is not closed here
} catch (Exception e1) {
return ;
}
}
}
3. Package the jar and upload it to HDFS
hdfs dfs -mkdir -p /processor
hdfs dfs -put processor.jar /processor
4. Attach the packaged jar to the proc1 table
alter 'proc1',METHOD => 'table_att','Coprocessor'=>'hdfs://node01:8020/processor/processor.jar|cn.itcast.hbasemr.demo4.MyProcessor|1001|'
table_att: attach a table attribute
Coprocessor: the coprocessor attribute
hdfs://node01:8020/processor/processor.jar: the path of the jar on HDFS
cn.itcast.hbasemr.demo4.MyProcessor: the fully qualified class name of the coprocessor
1001: a sequence/priority number, mainly used to distinguish this coprocessor from others
The same attachment can also be done through the Java API; see the sketch below.
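For reference, a sketch of doing the attachment through the Java API instead of the shell (this class is not part of the original notes; it assumes the HTableDescriptor-style API already used in the pre-split example, which newer HBase versions replace with TableDescriptorBuilder.setCoprocessor):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class AttachProcessor {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("proc1");
            HTableDescriptor descriptor = admin.getTableDescriptor(tableName);
            // coprocessor class name, jar path on HDFS, priority, extra arguments
            descriptor.addCoprocessor("cn.itcast.hbasemr.demo4.MyProcessor",
                    new Path("hdfs://node01:8020/processor/processor.jar"),
                    Coprocessor.PRIORITY_USER, null);
            // apply the new descriptor to the table
            admin.disableTable(tableName);
            admin.modifyTable(tableName, descriptor);
            admin.enableTable(tableName);
        }
    }
}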
5. Insert data into the proc1 table
In the hbase shell, run the following commands to add data to proc1:
put 'proc1','0001','info:name','zhangsan'
put 'proc1','0001','info:age','28'
put 'proc1','0002','info:name','lisi'
put 'proc1','0002','info:age','25'
scan 'proc2'
You will find that the data also shows up in proc2, but only the info column family, name column.
Note: to unload the coprocessor, go into the hbase shell and run the following commands:
disable 'proc1'
alter 'proc1',METHOD=>'table_att_unset',NAME=>'coprocessor$1'
enable 'proc1'
(2) Coprocessors fall into two broad categories
- observer: intercepts operations before or after they execute (like the prePut hook above)
- endpoint: runs server-side computations, e.g. to compute a max, min, or average
7. Secondary indexes
The primary index is the rowkey, so without a secondary index queries go through it in one of three ways (a sketch follows below):
1. get
2. scan with startRow and stopRow
3. full-table scan
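A minimal Java sketch of these three access paths (assuming the myuser table from the earlier examples; withStartRow/withStopRow requires HBase 1.4+ or 2.x):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class PrimaryIndexAccess {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("myuser"))) {
            // 1. get: look up a single row by its exact rowkey
            Result one = table.get(new Get(Bytes.toBytes("0001")));
            System.out.println("get: " + one);

            // 2. scan with startRow/stopRow: a range query on the rowkey
            Scan range = new Scan().withStartRow(Bytes.toBytes("0003")).withStopRow(Bytes.toBytes("0006"));
            try (ResultScanner rangeResults = table.getScanner(range)) {
                for (Result r : rangeResults) {
                    System.out.println("range: " + Bytes.toString(r.getRow()));
                }
            }

            // 3. full-table scan: no rowkey restriction, the most expensive option
            try (ResultScanner all = table.getScanner(new Scan())) {
                for (Result r : all) {
                    System.out.println("full: " + Bytes.toString(r.getRow()));
                }
            }
        }
    }
}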