Spark + HBase: splitting HFiles for bulk load, an optimization for massive data

package cn.jkjf.bigdata.utils.test

import cn.jkjf.bigdata.utils.mysql.Global
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._


/**
  * Created by maokm on 2018/9/20.
  */
object hbasetest {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", s"${this.getClass.getSimpleName}")
    // HDFS path used both as the default filesystem and as the staging directory for the generated HFiles.
    val hdfsRootPath = "hdfs://192.168.152.133:9000/data"
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.defaultFS", hdfsRootPath)
    val conf = HBaseConfiguration.create(hadoopConf)

    conf.set("hbase.zookeeper.quorum", Global.ZKQUORUM)
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val hbaseConn = ConnectionFactory.createConnection(conf)
    val tableName = "testgrace"
    val table = new HTable(conf, tableName)

    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
    job.setMapOutputKeyClass (classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass (classOf[KeyValue])
    val regionLocator = hbaseConn.getRegionLocator(TableName.valueOf(tableName))
    println("regionLocator: " + regionLocator)
    // Configure the job for incremental (bulk) load: the output is partitioned along the table's
    // region boundaries, so each region gets its own HFile(s).
    HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

    // Generate 10 sample records. HFileOutputFormat2 requires cells in ascending rowkey order;
    // every record here uses the same rowkey "11", so the ordering constraint is trivially satisfied.
    val num = sc.parallelize(1 to 10)
    val rdd = num.map(x => {
      println("xxxxxxxxxxxxxxxxxxxxxxxx" + x)
      val kv: KeyValue = new KeyValue(Bytes.toBytes("11"), "cf".getBytes(), "c1".getBytes(), "value_xxx".getBytes())
      println("-----kv------")
      println(kv)
      (new ImmutableBytesWritable(Bytes.toBytes("11")), kv)
    })
    rdd.foreach(println)
    // Write the HFiles to HDFS; the output directory must not exist yet, otherwise FileOutputFormat rejects the job.
    rdd.saveAsNewAPIHadoopFile(hdfsRootPath, classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], job.getConfiguration())
    // Bulk-load the HFiles from the staging directory into the HBase table.
    val bulkLoader = new LoadIncrementalHFiles(conf)
    bulkLoader.doBulkLoad(new Path(hdfsRootPath), table)
    table.flushCommits()
    table.close()
    hbaseConn.close()
  }

}
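
The job above writes every cell with the same rowkey, so the sorted-input requirement of HFileOutputFormat2 is satisfied trivially. For the massive-data case the title refers to, the RDD should be sorted by rowkey and partitioned along the table's region boundaries before the HFiles are written, so that each task produces files falling inside a single region and LoadIncrementalHFiles does not have to split them during the load. Below is a minimal sketch of that preparation step, not part of the original job: BulkLoadPrep, RegionPartitioner, prepareForBulkLoad and sourceRdd are illustrative names, and the region start keys are assumed to come from regionLocator.getStartKeys.

import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

// Serializable so that the inner partitioner and ordering can be shipped to executors.
object BulkLoadPrep extends Serializable {

  // Sends each rowkey to the index of the region whose key range contains it.
  // startKeys is the array returned by regionLocator.getStartKeys (the first region has an empty start key).
  class RegionPartitioner(startKeys: Array[Array[Byte]]) extends Partitioner {
    override def numPartitions: Int = startKeys.length
    override def getPartition(key: Any): Int = {
      val row = key.asInstanceOf[ImmutableBytesWritable].copyBytes()
      val idx = startKeys.lastIndexWhere(start => Bytes.compareTo(row, start) >= 0)
      math.max(idx, 0)
    }
  }

  // One partition per region, rows sorted inside each partition, so every task writes
  // HFiles that fall entirely inside a single region and need no further splitting.
  def prepareForBulkLoad(sourceRdd: RDD[(ImmutableBytesWritable, KeyValue)],
                         startKeys: Array[Array[Byte]]): RDD[(ImmutableBytesWritable, KeyValue)] = {
    implicit val rowKeyOrdering: Ordering[ImmutableBytesWritable] =
      new Ordering[ImmutableBytesWritable] {
        override def compare(a: ImmutableBytesWritable, b: ImmutableBytesWritable): Int = a.compareTo(b)
      }
    sourceRdd.repartitionAndSortWithinPartitions(new RegionPartitioner(startKeys))
  }
}

In the job above, the prepared RDD would take the place of the raw one, i.e. BulkLoadPrep.prepareForBulkLoad(rdd, regionLocator.getStartKeys) would be written with saveAsNewAPIHadoopFile instead of rdd. Pre-splitting the target table into several regions when it is created then spreads both the HFile writing and the subsequent load across the cluster instead of funnelling everything through a single region.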

 
