Writing to HBase from Spark

Method 1: one Put per record

package com.bupt.spark.hbase
//1 table put
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}


object SparkHbaseTablePut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkHbaseTablePut").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val rdd = sc.parallelize(ints, 1)
    // One Put per record: a Connection and Table are opened and closed
    // for every single element, which is simple but expensive.
    rdd.foreach(t => {
      println(t)
      val configuration = HBaseConfiguration.create()
      val connection = ConnectionFactory.createConnection(configuration)
      val table = connection.getTable(TableName.valueOf("student"))
      val put = new Put(Bytes.toBytes("spark_" + t))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
      table.put(put)
      table.close()
      connection.close()
    })
  }
}
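All four methods write to a "student" table with an "info" column family, so that table has to exist before any of the jobs run. Below is a minimal sketch of creating it with the HBase Admin API; it assumes the HBase 2.x client (TableDescriptorBuilder/ColumnFamilyDescriptorBuilder), and the object name is only for illustration, not part of the original code.

package com.bupt.spark.hbase

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

// Hypothetical helper: creates the "student" table with an "info"
// column family if it does not exist yet.
object CreateStudentTable {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = connection.getAdmin
    val tableName = TableName.valueOf("student")
    if (!admin.tableExists(tableName)) {
      val descriptor = TableDescriptorBuilder.newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
        .build()
      admin.createTable(descriptor)
    }
    admin.close()
    connection.close()
  }
}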

Method 2: batch Puts per partition

package com.bupt.spark.hbase
//2 table batch put
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbaseTableBatchPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkHbaseTableBatchPut").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val rdd = sc.parallelize(ints, 2)
    println(rdd.partitions.length)
    // One Connection per partition; all Puts for the partition are sent in a single batch.
    rdd.foreachPartition(partition => {
      val list = partition.toList
      println(list)
      val puts = new java.util.ArrayList[Put]()
      for (next <- list) {
        val put = new Put(Bytes.toBytes("spark_batch" + next))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
        puts.add(put)
      }
      val configuration = HBaseConfiguration.create()
      val connection = ConnectionFactory.createConnection(configuration)
      val table = connection.getTable(TableName.valueOf("student"))
      table.put(puts)
      table.close()
      connection.close()
    })
  }
}
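A variation on the per-partition batch (not in the code above, only a sketch against the standard HBase client API) is to let a BufferedMutator buffer and flush the writes instead of collecting the Puts into a list. The fragment below would replace the body of the foreachPartition call; the "spark_buffered" row-key prefix is just illustrative.

rdd.foreachPartition(partition => {
  val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
  // BufferedMutator buffers mutations and sends them to the region servers in batches.
  val mutator = connection.getBufferedMutator(TableName.valueOf("student"))
  partition.foreach(next => {
    val put = new Put(Bytes.toBytes("spark_buffered" + next))  // illustrative row-key prefix
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
    mutator.mutate(put)
  })
  mutator.flush()
  mutator.close()
  connection.close()
})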

Method 3: TableOutputFormat with saveAsNewAPIHadoopDataset

package com.bupt.spark.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

object SparkHbaseTableOutPutFormatPut {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkHbaseTableOutPutFormatPut").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val rdd = sc.parallelize(ints, 2)
    // Configure the Hadoop output format so that saveAsNewAPIHadoopDataset
    // writes (ImmutableBytesWritable, Put) pairs into the "student" table.
    val configuration = HBaseConfiguration.create()
    configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
    configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
    configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
    configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)
    val pairs = rdd.map(t => {
      val keyOut = new ImmutableBytesWritable()
      val put = new Put(Bytes.toBytes("spark_tablePut" + t))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(t.toString))
      keyOut.set(put.getRow)
      (keyOut, put)
    })
    pairs.saveAsNewAPIHadoopDataset(configuration)
  }
}
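The three mapreduce.job.* keys set above are the same properties that the Hadoop Job API populates internally. An equivalent sketch using org.apache.hadoop.mapreduce.Job avoids the hard-coded strings; it reuses the pairs RDD from the listing above.

import org.apache.hadoop.mapreduce.Job

val job = Job.getInstance(HBaseConfiguration.create())
job.getConfiguration.set(TableOutputFormat.OUTPUT_TABLE, "student")
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
pairs.saveAsNewAPIHadoopDataset(job.getConfiguration)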

Method 4: mapPartitions with saveAsNewAPIHadoopDataset

package com.bupt.spark.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkHbasePutPartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkHbasePutPartition").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ints = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val rdd = sc.parallelize(ints, 2)
    val configuration = HBaseConfiguration.create()
    configuration.set(TableOutputFormat.OUTPUT_TABLE, "student")
    configuration.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]].getName)
    configuration.set("mapreduce.job.output.key.class", classOf[ImmutableBytesWritable].getName)
    configuration.set("mapreduce.job.output.value.class", classOf[Put].getName)

    println(rdd.getNumPartitions)
    val repartitioned = rdd.repartition(5)
    println(repartitioned.getNumPartitions)

    // Build (ImmutableBytesWritable, Put) pairs per partition. A fresh
    // ImmutableBytesWritable is created for every record; reusing a single
    // mutable instance across the loop would leave every key pointing at the last row.
    val pairs = repartitioned.mapPartitions(partition => {
      val list = partition.toList
      println(list)
      import scala.collection.mutable.ListBuffer
      val buffer = new ListBuffer[(ImmutableBytesWritable, Put)]
      for (next <- list) {
        val put = new Put(Bytes.toBytes("spark_part" + next))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("count"), Bytes.toBytes(next.toString))
        buffer += ((new ImmutableBytesWritable(put.getRow), put))
      }
      buffer.toIterator
    })
    pairs.saveAsNewAPIHadoopDataset(configuration)
  }
}
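To confirm that any of the four methods actually wrote its rows, a simple scan of the "student" table can be used. This is only a sketch of a verification fragment, assuming the same HBase client classes as above plus org.apache.hadoop.hbase.client.Scan and Scala's JavaConverters.

import org.apache.hadoop.hbase.client.Scan
import scala.collection.JavaConverters._

val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
val table = connection.getTable(TableName.valueOf("student"))
val scanner = table.getScanner(new Scan())
// Print every row key together with its info:count value.
for (result <- scanner.asScala) {
  val row = Bytes.toString(result.getRow)
  val count = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("count")))
  println(s"$row -> $count")
}
scanner.close()
table.close()
connection.close()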
