Reading and writing HBase with Python

Reading data from HBase

from pyspark import SparkContext, SparkConf

# Spark master (local mode here; point this at the cluster address for a real deployment)
conf = SparkConf().setMaster('local').setAppName('ReadHbase')
sc = SparkContext(conf=conf)

# HBase connection parameters
host = 'localhost'
table = 'student'
conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}

# Key converter: turns the HBase row key (ImmutableBytesWritable) into a string
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
# Value converter: turns the HBase Result into a string
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"

# PySpark currently reads HBase through the SparkContext (newAPIHadoopRDD), so the sc created above is used here
hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                "org.apache.hadoop.hbase.client.Result",
                                keyConverter=keyConv,
                                valueConverter=valueConv,
                                conf=conf)


# To connect to HBase through a SparkSession object instead, use its sparkContext attribute:
'''
hbase_rdd = spark_session.sparkContext.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                                        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                                        "org.apache.hadoop.hbase.client.Result",
                                                        keyConverter=keyConv,
                                                        valueConverter=valueConv,
                                                        conf=conf)
'''
# Cache the RDD in memory so HBase is scanned only once across the actions below
hbase_rdd.cache()
# Count the number of rows (row keys) in the table
count = hbase_rdd.count()
# collect() brings all (row key, value) pairs back to the driver as a list
out_put = hbase_rdd.collect()
# Print every row key together with its cell data
for (k, v) in out_put:
    print(k, v)
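
Each value returned by HBaseResultToStringConverter is a set of JSON strings, one per cell, joined by newlines. A minimal sketch, assuming that output format, which parses the cells into Python dicts:

import json

# Split multi-cell values on "\n" and parse each cell's JSON string into a dict
cells_rdd = hbase_rdd.flatMapValues(lambda v: v.split("\n")).mapValues(json.loads)
for (row_key, cell) in cells_rdd.collect():
    print(row_key, cell["columnFamily"], cell["qualifier"], cell["value"])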

Writing data to HBase

from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster('local').setAppName('WriteHbase')
sc = SparkContext(conf=conf)
host = "localhost"
table = "student"
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table,
        "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
        "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}

rawData = ['3,info,name,huangxingxing', '4,info,name,zhangshuaijun']
# First build an RDD. For x = '3,info,name,huangxingxing', x[0] = '3' is the row key
# (this works here because the row keys are single characters), and x.split(",") gives
# the list ['3', 'info', 'name', 'huangxingxing'].
# .map() therefore turns each string into a (row key, [row, family, qualifier, value]) pair,
# which StringListToPutConverter converts into an HBase Put.
sc.parallelize(rawData).map(lambda x: (x[0], x.split(","))).saveAsHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)
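
Since StringListToPutConverter expects the value to be a four-element list of strings, [row key, column family, qualifier, value], the same write also works from structured Python data. A small sketch under that assumption (the example rows are made up; the conf, keyConv and valueConv from above are reused):

students = [("5", "info", "name", "wangwei"), ("6", "info", "name", "lihua")]

# Build (row key, [row, family, qualifier, value]) pairs directly from tuples
write_rdd = sc.parallelize(students).map(lambda t: (t[0], list(t)))
write_rdd.saveAsHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)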