Reading data from HBase
from pyspark import SparkContext, SparkConf
# Spark master address; 'local' runs the job in a single local process
conf = SparkConf().setMaster('local').setAppName('ReadHbase')
sc = SparkContext(conf=conf)
# HBase connection parameters: the ZooKeeper quorum and the table to scan
host = 'localhost'
table = 'student'
conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
# Key converter: turns each ImmutableBytesWritable row key into a Python string
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
# Value converter: turns each HBase Result into a string
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
# PySpark currently reads HBase only through the SparkContext API, so the sc created above is used here
hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                               "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                               "org.apache.hadoop.hbase.client.Result",
                               keyConverter=keyConv,
                               valueConverter=valueConv,
                               conf=conf)
# To read HBase through a SparkSession instead, go through its sparkContext attribute:
'''
hbase_rdd = spark_session.sparkContext.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                                       "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                                       "org.apache.hadoop.hbase.client.Result",
                                                       keyConverter=keyConv,
                                                       valueConverter=valueConv,
                                                       conf=conf)
'''
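# The spark_session used above is not created anywhere in this example; a minimal sketch
# (assuming SparkSession is available, i.e. Spark 2.x or later) would be:
'''
from pyspark.sql import SparkSession
spark_session = SparkSession.builder \
    .master('local') \
    .appName('ReadHbase') \
    .getOrCreate()
'''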
# Count the number of rows (row keys) in the table
count = hbase_rdd.count()
# Cache the RDD so later actions do not rescan HBase
hbase_rdd.cache()
# collect() gathers all (row key, value) pairs into a list on the driver
out_put = hbase_rdd.collect()
# Print every record as a (row key, value) pair
for (k, v) in out_put:
    print(k, v)
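With the HBaseResultToStringConverter bundled in the Spark 2.x examples jar, each value returned above is a string holding one JSON object per cell, separated by newlines. A minimal sketch of parsing that format (assuming this converter version; the field names shown come from it):

import json

# Split each value string into its cells (one JSON object per line) and parse each cell into a dict
parsed_rdd = hbase_rdd.flatMapValues(lambda v: v.split("\n")) \
                      .mapValues(json.loads)

for row_key, cell in parsed_rdd.collect():
    # Typical fields produced by HBaseResultToStringConverter:
    # row, columnFamily, qualifier, timestamp, type, value
    print(row_key, cell.get("qualifier"), cell.get("value"))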
Writing data to HBase
from pyspark import SparkContext, SparkConf
conf = SparkConf().setMaster('local').setAppName('WriteHbase')
sc = SparkContext(conf=conf)
host = "localhost"
table = "student"
# Key converter: turns the Python string row key into an ImmutableBytesWritable
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
# Value converter: turns the [row, family, qualifier, value] string list into an HBase Put
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table,
"mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
"mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
"mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
rawData = ['3,info,name,huangxingxing', '4,info,name,zhangshuaijun']
# Build the RDD first. For x = '3,info,name,huangxingxing', x[0] = '3' is the row key,
# and x.split(",") gives the list ['3', 'info', 'name', 'huangxingxing'].
# .map() therefore turns each string into a (row key, [row, column family, qualifier, value]) pair,
# which the StringListToPutConverter turns into an HBase Put.
# Because the conf above uses the new mapreduce output classes, the write goes through saveAsNewAPIHadoopDataset.
sc.parallelize(rawData).map(lambda x: (x[0], x.split(","))).saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)
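Each element of rawData describes exactly one cell (row key, column family, qualifier, value), so writing several columns of the same row simply means emitting several such strings with the same row key. A small sketch under the same configuration as above (the row key, column names and values here are only illustrative); note that x[0] works above only because the example row keys are single characters, so x.split(",")[0] is used instead:

# One string per cell: two columns ('name' and 'gender') for row key '10'
multiColData = ['10,info,name,wangwu', '10,info,gender,M']
sc.parallelize(multiColData) \
  .map(lambda x: (x.split(",")[0], x.split(","))) \
  .saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)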