一、啓動thrift
因爲用到了happybase,需要先在服務器上啓動thrift服務。若要在關閉終端後讓thrift繼續運行,可使用如下命令:
nohup hbase thrift -p 9090 start
二、讀取hbase的代碼
class GetHbase(object):
    """Read rows from an HBase table through a happybase (Thrift) connection.

    The constructor only stores settings; the connection is opened and
    closed inside getdata().
    """

    def __init__(self, hostname, table_name, start_date):
        """Store the connection and filter settings.

        Args:
            hostname: host name handed to happybase.Connection.
            table_name: name of the HBase table to scan.
            start_date: inclusive lower bound compared against each row's
                'cont:pubDate' value.
        """
        self.hostname = hostname  # host name of the Thrift server
        self.table_name = table_name  # table to scan
        self.start_date = start_date

    def getdata(self):
        """Scan the table and collect the rows published on or after start_date.

        The scan is unrestricted (table.scan() with no arguments); the date
        filter is applied client-side, row by row.

        Returns:
            A list of dicts, one per kept row, each with the keys
            'pubDate', 'author', 'content' and 'Timedict'. The list is
            empty when no row qualifies.

        Raises:
            KeyError: if any scanned row lacks 'cont:pubDate', or a kept
                row lacks 'cont:author'.
        """
        connection = happybase.Connection(self.hostname, autoconnect=False)
        connection.open()
        try:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            # NOTE(review): the messages name "weibo_content" even though
            # the table actually opened is self.table_name.
            print("已成功連接到Hbase")
            print("準備連接到表weibo_content")
            table = connection.table(self.table_name)
            scanner = table.scan()  # iterates over every row of the table
            print("已成功連接到Hbase中表weibo_content")

            mydata = []
            print("開始讀取%s之後的數據" % (self.start_date))
            for _row_key, row in scanner:
                pub_date = row['cont:pubDate']
                # Plain `<` comparison; assumes pubDate and start_date are
                # strings in a format that sorts chronologically -- TODO confirm.
                if pub_date < self.start_date:
                    continue
                # Skip rows that have no 'cont:content' column instead of
                # falling through with a missing or stale value.
                if 'cont:content' not in row:
                    continue
                mydata.append({
                    'pubDate': pub_date,
                    'author': row['cont:author'],
                    'content': row['cont:content'],
                    # Per the original comment, TimeMatch parses the date
                    # into week_num, month_num, etc.; it is defined elsewhere.
                    'Timedict': TimeMatch(pub_date),
                })
            # Return the whole collection, not just the last row's dict.
            return mydata
        finally:
            # Always release the Thrift connection, even if the scan fails.
            connection.close()
三、調用
mydata爲讀出的數據
# Build a reader for the given host, table and start date, then run the scan.
# host, table_name and start_date must already be defined by the caller.
mydata = GetHbase(host, table_name, start_date).getdata()