本文目录一览:
- 1、在ubuntu环境下怎么利用python将数据批量导入数据hbase
- 2、如何在python中访问hbase的数据
- 3、Python访问hbase集群
- 4、python可以把爬虫的数据写入hbase么
- 5、如何在Python中访问HBase的数据
在ubuntu环境下怎么利用python将数据批量导入数据hbase
能够单条导入就能够批量导入
配置 thrift
python使用的包 thrift
个人使用的python 编译器是pycharm community edition. 在工程中设置中,找到project interpreter, 在相应的工程下,找到package,然后选择 “+” 添加, 搜索 hbase-thrift (Python client for HBase Thrift interface),然后安装包。
安装服务器端thrift。
参考官网,同时也可以在本机上安装以终端使用。
thrift Getting Started
也可以参考安装方法 python 调用HBase 范例
首先,安装thrift
下载thrift,这里,我用的是thrift-0.7.0-dev.tar.gz 这个版本
tar xzf thrift-0.7.0-dev.tar.gz
cd thrift-0.7.0-dev
sudo ./configure --with-cpp=no --with-ruby=no
sudo make
sudo make install
然后,到HBase的源码包里,找到
src/main/resources/org/apache/hadoop/hbase/thrift/
执行
thrift --gen py Hbase.thrift
mv gen-py/hbase/ /usr/lib/python2.4/site-packages/ (根据python版本可能有不同)
获取数据示例 1
# coding:utf-8
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
# from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation
from hbase.ttypes import *
import csv
def client_conn(host="localhost", port=9090):
    """Open a buffered Thrift connection to HBase and return the client.

    Args:
        host: Hostname or IP address of the HBase Thrift server.
        port: TCP port of the HBase Thrift server (9090 is the usual
            HBase Thrift port; change it to match your deployment).

    Returns:
        An ``Hbase.Client`` whose transport has already been opened.
        The transport object itself is not returned.
    """
    # Make socket
    sock = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(sock)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    client = Hbase.Client(protocol)
    # Connect!
    transport.open()
    return client
if __name__ == "__main__":
    # Fetch one row from HBase and dump (timestamp, value) of every cell
    # into data_xy_simple.csv.
    client = client_conn()
    try:
        # To fetch only selected columns, use
        # client.getRowWithColumns('table name', 'row name', ['column name']).
        result = client.getRow("table name", "row name")
        # getRow() returns a list; guard against an empty one (e.g. the row
        # was not found) instead of failing on result[0].
        cells = result[0].columns if result else {}
        # columns maps column name -> cell; only timestamp and value are kept.
        data_simple = [(cell.timestamp, cell.value) for cell in cells.values()]
    finally:
        # client_conn() returns only the client.  Thrift-generated clients
        # keep their protocol in `_iprot`, and the protocol keeps the
        # transport in `trans`, so that is how the connection is closed.
        client._iprot.trans.close()

    # "wb" is the Python 2 csv idiom; on Python 3 open the file with
    # mode "w" and newline="" instead.
    with open("data_xy_simple.csv", "wb") as csvfile_simple:
        writer_simple = csv.writer(csvfile_simple)
        writer_simple.writerow(["timestamp", "value"])
        writer_simple.writerows(data_simple)
    print("finished")
会基础的python应该知道result是个list,result[0].columns.items()是一个dict 的键值对。可以查询相关资料。或者通过输出变量,观察变量的值与类型。
说明:上面程序中 transport.open() 用于建立连接,在执行完后,还需要调用 transport.close() 断开连接。
目前只涉及到读数据,之后还会继续更新其他hbase操作。
如何在python中访问hbase的数据
python访问hbase需要额外的库,一般用thrift。使用thrift调用hbase,由于篇幅限制在这里不能说的很详细。
请百度Python thrift 或 python hbase 自行查阅相关资料。
下面是一个例子仅供参考
# coding:utf-8
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
from hbase.ttypes import *
import csv
def client_conn(host="localhost", port=9090):
    """Open a buffered Thrift connection to HBase and return the client.

    Args:
        host: Hostname or IP address of the HBase Thrift server.
        port: TCP port of the HBase Thrift server (9090 is the usual
            HBase Thrift port; change it to match your deployment).

    Returns:
        An ``Hbase.Client`` whose transport has already been opened.
        The transport object itself is not returned.
    """
    # Wrap the raw socket in a buffered transport before building the client.
    sock = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(sock)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    return client
if __name__ == "__main__":
    # Fetch one row from HBase and dump (timestamp, value) of every cell
    # into data_xy_simple.csv.
    client = client_conn()
    try:
        result = client.getRow("table name", "row name")
        # getRow() returns a list; guard against an empty one (e.g. the row
        # was not found) instead of failing on result[0].
        cells = result[0].columns if result else {}
        # columns maps column name -> cell; only timestamp and value are kept.
        data_simple = [(cell.timestamp, cell.value) for cell in cells.values()]
    finally:
        # client_conn() returns only the client.  Thrift-generated clients
        # keep their protocol in `_iprot`, and the protocol keeps the
        # transport in `trans`, so that is how the connection is closed.
        client._iprot.trans.close()

    # "wb" is the Python 2 csv idiom; on Python 3 open the file with
    # mode "w" and newline="" instead.
    with open("data_xy_simple.csv", "wb") as csvfile_simple:
        writer_simple = csv.writer(csvfile_simple)
        writer_simple.writerow(["timestamp", "value"])
        writer_simple.writerows(data_simple)
Python访问hbase集群
HBase-thrift项目是对HBase Thrift接口的封装,屏蔽底层的细节,使用户可以方便地通过HBase Thrift接口访问HBase集群,python通过thrift访问HBase。
python可以把爬虫的数据写入hbase么
在已经安装了HBase服务的服务器中,已经自动安装了HBase的Thrift的脚本,路径为:/usr/lib/hbase/include/thrift。
需要使用这个脚本生成基于Python语言的HBase的Thrift脚本,具体命令如下:
thrift --gen py hbase2.thrift
命令执行成功后会生成名为gen-py的目录,其中包含了python版本的HBase包。
主要文件介绍如下:
- Hbase.py 中定义了一些HbaseClient可以使用的方法
- ttypes.py 中定义了HbaseClient传输的数据类型
将生成的HBase包放入项目代码或者放入Python环境的依赖包目录中即可调用。
如何在Python中访问HBase的数据
python访问hbase数据
#!/usr/bin/python
import getopt,sys,time
from thrift.transport.TSocket import TSocket
from thrift.transport.TTransport import TBufferedTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
def usage():
    """Print the command-line help text to standard output."""
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('''Usage :
-h: Show help information;
-l: Show all table in hbase;
-t {table} Show table descriptors;
-t {table} -k {key} : show cell;
-t {table} -k {key} -c {coulmn} : Show the coulmn;
-t {table} -k {key} -c {coulmn} -v {versions} : Show more version;
(write by liuhuorong@koudai.com)
''')
class geilihbase:
    """Small read-only helper around the HBase Thrift client.

    Every ``g*`` method runs one query through ``self.client`` and prints
    the result to standard output; nothing is returned.
    """

    def __init__(self):
        """Open a buffered Thrift transport to 127.0.0.1:9090 and build the client."""
        # TSocket documents `port` as an int, so pass a number, not "9090".
        self.transport = TBufferedTransport(TSocket("127.0.0.1", 9090))
        self.transport.open()
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = Hbase.Client(self.protocol)

    def close(self):
        """Close the transport if one was created; safe to call repeatedly."""
        # `transport` is missing when __init__ never ran or failed early.
        transport = getattr(self, "transport", None)
        if transport is not None:
            transport.close()

    def __del__(self):
        """Best-effort cleanup of the transport when the object is collected."""
        self.close()

    @staticmethod
    def _format_ms(timestamp_ms):
        """Render a millisecond timestamp as local 'YYYY-mm-dd HH:MM:SS'."""
        # time.localtime() expects seconds, hence the division by 1000.
        return time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(timestamp_ms // 1000))

    def glisttable(self):
        """Print the name of every table, one per line."""
        for table in self.client.getTableNames():
            print(table)

    def ggetColumnDescriptors(self, table):
        """Print each column descriptor of *table* as 'name<TAB>descriptor'."""
        rarr = self.client.getColumnDescriptors(table)
        if rarr:
            for (k, v) in rarr.items():
                print("%-20s\t%s" % (k, v))

    def gget(self, table, key, coulmn):
        """Print timestamp, formatted time and value of one cell.

        Only the first cell returned by the client is printed; nothing is
        printed when the result is empty.
        """
        rarr = self.client.get(table, key, coulmn)
        if rarr:
            cell = rarr[0]
            print("%-15s %-20s\t%s" % (
                cell.timestamp, self._format_ms(cell.timestamp), cell.value))

    def ggetrow(self, table, key):
        """Print every column of the first row result for *key* in *table*."""
        rarr = self.client.getRow(table, key)
        if rarr:
            for (k, v) in rarr[0].columns.items():
                print("%-20s\t%-15s %-20s\t%s" % (
                    k, v.timestamp, self._format_ms(v.timestamp), v.value))

    def ggetver(self, table, key, coulmn, versions):
        """Print up to *versions* stored versions of one cell, one per line."""
        rarr = self.client.getVer(table, key, coulmn, versions)
        if rarr:
            for row in rarr:
                print("%-15s %-20s\t%s" % (
                    row.timestamp, self._format_ms(row.timestamp), row.value))
def main(argv):
    """Parse command-line options and run the matching HBase query.

    Args:
        argv: Option list without the program name (``sys.argv[1:]``).

    The function always terminates the process through ``sys.exit``:
    0 after a successful query or ``-h``, 1 when no table was given,
    2 on an unknown option or a non-integer ``-v`` value.
    """
    tablename = ""
    key = ""
    coulmn = ""
    versions = ""
    try:
        opts, _ = getopt.getopt(argv, "lht:k:c:v:", ["help", "list"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt in ("-l", "--list"):
            # Listing tables needs no further options, so act immediately.
            ghbase = geilihbase()
            ghbase.glisttable()
            sys.exit(0)
        elif opt == '-t':
            tablename = arg
        elif opt == '-k':
            key = arg
        elif opt == '-c':
            coulmn = arg
        elif opt == '-v':
            try:
                versions = int(arg)
            except ValueError:
                # A malformed count is a usage error, like an unknown option.
                usage()
                sys.exit(2)

    # Every query needs at least a table name.
    if not tablename:
        usage()
        sys.exit(1)

    # Pick the most specific query the supplied options allow.
    ghbase = geilihbase()
    if key and coulmn and versions:
        ghbase.ggetver(tablename, key, coulmn, versions)
    elif key and coulmn:
        ghbase.gget(tablename, key, coulmn)
    elif key:
        ghbase.ggetrow(tablename, key)
    else:
        ghbase.ggetColumnDescriptors(tablename)
    sys.exit(0)


if __name__ == "__main__":
    main(sys.argv[1:])