实例讲解hadoop中的hive查询(python语言实现)

q986 发表于 2018-11-1 08:58:23

#!/usr/bin/python
# -*- coding: UTF-8 -*-
　　import sys
　　import os
　　import string
　　import re
　　import MySQLdb
　　from hive_service import ThriftHive
　　from hive_service.ttypes import HiveServerException
　　from thrift import Thrift
　　from thrift.transport import TSocket
　　from thrift.transport import TTransport
　　from thrift.protocol import TBinaryProtocol
def hiveExe(hsql, dbname, host='192.168.10.1', port=10000):
    """Run a HiveQL statement through the Hive Thrift service.

    Opens a buffered Thrift transport to ``host:port``, registers the
    hive-contrib jar, switches to ``dbname`` and then executes ``hsql``.

    Args:
        hsql: HiveQL statement to execute.
        dbname: Hive database selected with ``use`` before ``hsql`` runs.
        host: Address of the Hive Thrift server.
        port: Port of the Hive Thrift server.

    Returns:
        The value of ``client.fetchAll()`` for ``hsql``, or ``None`` when a
        ``Thrift.TException`` was caught (its message is printed instead).
    """
    try:
        transport = TTransport.TBufferedTransport(TSocket.TSocket(host, port))
        client = ThriftHive.Client(TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()
        try:
            client.execute('ADD jar /opt/modules/hive/hive-0.7.1/lib/hive-contrib-0.7.1.jar')
            client.execute("use " + dbname)
            # Selecting the database needs a single fetch, hence fetchOne().
            client.fetchOne()
            client.execute(hsql)
            # Fetch every row produced by the query.
            return client.fetchAll()
        finally:
            # Close in ``finally`` so the transport is released even though
            # the result is returned from inside the ``try`` body.
            transport.close()
    except Thrift.TException as tx:
        print('%s' % (tx.message))
        return None
def mysqlExe(sql):
    """Execute ``sql`` against the MySQL server and return the fetched rows.

    Connects with the credentials hard-coded below, executes ``sql`` on a
    fresh cursor, commits, and closes both cursor and connection.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The value of ``cursor.fetchall()``, or ``None`` when connecting or
        executing failed (the error is printed instead of being raised).
    """
    try:
        conn = MySQLdb.connect(user="test", passwd="test123", host="127.0.0.1",
                               db="active2_ip", port=5029)
    except Exception as data:
        print("Could not connect to MySQL server.: %s" % (data,))
        # Without a connection there is nothing left to execute.
        return None
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
        # commit() is a connection method; cursors do not provide one.
        conn.commit()
        return rows
    except Exception as data:
        print("Could not Fetch anything: %s" % (data,))
        return None
    finally:
        conn.close()
dbname = "active2"
# Ask the shell for yesterday's date (YYYYMMDD) and strip the surrounding
# newline from what it prints.
date_pipe = os.popen("date -d '1 day ago' +%Y%m%d")
try:
    date = date_pipe.read().strip()
finally:
    # close() belongs to the pipe object; the stripped string has no close().
    date_pipe.close()

# Local directory Hive exports into, and the CSV file derived from it.
export_dir = "/tmp/" + dbname + "_" + date
csv_path = export_dir + ".csv"

# Create the per-day table from the template table and load the CSV into it.
sql = "create table IF NOT EXISTS "+dbname+"_group_ip_"+date+" like "+dbname+"_group_ip;load data infile '/tmp/"+dbname+"_"+date+".csv' into table "+dbname+"_group_ip_"+date+" FIELDS TERMINATED BY ','"
# Hive query whose result is exported to the local export directory
# (e.g. /tmp/active2_20111129); the export may consist of several files.
hsql = "insert overwrite local directory '/tmp/"+dbname+"_"+date+"' select count(version) as vc,stat_hour,type,version,province,city,isp from "+dbname+"_"+date+" group by province,city,version,type,stat_hour,isp"

# Run the Hive query.
hiveExe(hsql, dbname)

# Merge the exported files into the single temporary file tmplog.
os.system("sudo cat " + export_dir + "/* > /tmp/tmplog ")

# Hive separates exported fields with the control character \x01 (ASCII 1);
# rewrite every line with commas so the result is a CSV file.
sep = ','
with open("/tmp/tmplog", 'r') as file1:
    with open(csv_path, 'w') as file2:
        for line in file1:
            # Strip only the newline so a final line without one keeps its
            # last character.
            fields = line.rstrip('\n').split('\x01')
            file2.write(sep.join(fields) + "\n")

# Remove the temporary merged file.
os.system("sudo rm -f /tmp/tmplog")

# Run the MySQL statements: create the table and load the data.
mysqlExe(sql)

# The Hive export is a directory, so it has to be removed recursively.
os.system("sudo rm -rf " + export_dir)

页: [1]

运维网's Archiver

实例讲解hadoop中的hive查询(python语言实现)