Batch-writing a pandas.DataFrame into ClickHouse with Python
Python package imports
import json
import logging
import math
import os
import pandas as pd
import datetime
import requests
from clickhouse_driver import Client
from cmc.config import config
# Define a module-level variable outside the class so every function in this .py file shares it
process_date = datetime.datetime.now().strftime("%Y%m%d")
class MyPyClassDemo:
    def __init__(self, api_key: str):
        self.api_key = api_key

    def tstFunctions(self):
        pass

    # ... (other methods elided) ...
    def getClickHouseClient(self):
        try:
            host_name = 'xxxx.xxx.com'
            client = Client(
                host=host_name,
                database='your db name',
                user='root',
                password='123123',
                send_receive_timeout=20,
                settings={'use_numpy': True}
            )
            return client
        except Exception as e:
            print("Error: " + str(e))
            return None
Note: the settings={'use_numpy': True} option is mandatory here. Without it, insert_dataframe fails with:
TypeError: Unsupported column type: <class 'numpy.ndarray'>. list or tuple is expected.
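As a quick sanity check, the returned client can be exercised before any insert. A minimal sketch, assuming the placeholder host and credentials above are replaced with real ones (the api_key value below is hypothetical):

demo = MyPyClassDemo(api_key='your-api-key')  # hypothetical key value
client = demo.getClickHouseClient()
if client is not None:
    print(client.execute('SELECT version()'))  # returns a list of tuples, e.g. [('23.8.2.7',)]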
    def process_json_files(self):
        tmp_dir = 'out/cmc_data/'
        files = os.listdir(tmp_dir)  # list every file name under the directory
        print(files)
        storage_client = self.getClickHouseClient()  # get the ClickHouse connection
        for file in files:
            if not file.startswith('cmc_projects'):
                continue
            with open(tmp_dir + file, 'r') as f:  # open the JSON file by its relative path
                json_string = f.read()  # read the raw JSON string
                data_list = json.loads(json_string)  # parse it into a list of dicts
            # df = pd.DataFrame.from_dict(json_normalize(data_list), orient='columns')  # legacy style: flatten nested JSON into columns and build the DataFrame from all of them
            df = pd.json_normalize(data_list)  # recommended form in current pandas versions
            # print(df.T)  # debug: inspect the flattened columns
            insert_df = df[['id', 'name', 'symbol', 'slug', 'rank', 'is_active',
                            'first_historical_data', 'platform.id', 'platform.name',
                            'platform.symbol', 'platform.slug',
                            'platform.token_address']].copy()  # select the target columns into a new DataFrame; copy() avoids SettingWithCopyWarning on the mutations below
            insert_df.insert(loc=12, column='update_time', value=process_date)  # append a column carrying the load date
            insert_df["platform.id"] = insert_df["platform.id"].apply(self.modify)  # rewrite one column via apply()
            # insert_df = insert_df.loc[0:1]  # (debug) keep only the first rows
            # insert_df.iloc[:, 0:12]  # (debug) take columns 0-12
            insert_df.rename(columns={"platform.id": "platform_id",
                                      "platform.name": "platform_name",
                                      "platform.symbol": "platform_symbol",
                                      "platform.slug": "platform_slug",
                                      "platform.token_address": "token_address"},
                             inplace=True)  # column names must match the ClickHouse table one-to-one, otherwise the insert raises a KeyError
            # print(insert_df)
            # print(type(insert_df["platform_id"]))
            storage_client.insert_dataframe('INSERT INTO tstdb.ods_infos (*) VALUES', insert_df)  # flush the whole DataFrame into ClickHouse in a single batch
Note: the number of rows flushed to ClickHouse per call equals the number of records in the JSON file; to control the batch size, trim the data at the source file or when building data_list, as in the sketch below.
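If the JSON files are large, one way to cap the per-call batch is to slice the DataFrame before inserting. A minimal sketch; batch_size is an assumed tuning knob, not part of the original script:

batch_size = 10000  # assumed per-insert cap; tune it to your cluster and row width
for start in range(0, len(insert_df), batch_size):
    chunk = insert_df.iloc[start:start + batch_size]  # contiguous slice of rows
    storage_client.insert_dataframe('INSERT INTO tstdb.ods_infos (*) VALUES', chunk)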
    def modify(self, id):
        # after json_normalize, a missing platform.id arrives as a float NaN
        if math.isnan(id):
            return 0
        return int(id)
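As a side note, the same NaN-to-zero conversion can be done without apply(). A vectorized sketch (an alternative, not the code used above):

insert_df["platform.id"] = insert_df["platform.id"].fillna(0).astype(int)  # fill NaN first, then cast the whole column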
ClickHouse table DDL and notes
CREATE TABLE tstdb.ods_infos (
id UInt32,
name String,
symbol String,
slug String,
rank UInt32,
is_active UInt32,
first_historical_data String,
platform_id UInt32,
platform_name String,
platform_symbol String,
platform_slug String,
token_address String,
update_time String
)
ENGINE = ReplacingMergeTree
PARTITION BY update_time
PRIMARY KEY id -- a key must be set: ReplacingMergeTree collapses rows that share the sorting key (ORDER BY) during background merges, so re-inserting the same ids replaces records instead of duplicating them
ORDER BY (
id,
name,
symbol,
platform_id,
platform_name
)
SETTINGS index_granularity = 8192,
storage_policy = 'policy_name_eth';
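ReplacingMergeTree only deduplicates when parts are merged in the background, so duplicates can still be visible right after a load. A hedged check, assuming client is the connection returned by getClickHouseClient():

client.execute('OPTIMIZE TABLE tstdb.ods_infos FINAL')  # force the merge that performs deduplication (heavy; use sparingly)
print(client.execute('SELECT count() FROM tstdb.ods_infos FINAL'))  # or deduplicate at query time with FINAL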