其他
简单代码让你的数据库聊天机器人支持Excel、CSV、JSON、Parquet甚至数据湖
昨天介绍了5 行代码打造你自己的大模型数据库聊天机器人,今天则在这个的基础上,利用 DuckDB 的强大功能,扩展更多数据源的支持。
已经向官方提交了 PR,当然可以使用 patch 的方式来实现,
import os
import requests
def connect_to_duckdb(url: str="memory", init_sql: str = None):
"""
Connect to a DuckDB database. This is just a helper function to set [`vn.run_sql`][vanna.run_sql]
Args:
url (str): The URL of the database to connect to.
init_sql (str, optional): SQL to run when connecting to the database. Defaults to None.
Returns:
None
"""
try:
import duckdb
except ImportError:
raise DependencyError(
"You need to install required dependencies to execute this method,"
" run command: \npip install vanna[duckdb]"
)
# URL of the database to download
if url==":memory:" or url=="":
path=":memory:"
else:
# Path to save the downloaded database
if os.path.exists(url):
path=url
else:
path = os.path.basename(urlparse(url).path)
# Download the database if it doesn't exist
if not os.path.exists(path):
response = requests.get(url)
response.raise_for_status() # Check that the request was successful
with open(path, "wb") as f:
f.write(response.content)
# Connect to the database
conn = duckdb.connect(path)
if init_sql:
conn.query(init_sql)
def run_sql_duckdb(sql: str):
return conn.query(sql).to_df()
# global run_sql
vn.run_sql = run_sql_duckdb
vn.connect_to_duckdb=connect_to_duckdb
介绍下几种使用方法
通过 DuckDB 访问 SQLite
import vanna as vn
vn.set_api_key('your_vanna_api_key')
vn.set_model('chinook')
init_sql="""
attach '/Users/m2max/py/notebook/duckdb/data/Chinook_Sqlite.sqlite' as sqlite;
SET search_path TO 'sqlite,memory';
"""
vn.connect_to_duckdb(url=':memory:',init_sql=init_sql)
vn.ask('What are the top 10 artists by sales?')
这个和直接访问 SQLite 没有区别。
通过 DuckDB 访问 CSV
这个昨天的文章附截图了
vn.create_model('iris','duckdb')
vn.set_model('iris')
init_sql="""
CREATE VIEW IF NOT EXISTS iris AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/data/iris.csv'
"""
vn.connect_to_duckdb(url=':memory:',init_sql=init_sql)
vn.ask('分析下鸢尾花表')
通过 DuckDB 访问 Excel
vn.set_model('iris')
init_sql="""
install spatial;
load spatial;
CREATE VIEW IF NOT EXISTS iris AS SELECT * FROM st_read('/Users/m2max/py/notebook/duckdb/data/iris.xlsx',layer='Sheet1');
"""
vn.connect_to_duckdb(url=':memory:',init_sql=init_sql)
vn.ask('分析下鸢尾花表')
4.通过 DuckDB 访问 Parquet
#创建新模型
vn.create_model(model="tpc-duck", db_type="duckdb")
vn.set_model('tpc-duck')
#使用问题集训练模型
vn.train(json_file='./questions.json')
init_sql=""""""
vn.connect_to_duckdb(url='/Users/m2max/temp/duckdb0',init_sql=init_sql)
vn.ask('Who are the top 10 customers by Sales?')
questions.json 是从源代码中 tpc-h 的 questions.json 修改而来,原文件是为 Snowflake 准备的,我只是简单地把snowflake_sample_data.tpch_sf1.
替换为空。我这里为了用演示,没有做仔细验证。
duckdb0 是使用 duckdb 的 tcp-h 插件生成的一系列 parquet 文件,然后创建的视图,
CREATE VIEW IF NOT EXISTS lineitem AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/lineitem/*.parquet';
CREATE VIEW IF NOT EXISTS orders AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/orders/*.parquet';
CREATE VIEW IF NOT EXISTS partsupp AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/partsupp/*.parquet';
CREATE VIEW IF NOT EXISTS part AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/part/*.parquet';
CREATE VIEW IF NOT EXISTS supplier AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/supplier/*.parquet';
CREATE VIEW IF NOT EXISTS nation AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/nation/*.parquet';
CREATE VIEW IF NOT EXISTS region AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/region/*.parquet';
CREATE VIEW IF NOT EXISTS customer AS SELECT * FROM '/Users/m2max/py/notebook/duckdb/10/customer/*.parquet';
更多强大应用就留给大家去解锁吧。