Hi there π
Happy to see you here! I would love to hear your feedback and suggestions!
Letβs connect on LinkedIn
dataset β https://www.kaggle.com/datasets/kushagra1211/usa-sales-product-datasetcleaned
# requirement.txt
duckdb==0.6.1
pandas==1.5.2
seaborn==0.12.1
matplotlib==3.6.2
import pandas as pd
import glob
import time
import duckdb
conn = duckdb.connect() # create an in-memory database
# with pandas
cur_time = time.time()
df = pd.concat([pd.read_csv(f) for f in glob.glob('dataset/*.csv')])
print(f"time: {(time.time() - cur_time)}")
print(df.head(10))
# with duckdb
cur_time = time.time()
df = conn.execute("""
SELECT *
FROM 'dataset/*.csv'
LIMIT 10
""").df()
print(f"time: {(time.time() - cur_time)}")
print(df)
df = conn.execute("""
SELECT *
FROM 'dataset/*.csv'
""").df()
conn.register("df_view", df)
conn.execute("DESCRIBE df_view").df() # doesn't work if you don't register df as a virtual table
conn.execute("SELECT COUNT(*) FROM df_view").df()
df.isnull().sum()
df = df.dropna(how='all')
# Notice we use df and not df_view
# With DuckDB you can run SQL queries on top of Pandas dataframes
conn.execute("SELECT COUNT(*) FROM df").df()