import sqlite3
import duckdb
import pandas as pd
import numpy as np
import time

def generate_large_dataset(n_trades=10_000_000, n_quotes=1_000_000, seed=None):
    """Build synthetic trade and quote tables for the join benchmark.

    Args:
        n_trades: Number of rows in the returned trades frame.
        n_quotes: Number of rows in the returned quotes frame.
        seed: Optional seed for reproducible data. When ``None`` (the
            default) values are drawn from NumPy's global random state; when
            given, a private ``RandomState(seed)`` is used so repeated calls
            with the same seed return identical frames.

    Returns:
        A ``(trades, quotes)`` tuple of DataFrames. ``trades`` has columns
        ``timestamp``, ``ticker``, ``quantity``; ``quotes`` has columns
        ``timestamp``, ``ticker``, ``bid_price``. Both are sorted by
        ``timestamp``.
    """
    print(f"--- Generating {n_trades:,} trades and {n_quotes:,} quotes ---")
    tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'NVDA', 'META', 'NFLX']

    # The np.random module and RandomState expose the same randint/choice/
    # uniform functions, so the unseeded path keeps using the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate sorted timestamps for better realism and performance
    quote_times = np.sort(rng.randint(0, 10**8, n_quotes))
    trade_times = np.sort(rng.randint(0, 10**8, n_trades))

    quotes = pd.DataFrame({
        'timestamp': pd.to_datetime(quote_times, unit='s'),
        'ticker': rng.choice(tickers, n_quotes),
        'bid_price': rng.uniform(100, 500, n_quotes),
    })

    trades = pd.DataFrame({
        'timestamp': pd.to_datetime(trade_times, unit='s'),
        'ticker': rng.choice(tickers, n_trades),
        'quantity': rng.randint(1, 100, n_trades),
    })
    return trades, quotes

def benchmark_engines(trades, quotes):
    """Time an as-of join aggregate in DuckDB and in SQLite and print both.

    For every trade, the most recent quote for the same ticker at or before
    the trade's timestamp is looked up, and ``quantity * bid_price`` is summed
    over all trades. DuckDB uses ``ASOF JOIN``; SQLite uses a correlated
    subquery backed by an index on ``q(ticker, timestamp DESC)``.

    Args:
        trades: DataFrame with ``timestamp``, ``ticker`` and ``quantity``
            columns.
        quotes: DataFrame with ``timestamp``, ``ticker`` and ``bid_price``
            columns.

    Returns:
        None. Timings and the speed ratio are printed to stdout; a warning is
        printed only when the two engines disagree on the computed sum.
    """
    # Local imports keep the block self-contained; both are standard library.
    import math
    from contextlib import closing

    # closing() guarantees both in-memory databases are released even when a
    # query raises midway through the benchmark.
    with closing(duckdb.connect(":memory:")) as duck_conn, \
            closing(sqlite3.connect(":memory:")) as sql_conn:
        # --- SETUP DUCKDB ---
        # We register the dataframes as virtual tables (zero-copy)
        duck_conn.register("t", trades)
        duck_conn.register("q", quotes)

        # --- SETUP SQLITE ---
        trades.to_sql("t", sql_conn, index=False)
        quotes.to_sql("q", sql_conn, index=False)
        sql_conn.execute("CREATE INDEX idx_q ON q(ticker, timestamp DESC)")

        print("\n--- Running Engine-Only Benchmark (Computing SUM) ---")

        # 1. DuckDB Benchmark
        duck_query = """
    SELECT SUM(t.quantity * q.bid_price) 
    FROM t ASOF JOIN q ON t.ticker = q.ticker AND t.timestamp >= q.timestamp
    """
        start = time.perf_counter()
        duck_res = duck_conn.execute(duck_query).fetchone()[0]
        duck_time = time.perf_counter() - start
        print(f"DuckDB Time: {duck_time:.4f}s")

        # 2. SQLite Benchmark
        sql_query = """
    SELECT SUM(t.quantity * (
        SELECT q.bid_price FROM q 
        WHERE q.ticker = t.ticker AND q.timestamp <= t.timestamp 
        ORDER BY q.timestamp DESC LIMIT 1
    )) FROM t
    """
        start = time.perf_counter()
        sql_res = sql_conn.execute(sql_query).fetchone()[0]
        sql_time = time.perf_counter() - start
        print(f"SQLite Time: {sql_time:.4f}s")

    # A timing comparison is only meaningful if both engines computed the
    # same answer. SUM yields NULL (None) when no trade has a matching quote,
    # and float summation order differs between engines, hence the tolerance.
    if duck_res is None or sql_res is None:
        results_agree = duck_res is None and sql_res is None
    else:
        results_agree = math.isclose(duck_res, sql_res, rel_tol=1e-6)
    if not results_agree:
        print(f"[WARNING]: Results differ (DuckDB={duck_res!r}, SQLite={sql_res!r})")

    print(f"\n[WINNER]: DuckDB is {sql_time/duck_time:.1f}x faster at this scale.")

if __name__ == "__main__":
    # Script entry point: build one million trades against 100k quotes and
    # hand both frames straight to the benchmark.
    benchmark_engines(
        *generate_large_dataset(n_trades=1_000_000, n_quotes=100_000)
    )