import duckdb
import pandas as pd
import numpy as np
import time

def generate_eigen_dataset(n_rows=1_000_000, vec_dim=2):
    """Build a synthetic table of unit-length random vectors for benchmarking.

    Args:
        n_rows: Number of rows (vectors) to generate.
        vec_dim: Number of components in each vector.

    Returns:
        A DataFrame with two columns:
        ``date`` - hourly timestamps starting at 2000-01-01, one per row;
        ``vec``  - one 1-D NumPy array of length ``vec_dim`` per row, scaled
        to unit L2 norm.
    """
    print(f"--- Generating {n_rows:,} eigenvectors (dim={vec_dim}) ---")

    # Draw Gaussian samples from NumPy's global random state, then scale each
    # row to unit length so every vector only encodes a direction.
    raw = np.random.standard_normal((n_rows, vec_dim))
    unit_rows = raw / np.linalg.norm(raw, axis=1, keepdims=True)

    timestamps = pd.date_range(start='2000-01-01', periods=n_rows, freq='h')

    # list(...) splits the 2-D array into one 1-D array per row so that each
    # DataFrame cell holds a whole vector.
    return pd.DataFrame({'date': timestamps, 'vec': list(unit_rows)})

def benchmark_rwec(df):
    """Time the RWEC angle computation in DuckDB versus a Python/NumPy loop.

    Both paths compute the mean angle (in degrees) between each vector and
    the one before it, and print their wall-clock time and result.

    Args:
        df: DataFrame with a ``date`` column (used by DuckDB to order rows)
            and a ``vec`` column holding equal-length numeric vectors.
            The Python path walks rows in DataFrame order, so the two
            results only agree when ``df`` is already sorted by ``date``.

    Returns:
        None. Results are printed to stdout.
    """
    # With fewer than two rows there are no consecutive pairs: the SQL
    # average would be NULL and the formatted print below would fail.
    if len(df) < 2:
        print("\n--- Skipping RWEC Benchmark: need at least 2 rows ---")
        return

    # Take the vector width from the data instead of hard-coding 2, so the
    # fixed-size array cast in the query matches whatever was generated.
    vec_dim = int(len(df['vec'].iloc[0]))

    print("\n--- Running RWEC Benchmark (Cosine Similarity + Angle) ---")

    # 1. DuckDB Benchmark (Vectorized SQL)
    # Uses array_cosine_similarity and LAG to compare consecutive rows.
    # The first row has no predecessor, so its similarity is NULL. It must
    # be filtered out BEFORE clamping: LEAST/GREATEST skip NULL arguments,
    # so clamping first would turn that NULL into -1 and add a spurious
    # 180 degree angle to the average.
    duck_query = f"""
    SELECT AVG(DEGREES(ACOS(LEAST(GREATEST(raw_sim, -1), 1))))
    FROM (
        SELECT
            array_cosine_similarity(
                vec::DOUBLE[{vec_dim}],
                LAG(vec::DOUBLE[{vec_dim}]) OVER (ORDER BY date)
            ) AS raw_sim
        FROM eigen_table
    )
    WHERE raw_sim IS NOT NULL
    """

    # --- SETUP DUCKDB ---
    con = duckdb.connect(":memory:")
    try:
        # Register the dataframe as a virtual table
        con.register("eigen_table", df)

        start = time.perf_counter()
        duck_res = con.execute(duck_query).fetchone()[0]
        duck_time = time.perf_counter() - start
    finally:
        # Always release the in-memory database, even if the query fails.
        con.close()
    print(f"DuckDB Time: {duck_time:.4f}s (Result Avg Angle: {duck_res:.2f}°)")

    # 2. Python/NumPy Loop Benchmark (Original Script Logic)
    # The per-pair loop is intentionally NOT vectorized: it is the baseline
    # being measured, replicating vector_similarity() from rwec.py.
    start = time.perf_counter()

    vecs = np.stack(df['vec'].values)
    angles_deg = []
    for i in range(1, len(vecs)):
        vec1 = vecs[i - 1]
        vec2 = vecs[i]
        # Manual cosine similarity calculation
        cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        # Clip guards arccos against rounding slightly outside [-1, 1].
        angles_deg.append(np.degrees(np.arccos(np.clip(cos_sim, -1, 1))))

    py_res = np.mean(angles_deg)
    py_time = time.perf_counter() - start
    print(f"Python Loop Time: {py_time:.4f}s (Result Avg Angle: {py_res:.2f}°)")

    # Report whichever side actually won instead of always crediting DuckDB.
    if duck_time < py_time:
        print(f"\n[WINNER]: DuckDB is {py_time/duck_time:.1f}x faster for RWEC logic.")
    else:
        print(f"\n[WINNER]: Python loop is {duck_time/py_time:.1f}x faster for RWEC logic.")

if __name__ == "__main__":
    # 500,000 rows is enough for the timing gap between the two
    # implementations to show up clearly.
    benchmark_rwec(generate_eigen_dataset(n_rows=500_000))