"""
Program for clustering the results of the first optimization stage

@version 1.01
"""

import pandas as pd
from sklearn.cluster import KMeans
import sqlite3
import argparse

# Positions (within the `SELECT p.*` result) of the columns used as clustering
# features.
# NOTE(review): these depend on the column order of the `passes` table, which
# is not visible in this file - confirm against the schema before changing it.
CLUSTERING_COLUMN_INDICES = [7, 8, 9, 24, 29, 30, 31, 32, 33, 36, 45, 46]


def build_parser():
    """Create the parser of the command line arguments."""
    parser = argparse.ArgumentParser(
        description="Сlustering passes for previous job(s)"
    )
    parser.add_argument("db_path", type=str, help="Path to database file")
    parser.add_argument("id_task", type=int, help="ID of current task")
    parser.add_argument("--id_parent_job", type=str, help="ID of parent job(s)")
    parser.add_argument(
        "--n_clusters", type=int, default=256, help="Number of clusters"
    )
    parser.add_argument(
        "--min_custom_ontester",
        type=float,
        default=0,
        help="Min value for `custom_ontester`",
    )
    parser.add_argument(
        "--min_trades", type=float, default=40, help="Min value for `trades`"
    )
    parser.add_argument(
        "--min_sharpe_ratio",
        type=float,
        default=0.7,
        help="Min value for `sharpe_ratio`",
    )
    return parser


def parse_job_ids(raw):
    """Convert a comma-separated string of job IDs into a list of integers.

    Empty items (for example a trailing comma) are ignored.

    Raises:
        ValueError: if an item is not an integer or no ID is given at all.
    """
    tokens = [token.strip() for token in raw.split(",")]
    try:
        job_ids = [int(token) for token in tokens if token]
    except ValueError:
        raise ValueError(
            "--id_parent_job must be a comma-separated list of integers, "
            f"got {raw!r}"
        ) from None
    if not job_ids:
        raise ValueError("--id_parent_job must contain at least one job ID")
    return job_ids


def build_passes_query(job_ids, min_custom_ontester, min_trades, min_sharpe_ratio):
    """Build the parameterized query selecting the passes of the parent job(s).

    Returns:
        A `(query, params)` pair: the SQL text with `?` placeholders and the
        list of values to bind to them, in order.
    """
    # One placeholder per job ID keeps the values out of the SQL text
    placeholders = ", ".join("?" for _ in job_ids)
    query = f"""SELECT p.*
FROM passes p
    JOIN
    tasks t ON t.id_task = p.id_task
    JOIN
    jobs j ON j.id_job = t.id_job
WHERE p.profit > 0 AND
      j.id_job IN ({placeholders}) AND
      p.custom_ontester >= ? AND
      p.trades >= ? AND
      p.sharpe_ratio >= ?;"""
    params = [*job_ids, min_custom_ontester, min_trades, min_sharpe_ratio]
    return query, params


def set_task_status(connection, id_task, status):
    """Write `status` to the `tasks` row of `id_task` and commit."""
    connection.execute(
        "UPDATE tasks SET status=? WHERE id_task=?;", (status, id_task)
    )
    connection.commit()


def prepare_results_table(connection, id_task):
    """Create `passes_clusters` if it is absent and drop old rows of the task."""
    connection.execute(
        """CREATE TABLE IF NOT EXISTS passes_clusters (
    id_task INTEGER,
    id_pass INTEGER,
    cluster INTEGER
);"""
    )
    # Clear the result table of the previously obtained results
    connection.execute("DELETE FROM passes_clusters WHERE id_task=?;", (id_task,))


def assign_clusters(df, n_clusters):
    """Run KMeans on the feature columns and return a cluster label per row.

    The number of clusters is capped at the number of rows, because KMeans
    cannot build more clusters than there are samples.
    """
    effective_clusters = min(n_clusters, len(df))
    if effective_clusters < n_clusters:
        print(
            f"Only {len(df)} passes available; "
            f"using {effective_clusters} clusters instead of {n_clusters}"
        )
    features = df.iloc[:, CLUSTERING_COLUMN_INDICES]
    # A fixed random_state keeps the clustering reproducible between runs
    kmeans = KMeans(
        n_clusters=effective_clusters, n_init="auto", random_state=42
    ).fit(features)
    return kmeans.labels_


def select_cluster_representatives(df):
    """Pick, for every cluster, the row with the highest `custom_ontester`.

    Returns:
        A dataframe with the columns `id_task`, `id_pass` and `cluster`
        (one row per cluster, ordered by cluster).
    """
    # After sorting, the last row of each cluster has the highest value
    ordered = df.sort_values(["cluster", "custom_ontester"])
    # keep="last" takes whole rows, so values of different passes never mix
    best = ordered.drop_duplicates(subset="cluster", keep="last")
    return best[["id_task", "id_pass", "cluster"]].reset_index(drop=True)


def main(argv=None):
    """Cluster the passes of the parent job(s) and store one pass per cluster.

    Args:
        argv: command line arguments without the program name; `None` means
            that `sys.argv` is used.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Validate the job list before touching the database
    if args.id_parent_job is None:
        parser.error("--id_parent_job is required")
    try:
        job_ids = parse_job_ids(args.id_parent_job)
    except ValueError as err:
        parser.error(str(err))

    # Establish connection to the database
    connection = sqlite3.connect(args.db_path)
    try:
        # Mark the start of the task
        set_task_status(connection, args.id_task, "Process")

        prepare_results_table(connection, args.id_task)

        # Download data about parent job passes to the dataframe
        query, params = build_passes_query(
            job_ids,
            args.min_custom_ontester,
            args.min_trades,
            args.min_sharpe_ratio,
        )
        print(query)
        print(params)
        df = pd.read_sql(query, connection, params=params)

        # Have a look at the dataframe and at the list of its columns
        print(df)
        print(*enumerate(df.columns), sep="\n")

        if df.empty:
            print("No passes satisfy the filters; nothing to cluster")
        else:
            # Add cluster indices and the current task ID to the dataframe
            df["cluster"] = assign_clusters(df, args.n_clusters)
            df["id_task"] = args.id_task

            result = select_cluster_representatives(df)
            print(result)

            # Append the rows to the passes_clusters table (the old rows of
            # this task have already been deleted above)
            result.to_sql(
                "passes_clusters", connection, if_exists="append", index=False
            )

        # Mark the task execution
        set_task_status(connection, args.id_task, "Done")
    finally:
        # Close the connection even if a step above failed; the task status
        # then stays 'Process' and uncommitted changes are discarded
        connection.close()


if __name__ == "__main__":
    main()
