import numpy as np
import pandas as pd
import random
from datetime import datetime
import MetaTrader5 as mt5
import time
import concurrent.futures
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler

# GLOBALS
MARKUP = 0.00001
BACKWARD = datetime(2000, 1, 1)
FORWARD = datetime(2010, 1, 1)
EXAMWARD = datetime(2024, 1, 1)
MAX_OPEN_TRADES = 3
symbol = "EURUSD"

def retrieve_data(symbol, retries_limit=300):
    terminal_path = "C:/Program Files/RoboForex - MetaTrader 5/Arima/terminal64.exe"

    attempt = 0
    raw_data = None

    while attempt < retries_limit:
        if not mt5.initialize(path=terminal_path):
            print(f"MetaTrader initialization failed: {mt5.last_error()}")
            attempt += 1
            time.sleep(1)
            continue

        instrument_count = mt5.symbols_total()
        if instrument_count > 0:
            print(f"Number of instruments in the terminal: {instrument_count}")
        else:
            print("No instruments in the terminal")

        rates = mt5.copy_rates_range(symbol, mt5.TIMEFRAME_H1, BACKWARD, EXAMWARD)
        mt5.shutdown()

        if rates is None or len(rates) == 0:
            print(f"Data for {symbol} not available (attempt {attempt+1})")
            attempt += 1
            time.sleep(1)
        else:
            # Drop the last, still-forming bar and keep only the OHLCV fields
            raw_data = pd.DataFrame(rates[:-1], columns=['time', 'open', 'high', 'low', 'close', 'tick_volume'])
            raw_data['time'] = pd.to_datetime(raw_data['time'], unit='s')
            raw_data.set_index('time', inplace=True)
            break

    if raw_data is None:
        print(f"Data retrieval failed after {retries_limit} attempts")
        return None

    # Add simple features
    raw_data['raw_SMA_10'] = raw_data['close'].rolling(window=10).mean()
    raw_data['raw_SMA_20'] = raw_data['close'].rolling(window=20).mean()
    raw_data['Price_Change'] = raw_data['close'].pct_change() * 100

    # Additional features
    raw_data['raw_Std_Dev_Close'] = raw_data['close'].rolling(window=20).std()
    raw_data['raw_Volume_Change'] = raw_data['tick_volume'].pct_change() * 100

    # Note: the shifts below are in H1 bars (1, 7 and 30 bars), not calendar periods
    raw_data['raw_Prev_Day_Price_Change'] = raw_data['close'] - raw_data['close'].shift(1)
    raw_data['raw_Prev_Week_Price_Change'] = raw_data['close'] - raw_data['close'].shift(7)
    raw_data['raw_Prev_Month_Price_Change'] = raw_data['close'] - raw_data['close'].shift(30)

    # Length of the current run of positive/negative bar-to-bar changes
    raw_data['Consecutive_Positive_Changes'] = (raw_data['Price_Change'] > 0).astype(int).groupby((raw_data['Price_Change'] > 0).astype(int).diff().ne(0).cumsum()).cumsum()
    raw_data['Consecutive_Negative_Changes'] = (raw_data['Price_Change'] < 0).astype(int).groupby((raw_data['Price_Change'] < 0).astype(int).diff().ne(0).cumsum()).cumsum()
    # Number of distinct close prices within the window (a crude density proxy)
    raw_data['Price_Density'] = raw_data['close'].rolling(window=10).apply(lambda x: len(set(x)))
    # +1 if the window's maximum sits on the latest bar, -1 if the minimum does, else 0
    raw_data['Fractal_Analysis'] = raw_data['close'].rolling(window=10).apply(
        lambda x: 1 if x.values.argmax() == len(x) - 1 else (-1 if x.values.argmin() == len(x) - 1 else 0))
    raw_data['Price_Volume_Ratio'] = raw_data['close'] / raw_data['tick_volume']
    raw_data['Median_Close_7'] = raw_data['close'].rolling(window=7).median()
    raw_data['Median_Close_30'] = raw_data['close'].rolling(window=30).median()
    raw_data['Price_Volatility'] = raw_data['close'].rolling(window=20).std() / raw_data['close'].rolling(window=20).mean()

    print("\nOriginal columns:")
    print(raw_data[['close', 'high', 'low', 'open', 'tick_volume']].tail(100))

    print("\nList of features:")
    print(raw_data.columns.tolist())

    print("\nLast 100 features:")
    print(raw_data.tail(100))

    # Replace NaN values with the mean
    raw_data.fillna(raw_data.mean(), inplace=True)

    return raw_data

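# A minimal usage sketch: retrieve_data() returns None on failure, so callers
# should guard before touching the DataFrame (fetch_or_fail is a hypothetical
# helper, not part of the original pipeline):
def fetch_or_fail(sym):
    df = retrieve_data(sym)
    if df is None:
        raise RuntimeError(f"Could not fetch rates for {sym} from MetaTrader 5")
    return df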

def augment_data(raw_data, noise_level=0.01, time_shift=1, scale_range=(0.9, 1.1)):
    print(f"Number of rows before augmentation: {len(raw_data)}")

    # Copy raw_data into augmented_data
    augmented_data = raw_data.copy()

    # Add noise
    noisy_data = raw_data.copy()
    noisy_data += np.random.normal(0, noise_level, noisy_data.shape)

    # Replace NaN values with the mean
    noisy_data.fillna(noisy_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, noisy_data])
    print(f"Added {len(noisy_data)} rows after adding noise")

    # Time shift
    shifted_data = raw_data.copy()
    shifted_data.index += pd.DateOffset(hours=time_shift)

    # Replace NaN values with the mean
    shifted_data.fillna(shifted_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, shifted_data])
    print(f"Added {len(shifted_data)} rows after time shift")

    # Scaling
    scale = np.random.uniform(scale_range[0], scale_range[1])
    scaled_data = raw_data.copy()
    scaled_data *= scale

    # Replace NaN values with the mean
    scaled_data.fillna(scaled_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, scaled_data])
    print(f"Added {len(scaled_data)} rows after scaling")

    # Inversion
    inverted_data = raw_data.copy()
    inverted_data *= -1

    # Replace NaN values with the mean
    inverted_data.fillna(inverted_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, inverted_data])
    print(f"Added {len(inverted_data)} rows after inversion")

    print(f"Number of rows after augmentation: {len(augmented_data)}")

    # Summarize the augmented rows by year
    print("Rows per year after augmentation:")
    for year, group in augmented_data.groupby(augmented_data.index.year):
        print(f"Year {year}: {len(group)} rows")

    return augmented_data
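# The concatenations above leave duplicate timestamps in the augmented index
# (the noise/shift/scale/inversion copies reuse the original times). A hedged
# cleanup sketch, if a unique, monotonic index is needed downstream
# (normalize_index is a hypothetical helper):
def normalize_index(df):
    # Sort chronologically, then keep the first row for each timestamp
    out = df.sort_index()
    return out[~out.index.duplicated(keep='first')]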

def markup_data(data, target_column, label_column, markup_ratio=0.00002):
    # Label 1 when the next bar's price exceeds the current one by more than markup_ratio
    data.loc[:, label_column] = np.where(data.loc[:, target_column].shift(-1) > data.loc[:, target_column] + markup_ratio, 1, 0)

    data.loc[data[label_column].isna(), label_column] = 0

    print(f"Number of bars labeled 1 (next-bar rise above the markup): {data[label_column].sum()}")

    return data
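# A worked example of the markup rule on synthetic prices (illustration only):
# with markup_ratio=0.0002, bar 0 is labeled 1 because 1.1003 > 1.1000 + 0.0002,
# while bars 1 and 2 get 0 (no next-bar rise above the markup).
demo = pd.DataFrame({'close': [1.1000, 1.1003, 1.1001]})
print(markup_data(demo, 'close', 'label', markup_ratio=0.0002))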

def label_data(data, symbol, min_bars=2, max_bars=48):
    terminal_path = "C:/Program Files/RoboForex - MetaTrader 5/Arima/terminal64.exe"

    if not mt5.initialize(path=terminal_path):
        print("Error connecting to MetaTrader 5 terminal")
        return

    symbol_info = mt5.symbol_info(symbol)
    mt5.shutdown()
    if symbol_info is None:
        print(f"Symbol {symbol} not found in the terminal")
        return

    # Stop and take-profit distances expressed in price units (100 and 800 points)
    stop_level = 100 * symbol_info.point
    take_level = 800 * symbol_info.point

    labels = []

    for i in range(data.shape[0] - max_bars):
        # Look ahead a random number of bars within [min_bars, max_bars]
        rand = random.randint(min_bars, max_bars)
        curr_pr = data['close'].iloc[i]
        future_pr = data['close'].iloc[i + rand]
        min_pr = data['low'].iloc[i:i + rand].min()
        max_pr = data['high'].iloc[i:i + rand].max()

        price_change = abs(future_pr - curr_pr)

        if price_change > take_level and future_pr > curr_pr and min_pr > curr_pr - stop_level:
            labels.append(1)  # Growth
        elif price_change > take_level and future_pr < curr_pr and max_pr < curr_pr + stop_level:
            labels.append(0)  # Fall
        else:
            labels.append(None)

    data = data.iloc[:len(labels)].copy()
    data['labels'] = labels

    data.dropna(inplace=True)

    X = data.drop('labels', axis=1)
    y = data['labels']

    # Balance the two classes by random undersampling and return the balanced set
    rus = RandomUnderSampler(random_state=2)
    X_balanced, y_balanced = rus.fit_resample(X, y)

    data_balanced = pd.concat([X_balanced, y_balanced], axis=1)

    return data_balanced

def generate_new_features(data, num_features=200, random_seed=1):
    random.seed(random_seed)
    new_features = {}

    for _ in range(num_features):
        feature_name = f'feature_{len(new_features)}'

        # Sample two distinct source columns, excluding the targets to avoid
        # leaking label information into the generated features
        feature_cols = [c for c in data.columns if c not in ('label', 'labels')]
        col1, col2 = random.sample(feature_cols, 2)

        operation = random.choice(['add', 'subtract', 'multiply', 'divide', 'shift', 'rolling_mean', 'rolling_std', 'rolling_max', 'rolling_min', 'rolling_sum'])

        if operation == 'add':
            new_features[feature_name] = data[col1] + data[col2]
        elif operation == 'subtract':
            new_features[feature_name] = data[col1] - data[col2]
        elif operation == 'multiply':
            new_features[feature_name] = data[col1] * data[col2]
        elif operation == 'divide':
            # May produce ±inf where col2 is zero; cleaned up before clustering
            new_features[feature_name] = data[col1] / data[col2]
        elif operation == 'shift':
            shift = random.randint(1, 10)
            new_features[feature_name] = data[col1].shift(shift)
        elif operation == 'rolling_mean':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).mean()
        elif operation == 'rolling_std':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).std()
        elif operation == 'rolling_max':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).max()
        elif operation == 'rolling_min':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).min()
        elif operation == 'rolling_sum':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).sum()

    new_data = pd.concat([data, pd.DataFrame(new_features)], axis=1)

    print("\nGenerated features:")
    print(new_data[list(new_features.keys())].tail(100))

    return new_data
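# The 'divide' op can yield ±inf and 'shift'/'rolling_*' leave NaNs at the head
# of each series. A minimal cleanup sketch (sanitize_features is a hypothetical
# helper; the GMM step below applies an equivalent replacement before fitting):
def sanitize_features(df):
    out = df.replace([np.inf, -np.inf], np.nan)
    return out.fillna(out.median(numeric_only=True))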

from sklearn.mixture import GaussianMixture

def cluster_features_by_gmm(data, n_components=4):
    # Cluster on everything except the target columns
    X = data.drop(['label', 'labels'], axis=1)

    # Guard against ±inf/NaN introduced by the generated ratio features
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    gmm = GaussianMixture(n_components=n_components, random_state=1)

    gmm.fit(X)

    data['cluster'] = gmm.predict(X)

    print("\nFeature clusters:")
    print(data[['cluster']].tail(100))

    return data
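# n_components=4 above is arbitrary; a hedged sketch for choosing it by BIC
# instead (pick_gmm_components is a hypothetical helper; X is assumed to be
# prepared exactly as inside cluster_features_by_gmm):
def pick_gmm_components(X, k_range=range(2, 9)):
    scores = {k: GaussianMixture(n_components=k, random_state=1).fit(X).bic(X)
              for k in k_range}
    return min(scores, key=scores.get)  # the component count with the lowest BIC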

# Retrieve data
raw_data = retrieve_data(symbol)

# Augment data (produced for inspection; the labeling below continues from raw_data)
augmented_data = augment_data(raw_data)

# Markup data
marked_data = markup_data(raw_data.copy(), 'close', 'label')

# Label data
labeled_data = label_data(marked_data, symbol)

# Print number of labels for each category
print("Number of growth labels (1.0):", labeled_data['labels'].value_counts()[1.0])
print("Number of fall labels (0.0):", labeled_data['labels'].value_counts()[0.0])

# Generate new features
labeled_data_generate = generate_new_features(labeled_data, num_features=100, random_seed=42)

# Cluster features using GMM
labeled_data_clustered = cluster_features_by_gmm(labeled_data_generate, n_components=4)

from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

def feature_engineering(data, n_features_to_select=10):
    # Remove the 'label' and 'labels' columns: they are targets, not features
    X = data.drop(['label', 'labels'], axis=1)
    y = data['labels']

    # Create a RandomForestClassifier model
    clf = RandomForestClassifier(n_estimators=100, random_state=1)

    # Use RFECV to select n_features_to_select best features
    rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy', n_jobs=-1, verbose=1,
                  min_features_to_select=n_features_to_select)
    rfecv.fit(X, y)

    # Return a DataFrame with the best features, 'label' column, and 'labels' column
    selected_features = X.columns[rfecv.get_support(indices=True)]
    selected_data = data[selected_features.tolist() + ['label', 'labels']]  # Convert selected_features to a list

    # Print the table of best features
    print("\nBest features:")
    print(pd.DataFrame({'Feature': selected_features}))

    return selected_data

labeled_data_engineered = feature_engineering(labeled_data_clustered, n_features_to_select=10)
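# A minimal hold-out evaluation sketch on the selected features. shuffle=False
# keeps the split chronological, which is the more honest choice for time
# series; the accuracy printed here is only a rough signal, not a backtest.
X_sel = labeled_data_engineered.drop(['label', 'labels'], axis=1)
y_sel = labeled_data_engineered['labels']
X_train, X_test, y_train, y_test = train_test_split(X_sel, y_sel, test_size=0.2, shuffle=False)
eval_model = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)
print(f"Hold-out accuracy on selected features: {eval_model.score(X_test, y_test):.3f}")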
