import numpy as np
import pandas as pd
import random  
from datetime import datetime
import MetaTrader5 as mt5
import time
import concurrent.futures
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler

# GLOBALS
MARKUP = 0.00001
BACKWARD = datetime(2000, 1, 1) 
FORWARD = datetime(2010, 1, 1)
EXAMWARD = datetime(2024, 1, 1) 
MAX_OPEN_TRADES = 3
symbol = "EURUSD"

def retrieve_data(symbol, retries_limit=300):
    terminal_path = "C:/Program Files/RoboForex - MetaTrader 5/Arima/terminal64.exe"
    
    attempt = 0
    raw_data = None
    
    while attempt < retries_limit:
        if not mt5.initialize(path=terminal_path): 
            print("MetaTrader initialization failed")
            return None
            
        instrument_count = mt5.symbols_total() 
        if instrument_count > 0:
            print(f"Instruments in terminal: {instrument_count}")
        else:
            print("No instruments in terminal")
            
        rates = mt5.copy_rates_range(symbol, mt5.TIMEFRAME_H1, BACKWARD, EXAMWARD)  
        mt5.shutdown()

        if rates is None or len(rates) == 0:
            print(f"Data for {symbol} not available (attempt {attempt+1})")  
            attempt += 1
            time.sleep(1) 
        else:
            raw_data = pd.DataFrame(rates[:-1], columns=['time', 'open', 'high', 'low', 'close', 'tick_volume'])
            raw_data['time'] = pd.to_datetime(raw_data['time'], unit='s') 
            raw_data.set_index('time', inplace=True)
            break
            
    if raw_data is None:
        print(f"Data retrieval failed after {retries_limit} attempts")
        return None
    
    # Adding simple features
    raw_data['raw_SMA_10'] = raw_data['close'].rolling(window=10).mean()
    raw_data['raw_SMA_20'] = raw_data['close'].rolling(window=20).mean()
    raw_data['Price_Change'] = raw_data['close'].pct_change() * 100  

    # Additional features
    raw_data['raw_Std_Dev_Close'] = raw_data['close'].rolling(window=20).std() 
    raw_data['raw_Volume_Change'] = raw_data['tick_volume'].pct_change() * 100
    
    # Note: the data is hourly, so these shifts are measured in bars, not calendar periods, despite the names
    raw_data['raw_Prev_Day_Price_Change'] = raw_data['close'] - raw_data['close'].shift(1) 
    raw_data['raw_Prev_Week_Price_Change'] = raw_data['close'] - raw_data['close'].shift(7)
    raw_data['raw_Prev_Month_Price_Change'] = raw_data['close'] - raw_data['close'].shift(30)
    
    # Number of consecutive positive / negative price changes
    pos = (raw_data['Price_Change'] > 0).astype(int)
    neg = (raw_data['Price_Change'] < 0).astype(int)
    raw_data['Consecutive_Positive_Changes'] = pos.groupby(pos.diff().ne(0).cumsum()).cumsum()
    raw_data['Consecutive_Negative_Changes'] = neg.groupby(neg.diff().ne(0).cumsum()).cumsum()
    raw_data['Price_Density'] = raw_data['close'].rolling(window=10).apply(lambda x: len(set(x)))  # Price density
    raw_data['Fractal_Analysis'] = raw_data['close'].rolling(window=10).apply(lambda x: 1 if x.idxmax() else (-1 if x.idxmin() else 0))  # Fractal analysis
    raw_data['Price_Volume_Ratio'] = raw_data['close'] / raw_data['tick_volume']  # Price to volume ratio
    raw_data['Median_Close_7'] = raw_data['close'].rolling(window=7).median()  # Median price for the last 7 days
    raw_data['Median_Close_30'] = raw_data['close'].rolling(window=30).median()  # Median price for the last 30 days
    raw_data['Price_Volatility'] = raw_data['close'].rolling(window=20).std() / raw_data['close'].rolling(window=20).mean()  # Price volatility

    print("\nOriginal columns:")
    print(raw_data[['close', 'high', 'low', 'open', 'tick_volume']].tail(100))

    print("\nList of features:")
    print(raw_data.columns.tolist())

    print("\nLast 100 features:")  
    print(raw_data.tail(100))

    # Replace NaN values with mean
    raw_data.fillna(raw_data.mean(), inplace=True)
    
    return raw_data


def augment_data(raw_data, noise_level=0.01, time_shift=1, scale_range=(0.9, 1.1)):
    print(f"Number of rows before augmentation: {len(raw_data)}")

    # Copy raw_data to augmented_data
    augmented_data = raw_data.copy()

    # Adding noise
    noisy_data = raw_data.copy()
    noisy_data += np.random.normal(0, noise_level, noisy_data.shape)

    # Replace NaN with mean
    noisy_data.fillna(noisy_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, noisy_data])
    print(f"Added {len(noisy_data)} rows after adding noise")

    # Time shift
    shifted_data = raw_data.copy()
    shifted_data.index += pd.DateOffset(hours=time_shift)

    # Replace NaN with mean
    shifted_data.fillna(shifted_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, shifted_data])
    print(f"Added {len(shifted_data)} rows after time shift")

    # Scaling
    scale = np.random.uniform(scale_range[0], scale_range[1])
    scaled_data = raw_data.copy()
    scaled_data *= scale

    # Replace NaN with mean
    scaled_data.fillna(scaled_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, scaled_data])
    print(f"Added {len(scaled_data)} rows after scaling")

    # Inversion
    inverted_data = raw_data.copy()
    inverted_data *= -1

    # Replace NaN with mean
    inverted_data.fillna(inverted_data.mean(), inplace=True)

    augmented_data = pd.concat([augmented_data, inverted_data])
    print(f"Added {len(inverted_data)} rows after inversion")

    print(f"Number of rows after augmentation: {len(augmented_data)}")

    # Print the dates contained in the augmented data, grouped by year
    print("Dates per year:")
    for year, group in augmented_data.groupby(augmented_data.index.year):
        print(f"Year {year}: {group.index}")
        
    return augmented_data

# Data retrieval
raw_data = retrieve_data(symbol)

# Applying augmentation
augmented_data = augment_data(raw_data)
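# Minimal sanity check (sketch): the augmentation above concatenates four transformed copies
# onto the original, so the result should hold five times the original row count,
# assuming augment_data() dropped no rows.
assert len(augmented_data) == 5 * len(raw_data), "Unexpected row count after augmentation"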

def markup_data(data, target_column, label_column, markup_ratio=0.00002):
    # Create a label column: 1 (buy) when the next bar's close exceeds the current close by more than markup_ratio, else 0
    data.loc[:, label_column] = np.where(data.loc[:, target_column].shift(-1) > data.loc[:, target_column] + markup_ratio, 1, 0)

    # Replace NaN values with 0 (no trade)
    data.loc[data[label_column].isna(), label_column] = 0

    # Print the number of labels set
    print(f"Number of price change labels above markup: {data[label_column].sum()}")

    return data
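
# A tiny, self-contained illustration of the markup rule (hypothetical demo frame, not part of the pipeline):
# the second close rises by 3 points (above markup_ratio), the third falls, and the last bar has no successor.
_demo = pd.DataFrame({'close': [1.1000, 1.1003, 1.1001]})
markup_data(_demo, 'close', 'label')
print(_demo['label'].tolist())  # expected: [1, 0, 0]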



def label_data(data, symbol, min_bars=2, max_bars=48):
    terminal_path = "C:/Program Files/RoboForex - MetaTrader 5/Arima/terminal64.exe"

    if not mt5.initialize(path=terminal_path):
        print("Error connecting to MetaTrader 5 terminal")
        return

    symbol_info = mt5.symbol_info(symbol)
    stop_level = 100 * symbol_info.point
    take_level = 800 * symbol_info.point

    labels = []

    # Label each bar using a random holding horizon of min_bars..max_bars bars ahead
    for i in range(data.shape[0] - max_bars):
        rand = random.randint(min_bars, max_bars)
        curr_pr = data['close'].iloc[i]
        future_pr = data['close'].iloc[i + rand]
        min_pr = data['low'].iloc[i:i + rand].min()
        max_pr = data['high'].iloc[i:i + rand].max()

        price_change = abs(future_pr - curr_pr)

        if price_change > take_level and future_pr > curr_pr and min_pr > curr_pr - stop_level:
            labels.append(1)  # Rise
        elif price_change > take_level and future_pr < curr_pr and max_pr < curr_pr + stop_level:
            labels.append(0)  # Fall
        else:
            labels.append(None)

    data = data.iloc[:len(labels)].copy()
    data['labels'] = labels

    # Remove rows with missing values
    data.dropna(inplace=True)

    # Split data into features (X) and labels (y)
    X = data.drop('labels', axis=1)
    y = data['labels']

    # Balance classes
    rus = RandomUnderSampler(random_state=2)
    X_balanced, y_balanced = rus.fit_resample(X, y)

    # Combine balanced features and labels (note: data_balanced is computed but not used further;
    # the time-indexed `data` is returned unchanged for the date-based split later on)
    data_balanced = pd.concat([X_balanced, y_balanced], axis=1)

    return data

# Mark up the data and assign trade labels
marked_data = markup_data(raw_data.copy(), 'close', 'label')
labeled_data = label_data(marked_data, symbol)

# Output the number of labels in each category
print("Number of rise labels (1.0):", labeled_data['labels'].value_counts()[1.0])
print("Number of fall labels (0.0):", labeled_data['labels'].value_counts()[0.0])

def generate_new_features(data, num_features=200, random_seed=1):
    random.seed(random_seed)
    new_features = {}

    for _ in range(num_features):
        # Create a sequential name for the new feature
        feature_name = f'feature_{len(new_features)}'

        # Generate random indices to select existing features
        col1_idx, col2_idx = random.sample(range(len(data.columns)), 2)
        col1, col2 = data.columns[col1_idx], data.columns[col2_idx]

        # Select a random operation to create a new feature
        operation = random.choice(['add', 'subtract', 'multiply', 'divide', 'shift', 'rolling_mean', 'rolling_std', 'rolling_max', 'rolling_min', 'rolling_sum'])

        if operation == 'add':
            new_features[feature_name] = data[col1] + data[col2]
        elif operation == 'subtract':
            new_features[feature_name] = data[col1] - data[col2]
        elif operation == 'multiply':
            new_features[feature_name] = data[col1] * data[col2]
        elif operation == 'divide':
            new_features[feature_name] = data[col1] / data[col2]
        elif operation == 'shift':
            shift = random.randint(1, 10)
            new_features[feature_name] = data[col1].shift(shift)
        elif operation == 'rolling_mean':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).mean()
        elif operation == 'rolling_std':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).std()
        elif operation == 'rolling_max':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).max()
        elif operation == 'rolling_min':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).min()
        elif operation == 'rolling_sum':
            window = random.randint(2, 20)
            new_features[feature_name] = data[col1].rolling(window).sum()

    # Add new features to the original DataFrame
    new_data = pd.concat([data, pd.DataFrame(new_features)], axis=1)

    # Print generated features as a table
    print("\nGenerated features:")
    print(new_data[list(new_features.keys())].tail(100))

    return new_data

labeled_data_generate = generate_new_features(labeled_data, num_features=100, random_seed=42)
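
# Optional sanity check (sketch): confirm the generated columns were actually appended
print(f"Columns before/after generation: {labeled_data.shape[1]} -> {labeled_data_generate.shape[1]}")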

def cluster_features_by_gmm(data, n_components=4):
    # Remove the 'label' and 'labels' columns as they are not features
    X = data.drop(['label', 'labels'], axis=1)

    # Replace infinite values with NaN, then fill NaN with the column median
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # Create a GMM model
    gmm = GaussianMixture(n_components=n_components, random_state=1)

    # Train the model on the data
    gmm.fit(X)

    # Return a DataFrame with clusters for each row of data
    data['cluster'] = gmm.predict(X)

    # Print a table with clusters
    print("\nFeature clusters:")
    print(data[['cluster']].tail(100))

    return data

labeled_data_clustered = cluster_features_by_gmm(labeled_data_generate, n_components=4)
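
# Optional inspection (sketch): how the rows are distributed across the GMM clusters
print(labeled_data_clustered['cluster'].value_counts())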

def feature_engineering(data, n_features_to_select=20):
    # Separate the features from the label columns
    X = data.drop(['label', 'labels'], axis=1)
    y = data['labels']

    # Replace infinite values with NaN, then fill NaN with the column median
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # Create a RandomForestClassifier model
    clf = RandomForestClassifier(n_estimators=100, random_state=1)

    # Use RFECV to select the top n_features_to_select features
    rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy', n_jobs=-1, verbose=1,
                  min_features_to_select=n_features_to_select)
    rfecv.fit(X, y)

    # Return a DataFrame with the best features, the 'label' column, and the 'labels' column
    selected_features = X.columns[rfecv.get_support(indices=True)]
    selected_data = data[selected_features.tolist() + ['label', 'labels']]  # Convert selected_features to a list

    # Print a table with the best features
    print("\nBest features:")
    print(pd.DataFrame({'Feature': selected_features}))

    return selected_data

labeled_data_engineered = feature_engineering(labeled_data_clustered, n_features_to_select=10)
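
# Optional check (sketch): how many feature columns survived the RFECV selection
# (the two label columns are excluded from the count)
print(f"Features kept by RFECV: {labeled_data_engineered.shape[1] - 2}")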

def train_xgboost_classifier(data, num_boost_rounds=1000):
    # Note: num_boost_rounds is kept in the signature but not used directly;
    # the number of trees (n_estimators) is tuned by the grid search below
    # Check if data is not empty
    if data.empty:
        raise ValueError("Data should not be empty")

    # Check if all required columns are present in the data
    required_columns = ['label', 'labels']
    if not all(column in data.columns for column in required_columns):
        raise ValueError(f"Data is missing required columns: {required_columns}")

    # Separate the features from the label columns
    X = data.drop(['label', 'labels'], axis=1)
    y = data['labels']

    # Check if all features have numeric data type
    if not all(pd.api.types.is_numeric_dtype(X[column]) for column in X.columns):
        raise ValueError("All features should have numeric data type")

    # Create an XGBoostClassifier model
    clf = xgb.XGBClassifier(objective='binary:logistic', random_state=1)

    # Define hyperparameters for grid search
    param_grid = {
        'max_depth': [3, 7, 12],
        'learning_rate': [0.1, 0.3, 0.5],
        'n_estimators': [100, 600, 1200]
    }

    # Train the model on the data using cross-validation and grid search for hyperparameters
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)

    # Calculate the mean prediction accuracy
    accuracy = grid_search.best_score_

    print(f"Mean prediction accuracy on cross-validation: {accuracy:.2f}")

    # Return the trained model
    return grid_search.best_estimator_

labeled_data_engineered = feature_engineering(labeled_data_clustered, n_features_to_select=20)

# Use the engineered dataset for training and testing
raw_data = labeled_data_engineered

# Training data: everything up to FORWARD
train_data = raw_data[raw_data.index <= FORWARD]

# Test data: everything from FORWARD onward
test_data = raw_data[raw_data.index >= FORWARD]

# Train the XGBoost model on the filtered data
xgb_clf = train_xgboost_classifier(train_data, num_boost_rounds=100)
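
# Optional inspection (sketch): the hyperparameters the grid search settled on for the returned estimator
best_params = xgb_clf.get_params()
print({k: best_params[k] for k in ('max_depth', 'learning_rate', 'n_estimators')})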

X_test = test_data.drop(['label', 'labels'], axis=1)
y_test = test_data['labels']
predicted_labels = xgb_clf.predict(X_test)

# Calculate prediction accuracy
accuracy = (predicted_labels == y_test).mean()
print(f"Prediction accuracy: {accuracy:.2f}")

def test_model(model, X_test, y_test, markup, initial_balance=10000.0, point_cost=0.00001):
    balance = initial_balance
    trades = 0
    profits = []

    # Test the model on the test data
    predicted_labels = model.predict(X_test)
    for i in range(len(predicted_labels) - 10):
        if predicted_labels[i] == 1:
            # Open a long position
            entry_price = X_test.iloc[i]['close']
            exit_price = X_test.iloc[i+10]['close']
            if exit_price > entry_price + markup:
                # Close the long position with profit
                profit = (exit_price - entry_price - markup) / point_cost
                balance += profit
                trades += 1
                profits.append(profit)
            else:
                # Close the long position with loss
                loss = (entry_price - exit_price + markup) / point_cost
                balance -= loss
                trades += 1
                profits.append(-loss)
        elif predicted_labels[i] == 0:
            # Open a short position
            entry_price = X_test.iloc[i]['close']
            exit_price = X_test.iloc[i+10]['close']
            if exit_price < entry_price - markup:
                # Close the short position with profit
                profit = (entry_price - exit_price - markup) / point_cost
                balance += profit
                trades += 1
                profits.append(profit)
            else:
                # Close the short position with loss
                loss = (exit_price - entry_price + markup) / point_cost
                balance -= loss
                trades += 1
                profits.append(-loss)

    # Calculate the cumulative profit or loss
    total_profit = balance - initial_balance

    # Plot the equity curve: initial balance plus cumulative profit after each trade
    plt.plot(range(trades), initial_balance + np.cumsum(profits))
    plt.title('Balance Change')
    plt.xlabel('Trades')
    plt.ylabel('Balance ($)')
    plt.xticks(range(0, len(X_test), int(len(X_test)/10)), X_test.index[::int(len(X_test)/10)].strftime('%Y-%m-%d'))  # Add dates to the x-axis
    plt.show()

    # Print the results
    print(f"Cumulative profit or loss: {total_profit:.2f}")
    print(f"Number of trades: {trades}")


# Test the model with markup and target labels
initial_balance = 10000.0
markup = 0.00001
test_model(xgb_clf, X_test, y_test, markup, initial_balance)
