import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load data
data = pd.read_csv('merged_astro_financial_data.csv')

# Convert date to datetime
data['date'] = pd.to_datetime(data['date'])

# Create lags for financial data
for col in ['open', 'high', 'low', 'close']:
    for lag in range(1, 6):  # Create lags from 1 to 5
        data[f'{col}_lag{lag}'] = data[col].shift(lag)

# Create lags for astronomical data
astro_cols = ['mercury', 'venus', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune']
for col in astro_cols:
    data[f'{col}_ra'] = data[col].apply(lambda x: eval(x)['ra'] if pd.notna(x) else np.nan)
    data[f'{col}_dec'] = data[col].apply(lambda x: eval(x)['dec'] if pd.notna(x) else np.nan)
    for lag in range(1, 6):  # Lags from 1 to 5
        data[f'{col}_ra_lag{lag}'] = data[f'{col}_ra'].shift(lag)
        data[f'{col}_dec_lag{lag}'] = data[f'{col}_dec'].shift(lag)
    data.drop(columns=[col, f'{col}_ra', f'{col}_dec'], inplace=True)

# Convert aspects to numerical features
aspect_cols = ['mercury_saturn', 'venus_mars', 'venus_jupiter', 'venus_uranus',
               'mars_jupiter', 'mars_uranus', 'jupiter_uranus', 'mercury_neptune',
               'venus_saturn', 'venus_neptune', 'mars_saturn', 'mercury_venus',
               'mars_neptune', 'mercury_uranus', 'saturn_neptune', 'mercury_jupiter',
               'mercury_mars', 'jupiter_saturn']

# Use LabelEncoder to encode aspects
label_encoders = {}
for col in aspect_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col].astype(str))

# Fill missing values with mean values for numerical columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Drop rows with missing values
data = data.dropna()

# Prepare features and target variable
features = [col for col in data.columns if col not in ['date', 'time', 'close']]
X = data[features]
y = data['close']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Create and train CatBoost model
model = CatBoostRegressor(iterations=500, learning_rate=0.5, depth=8, random_state=1)
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=2000, verbose=10)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

# Visualize feature importance
feature_importance = model.feature_importances_
feature_names = X.columns
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(feature_names)[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Feature Importance')
plt.show()

# Predict the next value
def predict_next():
    # Select the last row of data
    last_data = data.iloc[-1]
    input_features = last_data[features].values.reshape(1, -1)

    # Predict
    prediction = model.predict(input_features)
    print(f"Prediction for the next closing price: {prediction[0]}")

# Example usage of the prediction function
predict_next()

