# -*- coding: utf-8 -*-
"""Regression_Model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1W-LjG6Z8EB401gOX-5GbOqbW8AGxKo_U
"""

from os import path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/code25/metadata.csv' #file location

melanindata = pd.read_csv(path, delimiter=',') # name of the file

# defining data - posibly melanin lever and canser rates
X = melanindata.iloc[0:5634, 12].values
y = melanindata.iloc[0:5634, 19].values
a = melanindata.iloc[0:5634, 4].values
g = melanindata.iloc[0:5634, 30].values
b = melanindata.iloc[0:5634, 5].values

"""Notes from site "The code processes categorical data by encoding it numerically, combines the processed data with numerical data, and trains a Random Forest Regression model using the prepared data."
"""

# Sample data (replace this with your actual data)
melanindata = pd.DataFrame({
    'Tumor_Type': X,  # Example values  # used to be [X]
    'Fizpatrick_Scale': y,  # Example values
    'Age': a,
    'Gender': g,
    'Anatom_Site_General': b,
    # Add other features if needed
})

melanindata = melanindata.dropna()

# Step 1: Label encoding for Tumor_Type (x-values)
label_encoder = LabelEncoder()
melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type'])


# Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values)
fizpatrick_map = {
    'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7
}

Anatom_Site_General_map = {
     'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7,
}
melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map)
melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender'])
melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map)


# Step 3: Selecting the feature columns (x-values) and target variable (y-values)
X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']]  # You can add more features here if needed
y = melanindata['Fizpatrick_Scale_Encoded']
# Step 4: Handle other categorical features if needed
# This step handles categorical features one by one for label encoding
x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1)

# prompt: print the key used for encoding telling the user which number corresponds to which site

print("Encoding for Anatom_Site_General:")
for class_label, encoded_value in zip(melanindata['Anatom_Site_General'].unique(), melanindata['Anatom_Site_General_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Gender:")
for class_label, encoded_value in zip(melanindata['Gender'].unique(), melanindata['Gender_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Tumor_Type:")
for class_label, encoded_value in zip(melanindata['Tumor_Type'].unique(), melanindata['Tumor_Type_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Fizpatrick_Scale:")
for class_label, encoded_value in zip(melanindata['Fizpatrick_Scale'].unique(), melanindata['Fizpatrick_Scale_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

x_values = x_categorical[['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']].values
y_values = x_categorical['Tumor_Type_Encoded'].values

from sklearn.model_selection import train_test_split
# Assuming X is your feature matrix and y is your target vector
X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)

melanindata.to_excel("melanindata.xlsx")

# @title Fizpatrick_Scale

from matplotlib import pyplot as plt
import seaborn as sns
melanindata.groupby('Fizpatrick_Scale').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.xlabel('Number of Samples')
plt.gca().spines[['top', 'right',]].set_visible(False)
plt.title('Number of Samples by Fizpatrick Scale')
melanindata.groupby('Fizpatrick_Scale').size()
plt.savefig('FizpatrickDataSize.png')  # Save first plot

display(melanindata.groupby('Fizpatrick_Scale').size().to_frame('Count').style.background_gradient(cmap='binary'))

# @title Gender

from matplotlib import pyplot as plt
import seaborn as sns
melanindata.groupby('Gender').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
plt.xlabel('Number of Samples')
plt.title('Number of Samples by Gender')
plt.savefig('GenderSize.png')  # Save first plot

display(melanindata.groupby('Gender').size().to_frame('Count').style.background_gradient(cmap='binary'))

# @title Anatom_Site_General

from matplotlib import pyplot as plt
import seaborn as sns
melanindata.groupby('Anatom_Site_General').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
melanindata.groupby('Anatom_Site_General').size()
plt.xlabel('Number of Samples')
plt.title('Number of Samples by Anatomical Position')
plt.savefig('Anatom_Site_GeneralSize.png')  # Save first plot

display(melanindata.groupby('Anatom_Site_General').size().to_frame('Count').style.background_gradient(cmap='binary'))

# @title Tumor_Type

from matplotlib import pyplot as plt
import seaborn as sns
melanindata.groupby('Tumor_Type').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

melanindata.groupby('Tumor_Type').size()
plt.xlabel('Number of Samples')
plt.title('Number of Samples by Tumor Type')
plt.savefig('Tumor_TypeSize.png')
display(melanindata.groupby('Tumor_Type').size().to_frame('Count').style.background_gradient(cmap='binary'))

# @title Age

from matplotlib import pyplot as plt
melanindata['Age'].plot(kind='hist', bins=80, title='Age')
plt.gca().spines[['top', 'right',]].set_visible(False)
plt.xlabel('Age')
plt.title('Number of Samples by Age')
plt.savefig('AgeSize.png')

# Step 6: Initialize and train the Random Forest Regressor
basic_regressor = RandomForestRegressor(n_estimators=500, random_state=0, oob_score=True)
basic_regressor.fit(x_values, y_values)

# @title Grid Search with RandomForestRegressor

'''
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Create the grid search with cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0, oob_score=True),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_regressor2 = grid_search.best_estimator_

# Print results
print(f"Best parameters: {best_params}")
print(f"Best score: {-grid_search.best_score_}")
'''

# @title Randomized Search with RandomForestRegressor

'''

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for randomized search
param_distributions = {
    'n_estimators': randint(10, 200),
    'max_depth': [None] + list(randint(5, 25).rvs(2)),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the randomized search with cross-validation
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0, oob_score=True),
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available cores
    verbose=1,
    random_state=42
)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_regressor = random_search.best_params_
best_regressor = random_search.best_estimator_

# Print results
print(f"Best parameters: {best_params_regressor}")
print(f"Best score: {-random_search.best_score_}")

'''

best_regressor = RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(11), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151})
best_regressor.fit(X_train, y_train)

# @title Randomized Search with RandomForestClassifier

'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for randomized search
param_distributions = {
    'n_estimators': randint(10, 200),
    'max_depth': [None] + list(randint(5, 25).rvs(2)),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the randomized search with cross-validation
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=0, oob_score=True),
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42
)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_classifiers = random_search.best_params_
best_classifiers = random_search.best_estimator_

# Print results
print(f"Best parameters: {best_params_classifiers}")
print(f"Best score: {-random_search.best_score_}")
'''

best_regressor2 = RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151})
best_regressor2.fit(X_train, y_train)

"""`Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}`
Best score: 0.3403038353752471
"""

regressor0=basic_regressor
basic_regressor.get_params()

regressor=best_regressor
best_regressor.get_params()

regressor2=best_regressor2
best_regressor2.get_params()

feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']
importances = regressor.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df)

importances0 = regressor0.feature_importances_
feature_imp_df0 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances0}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df0)

importances2 = regressor2.feature_importances_
feature_imp_df2 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances2}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df2)

# prompt: Create a bar graphy for feature importance for feature_imp_df

import matplotlib.pyplot as plt
import seaborn as sns

# Assuming feature_imp_df is already defined as in your provided code

plt.figure(figsize=(10, 6))
sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df)
plt.title('Feature Importance for random grid search optimized regressor')
plt.show()

plt.savefig('FeatureImportance.png')

plt.figure(figsize=(10, 6))
sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df0)
plt.title('Feature Importance for basic regressor')
plt.show()

plt.savefig('FeatureImportance0.png')

plt.figure(figsize=(10, 6))
sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df2)
plt.title('Feature Importance for grid search optimized regressor')
plt.show()

plt.savefig('FeatureImportance2.png')

# Step 7: Now you can make predictions or evaluate the model
# Example: Predict the Fizpatrick Scale for a new sample

predictions_loop_data = []

for gender in range(1, 8):
  for fizpatrick in range(0, 125, 10):
    for site in range(0, 7):
      for age in range(1, 2):
        prediction = regressor.predict([[fizpatrick, age, gender, site]])  # Make sure to pass it as a 2D array
        # Create a dictionary with hashable keys
        data_point = {'fizpatrick': fizpatrick, 'age': age, 'site': site, 'gender': gender, 'prediction': prediction[0]}
        # Append the dictionary to the list
        predictions_loop_data.append(data_point)
        #print(f"Prediction for new sample fizpatric={fizpatrick}, age={age},location={site}, gender={gender},: {prediction}")
        warnings.filterwarnings(action="ignore")

#print(predictions_loop_data)

"""For out of bag data and prediction models prediction and errors"""

# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score

# Access the OOB Score
ob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {ob_score}')

# Making predictions on the same data or new data
predictions = regressor.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

regressor.get_params()

ob_score2 = regressor2.oob_score_
print(f'Out-of-Bag Score: {ob_score2}')

predictions2 = regressor2.predict(X_test)

mse2 = mean_squared_error(y_test, predictions2)
print(f'Mean Squared Error: {mse2}')

r22 = r2_score(y_test, predictions2)
print(f'R-squared: {r22}')

ob_score0 = regressor0.oob_score_
print(f'Out-of-Bag Score: {ob_score0}')

predictions0 = regressor0.predict(X_test)

mse0 = mean_squared_error(y_test, predictions0)
print(f'Mean Squared Error: {mse0}')

r20 = r2_score(y_test, predictions0)
print(f'R-squared: {r20}')

# prompt: Graph comparing the different ob_scores, mse scores, and r2 scores for the three different regression models.

import matplotlib.pyplot as plt

# Sample data (replace with your actual scores)
models = ['Basic Regressor', 'Randomized Search Optimized Regressor', 'Grid Search Optimized Regressor']
ob_scores = [ob_score0, ob_score, ob_score2]
mse_scores = [mse0, mse, mse2]
r2_scores = [r20, r2, r22]

# Create the plot
x = range(len(models))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))

rects1 = ax.bar(x, ob_scores, width, label='OOB Score')
rects2 = ax.bar([i + width for i in x], mse_scores, width, label='MSE Score')
rects3 = ax.bar([i + 2 * width for i in x], r2_scores, width, label='R^2 Score')


# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Model Performance Comparison')
ax.set_xticks([i + width for i in x])
ax.set_xticklabels(models)

ax.legend()


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{:.4f}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

fig.tight_layout()

plt.savefig('ModelPerformanceComparison.png')

plt.show()

"""This will print our data as a plot with predicted values in green and actual values in blue"""

# Pick a desicion tree - index 0 is our 0
tree_to_plot = regressor.estimators_[1]

# Plot the decision tree
plt.figure(figsize=(90, 30))
plot_tree(tree_to_plot, feature_names=x_categorical.columns.tolist(), filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree from Random Forest")
plt.show()