# -*- coding: utf-8 -*-
"""Classifier_Model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1P2dkCxEfqinqv9awSniRdWilnG-flNzb
"""

import numpy as np
import sklearn

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from os import path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

\
# Load example dataset
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/code25/metadata.csv' #file location

melanindata = pd.read_csv(path, delimiter=',') # name of the fileX, y = data.data, data.target

# defining data - posibly melanin lever and canser rates
X = melanindata.iloc[0:5634, 12].values
y = melanindata.iloc[0:5634, 19].values
a = melanindata.iloc[0:5634, 4].values
g = melanindata.iloc[0:5634, 30].values
b = melanindata.iloc[0:5634, 5].values

# Sample data (replace this with your actual data)
melanindata = pd.DataFrame({
    'Tumor_Type': X,  # Example values  # used to be [X]
    'Fizpatrick_Scale': y,  # Example values
    'Age': a,
    'Gender': g,
    'Anatom_Site_General': b,
    # Add other features if needed
})

melanindata = melanindata.dropna()

# Step 1: Label encoding for Tumor_Type (x-values)
label_encoder = LabelEncoder()
melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type'])


# Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values)
fizpatrick_map = {
    'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7
}

Anatom_Site_General_map = {
     'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7,
}
melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map)
melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender'])
melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map)


# Step 3: Selecting the feature columns (x-values) and target variable (y-values)
X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']]  # You can add more features here if needed
y = melanindata['Fizpatrick_Scale_Encoded']
# Step 4: Handle other categorical features if needed
# This step handles categorical features one by one for label encoding
x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1)

# Step 1: Label encoding for Tumor_Type (x-values)
label_encoder = LabelEncoder()
melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type'])


# Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values)
fizpatrick_map = {
    'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7
}

Anatom_Site_General_map = {
     'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7,
}
melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map)
melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender'])
melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map)


# Step 3: Selecting the feature columns (x-values) and target variable (y-values)
X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']]  # You can add more features here if needed
y = melanindata['Fizpatrick_Scale_Encoded']
# Step 4: Handle other categorical features if needed
# This step handles categorical features one by one for label encoding
x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1)

# prompt: print the key used for encoding telling the user which number corresponds to which site

print("Encoding for Anatom_Site_General:")
for class_label, encoded_value in zip(melanindata['Anatom_Site_General'].unique(), melanindata['Anatom_Site_General_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Gender:")
for class_label, encoded_value in zip(melanindata['Gender'].unique(), melanindata['Gender_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Tumor_Type:")
for class_label, encoded_value in zip(melanindata['Tumor_Type'].unique(), melanindata['Tumor_Type_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

print("\nEncoding for Fizpatrick_Scale:")
for class_label, encoded_value in zip(melanindata['Fizpatrick_Scale'].unique(), melanindata['Fizpatrick_Scale_Encoded'].unique()):
    print(f"{class_label}: {encoded_value}")

x_values = x_categorical[['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']].values
y_values = x_categorical['Tumor_Type_Encoded'].values

# Assuming X is your feature matrix and y is your target vector
X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)

# ---------- RANDOM FOREST ----------
print("Setting up Random Forest model...")

# 1. Define the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


# ---------- SVM ----------
print("\nTraining SVM model...")

# 1. Define the model with preprocessing pipeline
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # SVMs require scaled features
    ('svm', SVC(kernel='rbf', C=1.0, probability=True, random_state=42))
])

# 2. Train the model
svm_pipeline.fit(X_train, y_train)

# 3. Make predictions
svm_predictions = svm_pipeline.predict(X_test)

# 4. Evaluate
svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# ---------- HYPERPARAMETER TUNING SVM ----------
print("\nTuning SVM hyperparameters...")

# Define the parameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 0.01],
    'svm__kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    svm_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate the model with best parameters
best_svm = grid_search.best_estimator_
best_predictions = best_svm.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
print(f"Tuned SVM Accuracy: {best_accuracy:.4f}")
print("\nTuned SVM Classification Report:")
print(classification_report(y_test, best_predictions))

# @title Grid Search with RandomForestClassifier

'''
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [200, 500 ],
    'max_depth': [None, 20, 40],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Create the grid search with cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0, oob_score=True),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1,
)
try:
  # Fit the grid search to the data
  grid_search.fit(X_train, y_train)

  # Get the best parameters and best estimator
  best_params = grid_search.best_params_
  best_classifiers = grid_search.best_estimator_

  # Print results
  print(f"Best parameters: {best_params}")
  print(f"Best accuracy: {grid_search.best_score_}")

except Exception as e:
  print(f"Error during grid search: {e}")

'''

"""Notes from site "The code processes categorical data by encoding it numerically, combines the processed data with numerical data, and trains a Random Forest Regression model using the prepared data."
"""

classifier=RandomForestClassifier(random_state=0, ** {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200})
#RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151})
best_classifiers.get_params()


# @title Randomized Search with RandomForestClassifier

'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for randomized search
param_distributions = {
    'n_estimators': randint(10, 200),
    'max_depth': [None] + list(randint(5, 25).rvs(2)),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the randomized search with cross-validation
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=0, oob_score=True),
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available cores
    verbose=2,
    random_state=42
)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params_classifiers = random_search.best_params_
best_classifiers = random_search.best_estimator_

# Print results
print(f"Best parameters: {best_params_classifiers}")
print(f"Best score: {-random_search.best_score_}")
'''

'''

best_regressor2 = RandomForestClassifier(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151})
best_regressor2.fit(X_train, y_train)

'''

# @title Random forest training and predictions

# 2. Train the model
rf_model.fit(X_train, y_train)
classifier.fit(X_train, y_train)

# 3. Make predictions
rf_predictions = rf_model.predict(X_test)
rf_grid_predictions = classifier.predict(X_test)

# 4. Evaluate
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

rf_grid_accuracy = accuracy_score(y_test, rf_grid_predictions)
print(f"Random Forest Grid Search Optimized Accuracy: {rf_grid_accuracy:.4f}")
print("\nOptimized Random Forest Classification Report:")
print(classification_report(y_test, rf_grid_predictions))

"""`Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}`
Best score: 0.3403038353752471
"""

feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']
importances = classifier.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df)

importances2 = rf_model.feature_importances_
feature_imp_df2 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances2}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df2)

# prompt: Create a bar graphy for feature importance for feature_imp_df

import matplotlib.pyplot as plt
import seaborn as sns

# Assuming feature_imp_df is already defined as in your provided code

plt.figure(figsize=(10, 6))
sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df)
plt.title('Optimized Feature Importance Random Forest')
plt.show()

plt.savefig('FeatureImportance.png')

plt.figure(figsize=(10, 6))
sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df2)
plt.title('Feature Importance Random Forest')
plt.show()

plt.savefig('FeatureImportance2.png')

"""For out of bag data and prediction models prediction and errors

This will print our data as a plot with predicted values in green and actual values in blue
"""

'''
# Pick a desicion tree - index 0 is our 0
tree_to_plot = classifier.estimators_[1]

# Plot the decision tree
plt.figure(figsize=(90, 30))
plot_tree(tree_to_plot, feature_names=x_categorical.columns.tolist(), filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree from Random Forest")
plt.show()
'''


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


rf_predictions = rf_model.predict(X_test)
rf_model.fit(X_train, y_train)

rf_grid_predictions = classifier.predict(X_test)
classifier.fit(X_train, y_train)

svm_predictions = svm_pipeline.predict(X_test)
svm_pipeline.fit(X_train, y_train)


# For your basic Random Forest model (classifier)
rf_basic_prob = classifier.predict_proba(X_test) # Get probabilities for positive class

# For your optimized Random Forest model
rf_opt_prob = rf_model.predict_proba(X_test)

# Note: SVM needs probability=True during initialization to enable probability estimates
svm_prob = svm_pipeline.predict_proba(X_test)

# Calculate evaluation metrics
models = ['Basic RF', 'Optimized RF', 'SVM']
predictions = [rf_predictions, rf_grid_predictions, svm_predictions]
probabilities = [rf_basic_prob, rf_opt_prob, svm_prob]

# Accuracy metrics
metrics = {
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
}

for i, model in enumerate(models):
    metrics['Accuracy'].append(accuracy_score(y_test, predictions[i]))
    metrics['Precision'].append(precision_score(y_test, predictions[i], average='weighted', zero_division=0))
    metrics['Recall'].append(recall_score(y_test, predictions[i], average='weighted', zero_division=0))
    metrics['F1 Score'].append(f1_score(y_test, predictions[i], average='weighted', zero_division=0))
    metrics['ROC AUC'].append(roc_auc_score(y_test, probabilities[i], multi_class='ovr'))

# Create a table visualization
plt.figure(figsize=(10, 5))
ax = plt.subplot(111, frame_on=False)
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

# Format the table with 4 decimal places
table_vals = [[f"{val:.4f}" for val in metrics_df.loc[row_label]] for row_label in metrics_df.index] # corrected table_vals to get values from metrics_df

# Create the table using corrected table_vals
table = plt.table(
    cellText=table_vals,  # Use the formatted values for the table cells
    rowLabels=metrics_df.index,  # Model names as row labels
    colLabels=metrics_df.columns,  # Column names as column labels
    cellLoc='center',
    loc='center',
    colWidths=[0.15] * len(metrics_df.columns)
)

# Add value labels on top of bars
for i, p in enumerate(plt.gca().patches):
    height = p.get_height()
    plt.gca().text(
        p.get_x() + p.get_width()/2.,
        height + 0.01,
        f'{height:.4f}',
        ha="center",
        fontsize=8
    )

plt.tight_layout()
plt.savefig('model_metrics_comparison.png', dpi=300)

# Feature importance from Random Forest models
feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']

# Mock feature importances - replace with your actual values
importances = np.array([0.42, 0.30, 0.20, 0.08])  # basic RF
importances2 = np.array([0.45, 0.28, 0.18, 0.09])  # optimized RF

# Create dataframes
feature_imp_df = feature_imp_df.sort_values('Gini Importance', ascending=False)

feature_imp_df2 = feature_imp_df2.sort_values('Gini Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(12, 6))
width = 0.35
x = np.arange(len(feature_names))

# Create side-by-side bars for both RF models
plt.bar(x - width/2, feature_imp_df.set_index('Feature').loc[feature_names, 'Gini Importance'],
        width, label='Basic RF')
plt.bar(x + width/2, feature_imp_df2.set_index('Feature').loc[feature_names, 'Gini Importance'],
        width, label='Optimized RF')

plt.xlabel('Features', fontsize=14)
plt.ylabel('Gini Importance', fontsize=14)
plt.title('Feature Importance Comparison between RF Models', fontsize=16)
plt.xticks(x, feature_names, rotation=45, ha='right')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add value labels
for i, v in enumerate(feature_imp_df.set_index('Feature').loc[feature_names, 'Gini Importance']):
    plt.text(i - width/2, v + 0.01, f'{v:.4f}', ha='center', fontsize=9)

for i, v in enumerate(feature_imp_df2.set_index('Feature').loc[feature_names, 'Gini Importance']):
    plt.text(i + width/2, v + 0.01, f'{v:.4f}', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig('feature_importance_comparison.png', dpi=300)