# -*- coding: utf-8 -*- """Classifier_Model.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1P2dkCxEfqinqv9awSniRdWilnG-flNzb """ import numpy as np import sklearn from sklearn.preprocessing import LabelEncoder from sklearn.impute import KNNImputer from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.tree import plot_tree import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.metrics import accuracy_score, classification_report from sklearn.svm import SVC from sklearn.pipeline import Pipeline from os import path import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import warnings \ # Load example dataset from google.colab import drive drive.mount('/content/drive') path = '/content/drive/MyDrive/code25/metadata.csv' #file location melanindata = pd.read_csv(path, delimiter=',') # name of the fileX, y = data.data, data.target # defining data - posibly melanin lever and canser rates X = melanindata.iloc[0:5634, 12].values y = melanindata.iloc[0:5634, 19].values a = melanindata.iloc[0:5634, 4].values g = melanindata.iloc[0:5634, 30].values b = melanindata.iloc[0:5634, 5].values # Sample data (replace this with your actual data) melanindata = pd.DataFrame({ 'Tumor_Type': X, # Example values # used to be [X] 'Fizpatrick_Scale': y, # Example values 'Age': a, 'Gender': g, 'Anatom_Site_General': b, # Add other features if needed }) melanindata = melanindata.dropna() # Step 1: Label encoding for Tumor_Type (x-values) label_encoder = LabelEncoder() melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type']) # Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values) fizpatrick_map = { 'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7 } Anatom_Site_General_map = { 'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7, } melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map) melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender']) melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map) # Step 3: Selecting the feature columns (x-values) and target variable (y-values) X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']] # You can add more features here if needed y = melanindata['Fizpatrick_Scale_Encoded'] # Step 4: Handle other categorical features if needed # This step handles categorical features one by one for label encoding x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1) # Step 1: Label encoding for Tumor_Type (x-values) label_encoder = LabelEncoder() melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type']) # Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values) fizpatrick_map = { 'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7 } Anatom_Site_General_map = { 'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7, } melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map) melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender']) melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map) # Step 3: Selecting the feature columns (x-values) and target variable (y-values) X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']] # You can add more features here if needed y = melanindata['Fizpatrick_Scale_Encoded'] # Step 4: Handle other categorical features if needed # This step handles categorical features one by one for label encoding x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1) # prompt: print the key used for encoding telling the user which number corresponds to which site print("Encoding for Anatom_Site_General:") for class_label, encoded_value in zip(melanindata['Anatom_Site_General'].unique(), melanindata['Anatom_Site_General_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Gender:") for class_label, encoded_value in zip(melanindata['Gender'].unique(), melanindata['Gender_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Tumor_Type:") for class_label, encoded_value in zip(melanindata['Tumor_Type'].unique(), melanindata['Tumor_Type_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Fizpatrick_Scale:") for class_label, encoded_value in zip(melanindata['Fizpatrick_Scale'].unique(), melanindata['Fizpatrick_Scale_Encoded'].unique()): print(f"{class_label}: {encoded_value}") x_values = x_categorical[['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']].values y_values = x_categorical['Tumor_Type_Encoded'].values # Assuming X is your feature matrix and y is your target vector X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42) # ---------- RANDOM FOREST ---------- print("Setting up Random Forest model...") # 1. Define the model rf_model = RandomForestClassifier(n_estimators=100, random_state=42) # ---------- SVM ---------- print("\nTraining SVM model...") # 1. Define the model with preprocessing pipeline svm_pipeline = Pipeline([ ('scaler', StandardScaler()), # SVMs require scaled features ('svm', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)) ]) # 2. Train the model svm_pipeline.fit(X_train, y_train) # 3. Make predictions svm_predictions = svm_pipeline.predict(X_test) # 4. Evaluate svm_accuracy = accuracy_score(y_test, svm_predictions) print(f"SVM Accuracy: {svm_accuracy:.4f}") print("\nSVM Classification Report:") print(classification_report(y_test, svm_predictions)) # ---------- HYPERPARAMETER TUNING SVM ---------- print("\nTuning SVM hyperparameters...") # Define the parameter grid param_grid = { 'svm__C': [0.1, 1, 10, 100], 'svm__gamma': ['scale', 'auto', 0.1, 0.01], 'svm__kernel': ['rbf', 'linear', 'poly', 'sigmoid'] } # Set up GridSearchCV grid_search = GridSearchCV( svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1 ) # Perform the grid search grid_search.fit(X_train, y_train) # Get the best parameters print(f"Best parameters: {grid_search.best_params_}") # Evaluate the model with best parameters best_svm = grid_search.best_estimator_ best_predictions = best_svm.predict(X_test) best_accuracy = accuracy_score(y_test, best_predictions) print(f"Tuned SVM Accuracy: {best_accuracy:.4f}") print("\nTuned SVM Classification Report:") print(classification_report(y_test, best_predictions)) # @title Grid Search with RandomForestClassifier ''' from sklearn.model_selection import GridSearchCV # Define parameter grid param_grid = { 'n_estimators': [200, 500 ], 'max_depth': [None, 20, 40], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': ['sqrt', 'log2', None], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy'] } # Create the grid search with cross-validation grid_search = GridSearchCV( estimator=RandomForestClassifier(random_state=0, oob_score=True), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, # Use all available cores verbose=1, ) try: # Fit the grid search to the data grid_search.fit(X_train, y_train) # Get the best parameters and best estimator best_params = grid_search.best_params_ best_classifiers = grid_search.best_estimator_ # Print results print(f"Best parameters: {best_params}") print(f"Best accuracy: {grid_search.best_score_}") except Exception as e: print(f"Error during grid search: {e}") ''' """Notes from site "The code processes categorical data by encoding it numerically, combines the processed data with numerical data, and trains a Random Forest Regression model using the prepared data." """ classifier=RandomForestClassifier(random_state=0, ** {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}) #RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151}) best_classifiers.get_params() # @title Randomized Search with RandomForestClassifier ''' from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint, uniform # Define parameter distributions for randomized search param_distributions = { 'n_estimators': randint(10, 200), 'max_depth': [None] + list(randint(5, 25).rvs(2)), 'min_samples_split': randint(2, 15), 'min_samples_leaf': randint(1, 10), 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False] } # Create the randomized search with cross-validation random_search = RandomizedSearchCV( estimator=RandomForestClassifier(random_state=0, oob_score=True), param_distributions=param_distributions, n_iter=50, # Number of parameter settings sampled cv=5, scoring='neg_mean_squared_error', n_jobs=-1, # Use all available cores verbose=2, random_state=42 ) # Fit the randomized search to the data random_search.fit(X_train, y_train) # Get the best parameters and best estimator best_params_classifiers = random_search.best_params_ best_classifiers = random_search.best_estimator_ # Print results print(f"Best parameters: {best_params_classifiers}") print(f"Best score: {-random_search.best_score_}") ''' ''' best_regressor2 = RandomForestClassifier(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151}) best_regressor2.fit(X_train, y_train) ''' # @title Random forest training and predictions # 2. Train the model rf_model.fit(X_train, y_train) classifier.fit(X_train, y_train) # 3. Make predictions rf_predictions = rf_model.predict(X_test) rf_grid_predictions = classifier.predict(X_test) # 4. Evaluate rf_accuracy = accuracy_score(y_test, rf_predictions) print(f"Random Forest Accuracy: {rf_accuracy:.4f}") print("\nRandom Forest Classification Report:") print(classification_report(y_test, rf_predictions)) rf_grid_accuracy = accuracy_score(y_test, rf_grid_predictions) print(f"Random Forest Grid Search Optimized Accuracy: {rf_grid_accuracy:.4f}") print("\nOptimized Random Forest Classification Report:") print(classification_report(y_test, rf_grid_predictions)) """`Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}` Best score: 0.3403038353752471 """ feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded'] importances = classifier.feature_importances_ feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) print(feature_imp_df) importances2 = rf_model.feature_importances_ feature_imp_df2 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances2}).sort_values('Gini Importance', ascending=False) print(feature_imp_df2) # prompt: Create a bar graphy for feature importance for feature_imp_df import matplotlib.pyplot as plt import seaborn as sns # Assuming feature_imp_df is already defined as in your provided code plt.figure(figsize=(10, 6)) sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df) plt.title('Optimized Feature Importance Random Forest') plt.show() plt.savefig('FeatureImportance.png') plt.figure(figsize=(10, 6)) sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df2) plt.title('Feature Importance Random Forest') plt.show() plt.savefig('FeatureImportance2.png') """For out of bag data and prediction models prediction and errors This will print our data as a plot with predicted values in green and actual values in blue """ ''' # Pick a desicion tree - index 0 is our 0 tree_to_plot = classifier.estimators_[1] # Plot the decision tree plt.figure(figsize=(90, 30)) plot_tree(tree_to_plot, feature_names=x_categorical.columns.tolist(), filled=True, rounded=True, fontsize=10) plt.title("Decision Tree from Random Forest") plt.show() ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix rf_predictions = rf_model.predict(X_test) rf_model.fit(X_train, y_train) rf_grid_predictions = classifier.predict(X_test) classifier.fit(X_train, y_train) svm_predictions = svm_pipeline.predict(X_test) svm_pipeline.fit(X_train, y_train) # For your basic Random Forest model (classifier) rf_basic_prob = classifier.predict_proba(X_test) # Get probabilities for positive class # For your optimized Random Forest model rf_opt_prob = rf_model.predict_proba(X_test) # Note: SVM needs probability=True during initialization to enable probability estimates svm_prob = svm_pipeline.predict_proba(X_test) # Calculate evaluation metrics models = ['Basic RF', 'Optimized RF', 'SVM'] predictions = [rf_predictions, rf_grid_predictions, svm_predictions] probabilities = [rf_basic_prob, rf_opt_prob, svm_prob] # Accuracy metrics metrics = { 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': [], 'ROC AUC': [] } for i, model in enumerate(models): metrics['Accuracy'].append(accuracy_score(y_test, predictions[i])) metrics['Precision'].append(precision_score(y_test, predictions[i], average='weighted', zero_division=0)) metrics['Recall'].append(recall_score(y_test, predictions[i], average='weighted', zero_division=0)) metrics['F1 Score'].append(f1_score(y_test, predictions[i], average='weighted', zero_division=0)) metrics['ROC AUC'].append(roc_auc_score(y_test, probabilities[i], multi_class='ovr')) # Create a table visualization plt.figure(figsize=(10, 5)) ax = plt.subplot(111, frame_on=False) ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) # Format the table with 4 decimal places table_vals = [[f"{val:.4f}" for val in metrics_df.loc[row_label]] for row_label in metrics_df.index] # corrected table_vals to get values from metrics_df # Create the table using corrected table_vals table = plt.table( cellText=table_vals, # Use the formatted values for the table cells rowLabels=metrics_df.index, # Model names as row labels colLabels=metrics_df.columns, # Column names as column labels cellLoc='center', loc='center', colWidths=[0.15] * len(metrics_df.columns) ) # Add value labels on top of bars for i, p in enumerate(plt.gca().patches): height = p.get_height() plt.gca().text( p.get_x() + p.get_width()/2., height + 0.01, f'{height:.4f}', ha="center", fontsize=8 ) plt.tight_layout() plt.savefig('model_metrics_comparison.png', dpi=300) # Feature importance from Random Forest models feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded'] # Mock feature importances - replace with your actual values importances = np.array([0.42, 0.30, 0.20, 0.08]) # basic RF importances2 = np.array([0.45, 0.28, 0.18, 0.09]) # optimized RF # Create dataframes feature_imp_df = feature_imp_df.sort_values('Gini Importance', ascending=False) feature_imp_df2 = feature_imp_df2.sort_values('Gini Importance', ascending=False) # Plot feature importances plt.figure(figsize=(12, 6)) width = 0.35 x = np.arange(len(feature_names)) # Create side-by-side bars for both RF models plt.bar(x - width/2, feature_imp_df.set_index('Feature').loc[feature_names, 'Gini Importance'], width, label='Basic RF') plt.bar(x + width/2, feature_imp_df2.set_index('Feature').loc[feature_names, 'Gini Importance'], width, label='Optimized RF') plt.xlabel('Features', fontsize=14) plt.ylabel('Gini Importance', fontsize=14) plt.title('Feature Importance Comparison between RF Models', fontsize=16) plt.xticks(x, feature_names, rotation=45, ha='right') plt.legend() plt.grid(axis='y', linestyle='--', alpha=0.7) # Add value labels for i, v in enumerate(feature_imp_df.set_index('Feature').loc[feature_names, 'Gini Importance']): plt.text(i - width/2, v + 0.01, f'{v:.4f}', ha='center', fontsize=9) for i, v in enumerate(feature_imp_df2.set_index('Feature').loc[feature_names, 'Gini Importance']): plt.text(i + width/2, v + 0.01, f'{v:.4f}', ha='center', fontsize=9) plt.tight_layout() plt.savefig('feature_importance_comparison.png', dpi=300)