# -*- coding: utf-8 -*- """Regression_Model.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1W-LjG6Z8EB401gOX-5GbOqbW8AGxKo_U """ from os import path import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import sklearn import warnings import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.impute import KNNImputer from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.tree import plot_tree import matplotlib.pyplot as plt from google.colab import drive drive.mount('/content/drive') path = '/content/drive/MyDrive/code25/metadata.csv' #file location melanindata = pd.read_csv(path, delimiter=',') # name of the file # defining data - posibly melanin lever and canser rates X = melanindata.iloc[0:5634, 12].values y = melanindata.iloc[0:5634, 19].values a = melanindata.iloc[0:5634, 4].values g = melanindata.iloc[0:5634, 30].values b = melanindata.iloc[0:5634, 5].values """Notes from site "The code processes categorical data by encoding it numerically, combines the processed data with numerical data, and trains a Random Forest Regression model using the prepared data." """ # Sample data (replace this with your actual data) melanindata = pd.DataFrame({ 'Tumor_Type': X, # Example values # used to be [X] 'Fizpatrick_Scale': y, # Example values 'Age': a, 'Gender': g, 'Anatom_Site_General': b, # Add other features if needed }) melanindata = melanindata.dropna() # Step 1: Label encoding for Tumor_Type (x-values) label_encoder = LabelEncoder() melanindata['Tumor_Type_Encoded'] = label_encoder.fit_transform(melanindata['Tumor_Type']) # Step 2: Map Roman numerals to integers for Fizpatrick_Scale (y-values) fizpatrick_map = { 'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7 } Anatom_Site_General_map = { 'oral/genital': 0, 'head/neck': 1, 'upper extremity': 2, 'palms/soles': 3, 'lower extremity': 4, 'anterior torso': 5, 'lateral torso': 6, 'posterior torso': 7, } melanindata['Fizpatrick_Scale_Encoded'] = melanindata['Fizpatrick_Scale'].map(fizpatrick_map) melanindata['Gender_Encoded'] = label_encoder.fit_transform(melanindata['Gender']) melanindata['Anatom_Site_General_Encoded'] = melanindata['Anatom_Site_General'].map(Anatom_Site_General_map) # Step 3: Selecting the feature columns (x-values) and target variable (y-values) X = melanindata[['Tumor_Type_Encoded', 'Age', 'Gender_Encoded']] # You can add more features here if needed y = melanindata['Fizpatrick_Scale_Encoded'] # Step 4: Handle other categorical features if needed # This step handles categorical features one by one for label encoding x_categorical = melanindata.drop(['Tumor_Type', 'Fizpatrick_Scale', 'Gender'], axis=1) # prompt: print the key used for encoding telling the user which number corresponds to which site print("Encoding for Anatom_Site_General:") for class_label, encoded_value in zip(melanindata['Anatom_Site_General'].unique(), melanindata['Anatom_Site_General_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Gender:") for class_label, encoded_value in zip(melanindata['Gender'].unique(), melanindata['Gender_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Tumor_Type:") for class_label, encoded_value in zip(melanindata['Tumor_Type'].unique(), melanindata['Tumor_Type_Encoded'].unique()): print(f"{class_label}: {encoded_value}") print("\nEncoding for Fizpatrick_Scale:") for class_label, encoded_value in zip(melanindata['Fizpatrick_Scale'].unique(), melanindata['Fizpatrick_Scale_Encoded'].unique()): print(f"{class_label}: {encoded_value}") x_values = x_categorical[['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded']].values y_values = x_categorical['Tumor_Type_Encoded'].values from sklearn.model_selection import train_test_split # Assuming X is your feature matrix and y is your target vector X_train, X_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42) melanindata.to_excel("melanindata.xlsx") # @title Fizpatrick_Scale from matplotlib import pyplot as plt import seaborn as sns melanindata.groupby('Fizpatrick_Scale').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2')) plt.xlabel('Number of Samples') plt.gca().spines[['top', 'right',]].set_visible(False) plt.title('Number of Samples by Fizpatrick Scale') melanindata.groupby('Fizpatrick_Scale').size() plt.savefig('FizpatrickDataSize.png') # Save first plot display(melanindata.groupby('Fizpatrick_Scale').size().to_frame('Count').style.background_gradient(cmap='binary')) # @title Gender from matplotlib import pyplot as plt import seaborn as sns melanindata.groupby('Gender').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2')) plt.gca().spines[['top', 'right',]].set_visible(False) plt.xlabel('Number of Samples') plt.title('Number of Samples by Gender') plt.savefig('GenderSize.png') # Save first plot display(melanindata.groupby('Gender').size().to_frame('Count').style.background_gradient(cmap='binary')) # @title Anatom_Site_General from matplotlib import pyplot as plt import seaborn as sns melanindata.groupby('Anatom_Site_General').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2')) plt.gca().spines[['top', 'right',]].set_visible(False) melanindata.groupby('Anatom_Site_General').size() plt.xlabel('Number of Samples') plt.title('Number of Samples by Anatomical Position') plt.savefig('Anatom_Site_GeneralSize.png') # Save first plot display(melanindata.groupby('Anatom_Site_General').size().to_frame('Count').style.background_gradient(cmap='binary')) # @title Tumor_Type from matplotlib import pyplot as plt import seaborn as sns melanindata.groupby('Tumor_Type').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2')) plt.gca().spines[['top', 'right',]].set_visible(False) melanindata.groupby('Tumor_Type').size() plt.xlabel('Number of Samples') plt.title('Number of Samples by Tumor Type') plt.savefig('Tumor_TypeSize.png') display(melanindata.groupby('Tumor_Type').size().to_frame('Count').style.background_gradient(cmap='binary')) # @title Age from matplotlib import pyplot as plt melanindata['Age'].plot(kind='hist', bins=80, title='Age') plt.gca().spines[['top', 'right',]].set_visible(False) plt.xlabel('Age') plt.title('Number of Samples by Age') plt.savefig('AgeSize.png') # Step 6: Initialize and train the Random Forest Regressor basic_regressor = RandomForestRegressor(n_estimators=500, random_state=0, oob_score=True) basic_regressor.fit(x_values, y_values) # @title Grid Search with RandomForestRegressor ''' from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV # Define parameter grid param_grid = { 'n_estimators': [100, 300, 500], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt'] } # Create the grid search with cross-validation grid_search = GridSearchCV( estimator=RandomForestRegressor(random_state=0, oob_score=True), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, # Use all available cores verbose=1 ) # Fit the grid search to the data grid_search.fit(X_train, y_train) # Get the best parameters and best estimator best_params = grid_search.best_params_ best_regressor2 = grid_search.best_estimator_ # Print results print(f"Best parameters: {best_params}") print(f"Best score: {-grid_search.best_score_}") ''' # @title Randomized Search with RandomForestRegressor ''' from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint, uniform # Define parameter distributions for randomized search param_distributions = { 'n_estimators': randint(10, 200), 'max_depth': [None] + list(randint(5, 25).rvs(2)), 'min_samples_split': randint(2, 15), 'min_samples_leaf': randint(1, 10), 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False] } # Create the randomized search with cross-validation random_search = RandomizedSearchCV( estimator=RandomForestRegressor(random_state=0, oob_score=True), param_distributions=param_distributions, n_iter=50, # Number of parameter settings sampled cv=5, scoring='neg_mean_squared_error', n_jobs=-1, # Use all available cores verbose=1, random_state=42 ) # Fit the randomized search to the data random_search.fit(X_train, y_train) # Get the best parameters and best estimator best_params_regressor = random_search.best_params_ best_regressor = random_search.best_estimator_ # Print results print(f"Best parameters: {best_params_regressor}") print(f"Best score: {-random_search.best_score_}") ''' best_regressor = RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(11), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151}) best_regressor.fit(X_train, y_train) # @title Randomized Search with RandomForestClassifier ''' from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint, uniform # Define parameter distributions for randomized search param_distributions = { 'n_estimators': randint(10, 200), 'max_depth': [None] + list(randint(5, 25).rvs(2)), 'min_samples_split': randint(2, 15), 'min_samples_leaf': randint(1, 10), 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False] } # Create the randomized search with cross-validation random_search = RandomizedSearchCV( estimator=RandomForestClassifier(random_state=0, oob_score=True), param_distributions=param_distributions, n_iter=50, # Number of parameter settings sampled cv=5, scoring='neg_mean_squared_error', n_jobs=-1, # Use all available cores verbose=2, random_state=42 ) # Fit the randomized search to the data random_search.fit(X_train, y_train) # Get the best parameters and best estimator best_params_classifiers = random_search.best_params_ best_classifiers = random_search.best_estimator_ # Print results print(f"Best parameters: {best_params_classifiers}") print(f"Best score: {-random_search.best_score_}") ''' best_regressor2 = RandomForestRegressor(random_state=0, oob_score=True, **{'bootstrap': True, 'max_depth': np.int64(9), 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 151}) best_regressor2.fit(X_train, y_train) """`Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}` Best score: 0.3403038353752471 """ regressor0=basic_regressor basic_regressor.get_params() regressor=best_regressor best_regressor.get_params() regressor2=best_regressor2 best_regressor2.get_params() feature_names = ['Fizpatrick_Scale_Encoded', 'Age', 'Gender_Encoded', 'Anatom_Site_General_Encoded'] importances = regressor.feature_importances_ feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) print(feature_imp_df) importances0 = regressor0.feature_importances_ feature_imp_df0 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances0}).sort_values('Gini Importance', ascending=False) print(feature_imp_df0) importances2 = regressor2.feature_importances_ feature_imp_df2 = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances2}).sort_values('Gini Importance', ascending=False) print(feature_imp_df2) # prompt: Create a bar graphy for feature importance for feature_imp_df import matplotlib.pyplot as plt import seaborn as sns # Assuming feature_imp_df is already defined as in your provided code plt.figure(figsize=(10, 6)) sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df) plt.title('Feature Importance for random grid search optimized regressor') plt.show() plt.savefig('FeatureImportance.png') plt.figure(figsize=(10, 6)) sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df0) plt.title('Feature Importance for basic regressor') plt.show() plt.savefig('FeatureImportance0.png') plt.figure(figsize=(10, 6)) sns.barplot(x='Gini Importance', y='Feature', data=feature_imp_df2) plt.title('Feature Importance for grid search optimized regressor') plt.show() plt.savefig('FeatureImportance2.png') # Step 7: Now you can make predictions or evaluate the model # Example: Predict the Fizpatrick Scale for a new sample predictions_loop_data = [] for gender in range(1, 8): for fizpatrick in range(0, 125, 10): for site in range(0, 7): for age in range(1, 2): prediction = regressor.predict([[fizpatrick, age, gender, site]]) # Make sure to pass it as a 2D array # Create a dictionary with hashable keys data_point = {'fizpatrick': fizpatrick, 'age': age, 'site': site, 'gender': gender, 'prediction': prediction[0]} # Append the dictionary to the list predictions_loop_data.append(data_point) #print(f"Prediction for new sample fizpatric={fizpatrick}, age={age},location={site}, gender={gender},: {prediction}") warnings.filterwarnings(action="ignore") #print(predictions_loop_data) """For out of bag data and prediction models prediction and errors""" # Evaluating the model from sklearn.metrics import mean_squared_error, r2_score # Access the OOB Score ob_score = regressor.oob_score_ print(f'Out-of-Bag Score: {ob_score}') # Making predictions on the same data or new data predictions = regressor.predict(X_test) # Evaluating the model mse = mean_squared_error(y_test, predictions) print(f'Mean Squared Error: {mse}') r2 = r2_score(y_test, predictions) print(f'R-squared: {r2}') regressor.get_params() ob_score2 = regressor2.oob_score_ print(f'Out-of-Bag Score: {ob_score2}') predictions2 = regressor2.predict(X_test) mse2 = mean_squared_error(y_test, predictions2) print(f'Mean Squared Error: {mse2}') r22 = r2_score(y_test, predictions2) print(f'R-squared: {r22}') ob_score0 = regressor0.oob_score_ print(f'Out-of-Bag Score: {ob_score0}') predictions0 = regressor0.predict(X_test) mse0 = mean_squared_error(y_test, predictions0) print(f'Mean Squared Error: {mse0}') r20 = r2_score(y_test, predictions0) print(f'R-squared: {r20}') # prompt: Graph comparing the different ob_scores, mse scores, and r2 scores for the three different regression models. import matplotlib.pyplot as plt # Sample data (replace with your actual scores) models = ['Basic Regressor', 'Randomized Search Optimized Regressor', 'Grid Search Optimized Regressor'] ob_scores = [ob_score0, ob_score, ob_score2] mse_scores = [mse0, mse, mse2] r2_scores = [r20, r2, r22] # Create the plot x = range(len(models)) width = 0.2 fig, ax = plt.subplots(figsize=(10, 6)) rects1 = ax.bar(x, ob_scores, width, label='OOB Score') rects2 = ax.bar([i + width for i in x], mse_scores, width, label='MSE Score') rects3 = ax.bar([i + 2 * width for i in x], r2_scores, width, label='R^2 Score') # Add some text for labels, title and custom x-axis tick labels, etc. ax.set_ylabel('Scores') ax.set_title('Model Performance Comparison') ax.set_xticks([i + width for i in x]) ax.set_xticklabels(models) ax.legend() def autolabel(rects): """Attach a text label above each bar in *rects*, displaying its height.""" for rect in rects: height = rect.get_height() ax.annotate('{:.4f}'.format(height), xy=(rect.get_x() + rect.get_width() / 2, height), xytext=(0, 3), # 3 points vertical offset textcoords="offset points", ha='center', va='bottom') autolabel(rects1) autolabel(rects2) autolabel(rects3) fig.tight_layout() plt.savefig('ModelPerformanceComparison.png') plt.show() """This will print our data as a plot with predicted values in green and actual values in blue""" # Pick a desicion tree - index 0 is our 0 tree_to_plot = regressor.estimators_[1] # Plot the decision tree plt.figure(figsize=(90, 30)) plot_tree(tree_to_plot, feature_names=x_categorical.columns.tolist(), filled=True, rounded=True, fontsize=10) plt.title("Decision Tree from Random Forest") plt.show()