#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd

df = pd.read_csv('NHANES_COMPLETE_NEWEST_V2.csv')


# In[5]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# In[2]:


corrmat = df.corr()
corrmat


# In[3]:


corrmat.index.get_loc('EverToldHadCancerOrMalignancy?')


# In[53]:


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Tahoma'  # Change 'Tahoma' to the desired font family


# Function to create one heatmap row per target variable
def create_heatmap(dataframe, title, subplot_index, start_index, end_index, show_xticklabels=False):
    corrmat = dataframe.corr(method='spearman')
    cols_to_drop = ['SEQN', 'Unnamed: 0.1', 'Unnamed: 0', 'Hypoglycemic?', 'GestationalDiabetes',
                    'PolycysticOvarianSyndrome?', 'CoveredByMedicare?', 'CoveredByMedicaid?',
                    'CoveredByMilitaryIns?', 'NoHealthCoverage?']
    corrmat = corrmat.drop(cols_to_drop, axis=1)

    plt.subplot(3, 1, subplot_index)
    sns.heatmap(corrmat[start_index:end_index], annot=True, cmap='seismic', center=0, linewidths=0.5,
                annot_kws={'size': 9, 'rotation': 90, 'fontweight': 'bold', 'color': 'black'},
                yticklabels=False, xticklabels=show_xticklabels)
    plt.ylabel(title, fontsize=7.5)
    plt.xticks(rotation=90)  # Adjust the angle of x-axis labels if needed
    if show_xticklabels:
        plt.tick_params(axis='x', labelsize=8)  # Set the xtick label size


# Assuming the DataFrame 'df' is loaded here
# Create a single figure with multiple subplots
plt.figure(figsize=(28, 18))
plt.suptitle('Cancer, Kidney Failure, and Metal Burden: Spearman Correlations', fontsize=20)

create_heatmap(df, 'EverToldHadCancerOrMalignancy?', 1, 91, 92)
create_heatmap(df, 'EverToldHaveWeakOrFailingKidneys?', 2, 83, 84)
create_heatmap(df, 'TotalUrineMetalBurden_ug/L', 3, 99, 100, show_xticklabels=True)

# Adjust spacing between subplots
plt.tight_layout()

# Save the plot as a high-resolution PNG file
plt.savefig('heatmap2.png', dpi=1500)  # Adjust the dpi value for higher resolution

# Display the plot
plt.show()
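# In[ ]:


# Optional sanity check (not part of the original run): the heatmap rows above are
# selected by hard-coded integer positions (91, 83, 99). Since the notebook already
# uses corrmat.index.get_loc() in In[3], the same call can confirm that those
# positions still point at the intended variables if the column set ever changes.
# The name corrmat_s is a placeholder introduced here.
corrmat_s = df.corr(method='spearman')
for target in ['EverToldHadCancerOrMalignancy?',
               'EverToldHaveWeakOrFailingKidneys?',
               'TotalUrineMetalBurden_ug/L']:
    print(target, '-> row', corrmat_s.index.get_loc(target))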
# ## Now, on to machine learning

# In[53]:


from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.impute import SimpleImputer


# In[54]:


df = pd.read_csv('NHANES_COMPLETE_NEWEST_V2.csv')
data1 = df
data1.head()


# In[55]:


data1.shape


# In[56]:


data1.columns


# In[57]:


data2 = data1.drop(['EverToldHadCancerOrMalignancy?', 'Unnamed: 0.1', 'Unnamed: 0', 'SEQN',
                    'EverToldHaveAngina?', 'EverToldhadCoronaryHeartDisease?', 'EverToldHadHeartAttack?',
                    'EverToldHadStroke?', 'SerumCreatinine_umol/L', 'UrineArsenic_ug/L', 'UrineChromium_ug/L',
                    'UrineMercury_ug/L', 'UrineBarium_ug/L', 'UrineCadmium_ug/L', 'UrineCobalt_ug/L',
                    'UrineCesium_ug/L', 'UrineMolybendum_ug/L', 'UrineManganese_ug/L', 'UrineLead_ug/L',
                    'UrineAntimony_ug/L', 'UrineTin_ug/L', 'UrineThallium_ug/L', 'UrineTungsten_ug/L',
                    'UrineNickel_ug/L', 'ToldHaveHighCholesterol?', 'SerumCholesterol_mmol/L',
                    'HowManyTimesSeenDoctorPastYear?', 'LastA1CLevel?', 'CoveredByHealthIns?',
                    'CoveredByPrivateIns?', 'CoveredByMedicare?', 'CoveredByMedicaid?',
                    'CoveredByMilitaryIns?', 'NoHealthCoverage?', 'GeneralHealthCondition?',
                    'PlaceGoMostOftenForHealthcare?', 'HowLongSincleLastHealthcareVisit?',
                    'OvernightHospitalPatientLastYear?', 'EverUsedECig?', 'EverToldHaveHighBloodPressure?',
                    'ToldHaveHighCholesterol?', 'ToldHaveDiabetes?', 'ToldHavePreDiabetes?',
                    'Hypoglycemic?', 'GestationalDiabetes', 'PolycysticOvarianSyndrome?',
                    'HowManyTimesSeenDoctorPastYear?', 'LastA1CLevel?', 'SerumTriglycerides_mmol/L',
                    'EverHadKidneyStones?', 'MonthlyFamilyIncome', 'AgeFirstSmokedCigarette?',
                    'EverUsedSmokelessTobacco?', 'FirstCanderWhatKind?', 'SerumGlucose_mmol/L',
                    'AgeFistCancerDiagnosed?', 'AnnualHouseIncome', 'RatioFamilyIncomeToPoverty'], axis=1)


# In[58]:


data2.columns


# In[59]:


def impute_df(df):
    # Returns a dataframe with mean-imputed values for NaN
    my_imputer = SimpleImputer(missing_values=np.nan)
    data_with_imputed_values = pd.DataFrame(my_imputer.fit_transform(df), columns=df.columns)
    return data_with_imputed_values


# In[60]:


df = impute_df(data2)
df.shape


# In[61]:


df.info()


# In[62]:


# Add the target category back in
df['EverToldHadCancerOrMalignancy?'] = data1['EverToldHadCancerOrMalignancy?']
df.shape


# In[63]:


# Ever Told Had Cancer: 1 = yes; 2 = no; 7 = refused; 9 = don't know
df['EverToldHadCancerOrMalignancy?'] = df['EverToldHadCancerOrMalignancy?'].replace({1: 1, 2: -1})
df.info()


# In[64]:


df['EverToldHadCancerOrMalignancy?'].unique()


# In[65]:


df = df.astype('float64')


# In[66]:


df.info()


# In[67]:


# Loading the dataset
X = df.drop(['EverToldHadCancerOrMalignancy?'], axis=1)  # Feature matrix
y = df["EverToldHadCancerOrMalignancy?"]  # Target variable

print('X', X.shape)
print('y', y.shape)
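# In[ ]:


# Optional check (not part of the original notebook): the codebook comment above
# notes that 7 = refused and 9 = don't know. Only 2 was recoded to -1, so any
# remaining 7/9 codes (or missing values) are worth counting before modeling;
# rows coded 9 are dropped further below.
print(y.value_counts(dropna=False))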
# In[68]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE, SelectKBest, f_regression, chi2
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# In[69]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE, SelectKBest, f_regression, chi2
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Assuming the X (features) and y (target) datasets are ready:
# X should be a DataFrame or a 2-dimensional array; y should be a 1-dimensional array or Series.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features (needed for some feature selection methods)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Method 1: Recursive Feature Elimination (RFE) with a Random Forest
model_rfe = RandomForestClassifier()   # For classification tasks
# model_rfe = RandomForestRegressor()  # For regression tasks
rfe = RFE(model_rfe, n_features_to_select=10)
rfe.fit(X_train_scaled, y_train)

# Method 2: Feature importance from a Random Forest
model_rf = RandomForestClassifier()   # For classification tasks
# model_rf = RandomForestRegressor()  # For regression tasks
model_rf.fit(X_train_scaled, y_train)

# Method 3: SelectKBest with chi-square (classification) or f_regression (regression)
# For classification tasks:
# selector = SelectKBest(chi2, k=10)
# For regression tasks:
selector = SelectKBest(f_regression, k=10)
selector.fit(X_train_scaled, y_train)

# Get the selected feature indices from each method
selected_features_rfe = np.where(rfe.support_)[0]
selected_features_rf = model_rf.feature_importances_.argsort()[-10:][::-1]
selected_features_skb = selector.get_support(indices=True)

# Get the feature names (X is a DataFrame)
feature_names = X.columns

# Print the top 10 most important features for each method
print("Top 10 Most Important Features from RFE:")
print(feature_names[selected_features_rfe])

print("\nTop 10 Most Important Features from Random Forest:")
print(feature_names[selected_features_rf])

print("\nTop 10 Most Important Features from SelectKBest:")
print(feature_names[selected_features_skb])

# Visualization of feature importance from the Random Forest
plt.figure(figsize=(10, 6))
plt.bar(range(len(selected_features_rf)), model_rf.feature_importances_[selected_features_rf], align='center')
plt.xticks(range(len(selected_features_rf)), feature_names[selected_features_rf], rotation=90)
plt.xlabel('Feature')
plt.ylabel('Feature Importance')
plt.title('Top 10 Most Important Features from Random Forest')
plt.tight_layout()
plt.show()


# # I will eventually repeat all of this, but with the kidney column and heavy metals as the target (y) variables. Let me finish the process with EverToldHadCancerOrMalignancy? first.

# In[70]:


X = X[['Height_cm', 'WaistCircumference_cm', 'DiastolicBP_mmHg', 'UrineAlbumin_mg/L', 'AlbuminCreatRatio_mg/g',
       'LDL_mmol/L', 'AlkalinePhosphatase_IU/L', 'CreatinePhosphokinase_IU/L', 'LactateDehydrogenase_IU/L',
       'UricAcid_umol/L', 'SystolicBP_mmHg', 'BUN_mmol/L', 'Weight_kg', 'Osmolality_mmol/Kg',
       'TotalCholesterol_mg/dL', 'AlanineAminotransferase_U/L', 'Globulin_g/L', 'TotalProtein_g/L',
       'DialysisPast12Months?']]
X.shape


# In[71]:


# The dataset is now cleaned, reduced, and ready for modeling.
# Save this form of the dataset as the "final" modeling dataset in a CSV.
y.reset_index(drop=True, inplace=True)
df = X
df = df.assign(EverToldHadCancerOrMalignancy=y)
df = df.astype('float64')
df.to_csv('./df_final.csv', index=None)


# In[72]:


import pandas as pd

df = pd.read_csv('./df_final.csv')
df.head()


# In[73]:


df.columns


# In[74]:


df.shape


# In[75]:


df = df.drop(df[df['EverToldHadCancerOrMalignancy'] == 9].index)


# In[76]:


df.shape


# In[77]:


df['EverToldHadCancerOrMalignancy'].unique()


# In[78]:


X = df.drop(columns=['EverToldHadCancerOrMalignancy'])
y = df['EverToldHadCancerOrMalignancy']


# In[79]:


# Baseline (null) model
y.value_counts(normalize=True)


# In[80]:


# Set up train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)


# In[81]:


ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)


# In[82]:


np.savetxt('./positive_X_train.csv', X_train, delimiter=',')
np.savetxt('./positive_y_train.csv', y_train, delimiter=',')
np.savetxt('./positive_X_test.csv', X_test, delimiter=',')
np.savetxt('./positive_y_test.csv', y_test, delimiter=',')
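# In[ ]:


# Optional variant (not used above): the baseline value_counts() shows the classes
# are imbalanced, so a stratified split keeps the cancer prevalence roughly equal
# in the train and test sets. This is only a sketch of the same split with
# stratify=y; the *_s names are placeholders and the rest of the pipeline would
# be unchanged.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print(y_train_s.value_counts(normalize=True))
print(y_test_s.value_counts(normalize=True))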
# In[83]:


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score
# from treeinterpreter import treeinterpreter as ti
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

models = {
    'LogReg': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boost': GradientBoostingClassifier(),
    'Ada Boost': AdaBoostClassifier(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB()}


# In[84]:


# Adapted from a Dan Brown lecture
final = pd.DataFrame(columns=['cross_val_train', 'cross_val_test', 'test_recall', 'test_precision'])

for name, model in models.items():
    results = {}
    results['name'] = name
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    results['cross_val_train'] = np.mean(cross_val_score(model, X_train, y_train, cv=4))
    results['cross_val_test'] = np.mean(cross_val_score(model, X_test, y_test, cv=4))
    results['test_recall'] = recall_score(y_test, y_pred_test)
    results['test_precision'] = precision_score(y_test, y_pred_test)
    # DataFrame.append() was removed in pandas 2.0; concatenate a one-row frame instead
    final = pd.concat([final, pd.DataFrame([results])], ignore_index=True)


# In[85]:


final.set_index('name')


# In[86]:


# Find the best parameters
LogReg = LogisticRegression()

params = {
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'warm_start': [True, False],
    'n_jobs': [None, 0, 2, 4, 10],
    'random_state': [42]
}

random_search = RandomizedSearchCV(LogReg, param_distributions=params, n_iter=10, cv=5, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print(random_search.best_score_)
print(random_search.best_params_)


# In[87]:


# Set the model
lg = LogisticRegression(n_jobs=2, penalty='l2', warm_start=False, solver='liblinear', random_state=42)
lg.fit(X_train, y_train)


# In[88]:


from sklearn.metrics import recall_score, precision_score, average_precision_score, roc_auc_score, accuracy_score

models = {'lg': lg}

# Adapted from a Dan Brown lecture
final = pd.DataFrame(columns=['cross_val_train', 'cross_val_test', 'test_recall', 'test_precision',
                              'average_precision', 'test_roc_auc_score', 'accuracy_score'])

for name, model in models.items():
    results = {}
    results['name'] = name
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    results['cross_val_train'] = np.mean(cross_val_score(model, X_train, y_train, cv=4))
    results['cross_val_test'] = np.mean(cross_val_score(model, X_test, y_test, cv=4))
    results['test_recall'] = recall_score(y_test, y_pred_test).round(3)
    results['test_precision'] = precision_score(y_test, y_pred_test).round(3)
    results['average_precision'] = average_precision_score(y_test, y_pred_test)
    results['test_roc_auc_score'] = roc_auc_score(y_test, y_pred_test)
    results['accuracy_score'] = accuracy_score(y_test, y_pred_test)
    final = pd.concat([final, pd.DataFrame([results])], ignore_index=True)


# In[89]:


final.set_index('name')
final['name'] = 'Log Reg'
FullResults = final
FullResults
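# In[ ]:


# Side experiment (not part of the original tuning): with few positive cases, a
# model can score well on accuracy while still missing most cancers. scikit-learn's
# LogisticRegression accepts class_weight='balanced', which reweights classes by
# inverse frequency; this sketch fits such a variant for comparison. The name
# lg_balanced is a placeholder introduced here.
lg_balanced = LogisticRegression(penalty='l2', solver='liblinear',
                                 class_weight='balanced', random_state=42)
lg_balanced.fit(X_train, y_train)
y_pred_bal = lg_balanced.predict(X_test)
print('balanced-class recall:   ', round(recall_score(y_test, y_pred_bal), 3))
print('balanced-class precision:', round(precision_score(y_test, y_pred_bal), 3))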
# In[90]:


# The model can be any trained classifier that supports predict_proba()
lg.fit(X_train, y_train)
y_preds = lg.predict_proba(X_test)

# Take the second column because the classifier also outputs scores for the other class
preds = y_preds[:, 1]

# fpr = false positive rate, tpr = true positive rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

# Clear the current figure
plt.clf()
plt.title('Log Reg: ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc_score))

# A diagonal helps indicate where chance scores lie (i.e. just flipping a coin)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()


# In[91]:


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = lg.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=lg.classes_)

# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lg.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()


# In[92]:


TP = 2
FP = 4
TN = 223
FN = 26


# In[93]:


Precision = TP / (TP + FP)
Sensitivity = Recall = TP / (TP + FN)
Specificity = TN / (TN + FP)
F1 = 2 * (Sensitivity * Precision) / (Sensitivity + Precision)
Accuracy = (TP + TN) / (TP + FP + TN + FN)
Missed = (FN / (TP + FN)) * 100
Caught = (TP / (TP + FN)) * 100

print('EverToldHadCancerOrMalignancy: Best Tuned Model: Logistic Regression NO PCA')
print('MODEL')
print("")
print('Precision: {:.03f}'.format(Precision))
print('Sensitivity: {:.03f}'.format(Sensitivity))
print('Specificity: {:.03f}'.format(Specificity))
print('F1 Score: {:.03f}'.format(F1))
print('Accuracy: {:.03f}'.format(Accuracy))
print('Caught: {:.02f}%'.format(Caught))
print('Missed: {:.02f}%'.format(Missed))

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = lg.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=lg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lg.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()
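# In[ ]:


# Optional cross-check (not in the original notebook): the TP/FP/TN/FN values above
# were typed in by hand from the plotted confusion matrix. They can also be read
# directly from `cm`: assuming the fitted model is binary with lg.classes_ == [-1, 1]
# (as the 2x2 plot above suggests), cm.ravel() returns (TN, FP, FN, TP). The *_cm
# names are placeholders introduced here.
TN_cm, FP_cm, FN_cm, TP_cm = cm.ravel()
print('TP:', TP_cm, 'FP:', FP_cm, 'TN:', TN_cm, 'FN:', FN_cm)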