From 32f31bce5ba04ce5cf2e807b824091b243c81de2 Mon Sep 17 00:00:00 2001 From: Syed Zohiab Ali <95756200+ZohaiAli@users.noreply.github.com> Date: Sun, 12 Oct 2025 13:24:29 +0500 Subject: [PATCH] Refactor RandomForest script: Python 3, modern sklearn, modular structure --- Code/DM_Project_RandomForest.py | 721 +++++--------------------------- 1 file changed, 112 insertions(+), 609 deletions(-) diff --git a/Code/DM_Project_RandomForest.py b/Code/DM_Project_RandomForest.py index 1448764..e52a888 100755 --- a/Code/DM_Project_RandomForest.py +++ b/Code/DM_Project_RandomForest.py @@ -1,629 +1,132 @@ - # coding: utf-8 - -# In[1]: - import numpy as np import pandas as pd - - -# # Sensor located at Dominant Wrist: - -# In[12]: - +import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn import metrics -from sklearn.cross_validation import cross_val_score -import matplotlib.pyplot as plt +from sklearn.model_selection import GridSearchCV, cross_val_score from sklearn.metrics import confusion_matrix -from sklearn.grid_search import GridSearchCV - -# ## Reading the training files - -# In[4]: - -dominant_wrist_file = pd.read_csv('sensor_based_files_train/dominant_wrist_train.csv') -train_data_wrist = dominant_wrist_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_wrist = dominant_wrist_file[['Activity']].values.ravel() - - -# In[6]: -print "Training data dimensions for sensor at dominant_wrist position" -print train_data_wrist.shape -print target_label_wrist.shape - -print "----------------------------------------------------------------------------" -print "\n" -# ## Reading the testing files - -# In[8]: - -dominant_wrist_file_test = pd.read_csv('sensor_based_files_test/dominant_wrist_test.csv') -test_data_wrist = dominant_wrist_file_test[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_wrist_test = dominant_wrist_file_test[['Activity']].values.ravel() - - -# In[9]: -print "Testing data dimensions for sensor at dominant_wrist position" -print test_data_wrist.shape -print target_label_wrist_test.shape - -print "----------------------------------------------------------------------------" -print "\n" - -# In[10]: - -# clf = RandomForestClassifier(n_estimators=100, criterion='entropy') -# clf.fit(train_data_wrist, target_label_wrist) -# predicted = clf.predict(test_data_wrist) -rfc_wrist = RandomForestClassifier(n_estimators=100, criterion='entropy',n_jobs=-1) -param_grid = { - 'n_estimators': [50, 100, 200], - 'criterion': ['entropy', 'gini'] -} -rfc_wrist_gs = GridSearchCV(rfc_wrist, param_grid=param_grid) -rfc_wrist_gs.fit(train_data_wrist, target_label_wrist) -predicted = rfc_wrist_gs.predict(test_data_wrist) -# print "Best parameters to be used for training the model",rfc_wrist_gs.best_params_ - - -# In[13]: - -# Evaluation -print "Classification report for sensor at Wrist position" -print metrics.classification_report(target_label_wrist_test, predicted) -print metrics.confusion_matrix(target_label_wrist_test, predicted) -print "F-Score: ",metrics.f1_score(target_label_wrist_test, predicted) - - -# In[43]: - -activities = ['sitting:-legs-straight','cycling:-70-rpm_-50-watts_-.7-kg','walking:-natural','lying:-on-back'] -def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(activities)) - plt.xticks(tick_marks, activities, rotation=45) - plt.yticks(tick_marks, activities) - plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - - -# Compute confusion matrix -cm = confusion_matrix(target_label_wrist_test, predicted, labels=activities) -np.set_printoptions(precision=2) -# print('Confusion matrix, without normalization') -# print(cm) -# plt.figure(figsize=(10,10)) -# plot_confusion_matrix(cm) - -# Normalize the confusion matrix by row (i.e by the number of samples -# in each class) -cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] -print('Normalized confusion matrix for Sensor located at Dominant Wrist position') -print(cm_normalized) -plt.figure(figsize=(10,10)) -plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Sensor located at Dominant Wrist position') -plt.savefig('confusion_matrix.png') -plt.show() - -print "--------------------------------------------------------------------------------------" -print "\n" -# ## 10Fold Cross Validation - -# In[18]: - -# Reading all 33 subject files -dominant_wrist_file = pd.read_csv('sensor_based_files_33/dominant_wrist_train.csv') -train_data_wrist = dominant_wrist_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_wrist = dominant_wrist_file[['Activity']].values.ravel() - -print "Beginning 10-Fold cross validation on all the 33 subjects for sensor at wrist position..." -# In[19]: -# print "Train data dimensions for all 33 subjects" -# print train_data_wrist.shape -# print target_label_wrist.shape - - -# In[20]: - -clf_cross_val = RandomForestClassifier(n_estimators=50, criterion='gini') -scores = cross_val_score(clf_cross_val, train_data_wrist,target_label_wrist,cv=10) -print "Scores for each fold:" -print scores -print "\n" -print "---------------------------------------------------------------------------------" -print ("Accuracy for 10-Fold cross validation using Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - - -# # Sensor located at Upper-Arm -# ## Reading Training files - -# In[44]: - -dominant_upperarm_file = pd.read_csv('sensor_based_files_train/dominant_Upper_Arm_train.csv') -train_data_upperarm = dominant_upperarm_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_upperarm = dominant_upperarm_file[['Activity']].values.ravel() - - -# In[45]: -print "Training data dimensions for sensor at dominant_upper_arm position" -print train_data_upperarm.shape -print target_label_upperarm.shape -print "----------------------------------------------------------------------------" -print "\n" - -# ## Reading Testing files - -# In[47]: - -dominant_upperarm_file_test = pd.read_csv('sensor_based_files_test/dominant_Upper_Arm_test.csv') -test_data_upperarm = dominant_upperarm_file_test[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_upperarm_test = dominant_upperarm_file_test[['Activity']].values.ravel() - - -# In[48]: -print "Testing data dimensions for sensor at dominant_upper_arm position" -print test_data_upperarm.shape -print target_label_upperarm_test.shape -print "----------------------------------------------------------------------------" -print "\n" - - -# ## Using Random Forest - -# In[49]: - -# clf_arm = RandomForestClassifier(n_estimators=100, criterion='entropy') -# clf_arm.fit(train_data_upperarm, target_label_upperarm) -# predicted = clf_arm.predict(test_data_upperarm) -rfc_upperarm = RandomForestClassifier(n_estimators=100, criterion='entropy',n_jobs=-1) -param_grid = { - 'n_estimators': [50, 100, 200], - 'criterion': ['entropy', 'gini'] -} -rfc_arm_gs = GridSearchCV(rfc_upperarm, param_grid=param_grid) -rfc_arm_gs.fit(train_data_upperarm, target_label_upperarm) -predicted = rfc_arm_gs.predict(test_data_upperarm) -# print "Best parameters to be used for training the model",rfc_arm_gs.best_params_ - - -# In[50]: - -# Evaluation -print "Classification report for sensor at Upper_Arm position" -print metrics.classification_report(target_label_upperarm_test, predicted) -print metrics.confusion_matrix(target_label_upperarm_test, predicted) -print "F-Score: ",metrics.f1_score(target_label_upperarm_test, predicted) - - -# In[51]: - -activities = ['sitting:-legs-straight','cycling:-70-rpm_-50-watts_-.7-kg','walking:-natural','lying:-on-back'] -def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(activities)) - plt.xticks(tick_marks, activities, rotation=45) - plt.yticks(tick_marks, activities) - plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - - -# Compute confusion matrix -cm = confusion_matrix(target_label_upperarm_test, predicted, labels=activities) -np.set_printoptions(precision=2) -# print('Confusion matrix, without normalization') -# print(cm) -# plt.figure(figsize=(10,10)) -# plot_confusion_matrix(cm) - -# Normalize the confusion matrix by row (i.e by the number of samples -# in each class) -cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] -print('Normalized confusion matrix for Sensor located at Upper Arm position') -print(cm_normalized) -plt.figure(figsize=(10,10)) -plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Sensor located at Upper Arm position') -plt.savefig('confusion_matrix.png') -plt.show() - -print "--------------------------------------------------------------------------------------" -print "\n" -# ## 10Fold Cross Validation - -# In[53]: - -# Reading all 33 subject files -dominant_upperarm_file = pd.read_csv('sensor_based_files_33/dominant_Upper_Arm_train.csv') -train_data_upperarm = dominant_upperarm_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_upperarm = dominant_upperarm_file[['Activity']].values.ravel() - - -# In[54]: - -# print train_data_upperarm.shape -# print target_label_upperarm.shape - - -# In[78]: -print "Beginning 10-Fold cross validation on all the 33 subjects for sensor at upper arm position..." -clf_upperarm_cross_val = RandomForestClassifier(n_estimators=100, criterion='gini') -scores = cross_val_score(clf_upperarm_cross_val, train_data_upperarm,target_label_upperarm,cv=10) -print "Scores for each fold:" -print scores -print "\n" -print "---------------------------------------------------------------------------------" -print ("Accuracy for 10Fold cross validation using Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - - -# # Sensor located at Thigh -# ## Reading Training Files - -# In[70]: - -dominant_thigh_file = pd.read_csv('sensor_based_files_train/dominant_Thigh_train.csv') -train_data_thigh = dominant_thigh_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_thigh = dominant_thigh_file[['Activity']].values.ravel() - - -# In[71]: -print "Training data dimensions for sensor at dominant_thigh position" -print train_data_thigh.shape -print target_label_thigh.shape -print "----------------------------------------------------------------------------" -print "\n" - -# ## Reading Testing files - -# In[72]: +import os -dominant_thigh_file_test = pd.read_csv('sensor_based_files_test/dominant_Thigh_test.csv') -test_data_thigh = dominant_thigh_file_test[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_thigh_test = dominant_thigh_file_test[['Activity']].values.ravel() +def load_data(train_path, test_path): + """Loads and returns train and test datasets.""" + try: + train_df = pd.read_csv(train_path) + test_df = pd.read_csv(test_path) + except FileNotFoundError: + print(f"[ERROR] File not found: {train_path} or {test_path}") + return None, None, None, None -# In[67]: -print "Testing data dimensions for sensor at dominant_thigh position" -print test_data_thigh.shape -print target_label_thigh_test.shape -print "----------------------------------------------------------------------------" -print "\n" + features = [ + 'MeanSM', 'StDevSM', 'MdnSM', 'belowPer25SM', 'belowPer75SM', + 'TotPower_0.3_15', 'FirsDomFre_0.3_15', 'PowFirsDomFre_0.3_15', + 'SecDomFre_0.3_15', 'PowSecDomFre_0.3_15', + 'FirsDomFre_0.6_2.5', 'PowFirsDomFre_0.6_2.5', + 'FirsDomFre_per_TotPower_0.3_15' + ] -# ## using Random Forest + X_train = train_df[features].values + y_train = train_df['Activity'].values + X_test = test_df[features].values + y_test = test_df['Activity'].values -# In[91]: + return X_train, y_train, X_test, y_test -# clf_thigh = RandomForestClassifier(n_estimators=100, criterion='entropy') -# clf_thigh.fit(train_data_thigh, target_label_thigh) -# predicted = clf_thigh.predict(test_data_thigh) -rfc_thigh = RandomForestClassifier(n_estimators=100, criterion='entropy',n_jobs=-1) -param_grid = { - 'n_estimators': [50, 100, 200], - 'criterion': ['entropy', 'gini'] -} -rfc_thigh_gs = GridSearchCV(rfc_thigh, param_grid=param_grid) -rfc_thigh_gs.fit(train_data_thigh, target_label_thigh) -predicted = rfc_thigh_gs.predict(test_data_thigh) -# print "Best parameters to be used for training the model",rfc_thigh_gs.best_params_ -# In[92]: - -# Evaluation -print "Classification report for sensor at Thigh position" -print metrics.classification_report(target_label_thigh_test, predicted) -print metrics.confusion_matrix(target_label_thigh_test, predicted) -print "F-Score: ",metrics.f1_score(target_label_thigh_test, predicted) - - -# In[93]: - -activities = ['sitting:-legs-straight','cycling:-70-rpm_-50-watts_-.7-kg','walking:-natural','lying:-on-back'] -def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(activities)) - plt.xticks(tick_marks, activities, rotation=45) - plt.yticks(tick_marks, activities) - plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - - -# Compute confusion matrix -cm = confusion_matrix(target_label_thigh_test, predicted, labels=activities) -np.set_printoptions(precision=2) -# print('Confusion matrix, without normalization') -# print(cm) -# plt.figure(figsize=(10,10)) -# plot_confusion_matrix(cm) - -# Normalize the confusion matrix by row (i.e by the number of samples -# in each class) -cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] -print('Normalized confusion matrix for Sensor located at Dominant Thigh position') -print(cm_normalized) -plt.figure(figsize=(10,10)) -plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Sensor located at Dominant Thigh position') -plt.savefig('confusion_matrix.png') -plt.show() -print "--------------------------------------------------------------------------------------" -print "\n" - -# ## 10Fold Cross Validation - -# In[76]: - -# Reading all 33 subject files -dominant_thigh_file = pd.read_csv('sensor_based_files_33/dominant_Thigh_train.csv') -train_data_thigh = dominant_thigh_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_thigh = dominant_thigh_file[['Activity']].values.ravel() - - -# In[77]: -# -# print train_data_thigh.shape -# print target_label_thigh.shape - - -# In[79]: -print "Beginning 10-Fold cross validation on all the 33 subjects for sensor at Thigh position..." -clf_thigh_cross_val = RandomForestClassifier(n_estimators=200, criterion='gini') -scores = cross_val_score(clf_thigh_cross_val, train_data_thigh,target_label_thigh,cv=10) -print "Scores for each fold:" -print scores -print "---------------------------------------------------------------------------------" -print "\n" -print ("Accuracy for 10Fold cross validation using Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - - -# # Sensor Located at Hip -# ## Reading Training Files - -# In[94]: - -dominant_hip_file = pd.read_csv('sensor_based_files_train/dominant_Hip_train.csv') -train_data_hip = dominant_hip_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_hip = dominant_hip_file[['Activity']].values.ravel() - - -# In[95]: -print "Training data dimensions for sensor at dominant_hip position" -print train_data_hip.shape -print target_label_hip.shape -print "----------------------------------------------------------------------------" -print "\n" - - -# ## Reading Test Files - -# In[96]: - -dominant_hip_file_test = pd.read_csv('sensor_based_files_test/dominant_Hip_test.csv') -test_data_hip = dominant_hip_file_test[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_hip_test = dominant_hip_file_test[['Activity']].values.ravel() - - -# In[102]: -print "Testing data dimensions for sensor at dominant_hip position" -print test_data_hip.shape -print target_label_hip_test.shape -print "----------------------------------------------------------------------------" -print "\n" - -# ## Using Random Forest - -# In[104]: - -# clf_hip = RandomForestClassifier(n_estimators=100, criterion='entropy') -# clf_hip.fit(train_data_hip, target_label_hip) -# predicted = clf_hip.predict(test_data_hip) -rfc_hip = RandomForestClassifier(n_estimators=100, criterion='entropy',n_jobs=-1) -param_grid = { - 'n_estimators': [50, 100, 200], - 'criterion': ['entropy', 'gini'] -} -rfc_hip_gs = GridSearchCV(rfc_hip, param_grid=param_grid) -rfc_hip_gs.fit(train_data_hip, target_label_hip) -predicted = rfc_hip_gs.predict(test_data_hip) -# print "Best parameters to be used for training the model",rfc_hip_gs.best_params_ - -# In[105]: - -# Evaluation -print "Classification report for sensor at Hip position" -print metrics.classification_report(target_label_hip_test, predicted) -print metrics.confusion_matrix(target_label_hip_test, predicted) -print "F-Score",metrics.f1_score(target_label_hip_test, predicted) - - -# In[106]: - -activities = ['sitting:-legs-straight','cycling:-70-rpm_-50-watts_-.7-kg','walking:-natural','lying:-on-back'] -def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): - plt.imshow(cm, interpolation='nearest', cmap=cmap) +def plot_confusion_matrix(cm, activities, title, filename): + plt.figure(figsize=(8, 8)) + plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) plt.title(title) plt.colorbar() tick_marks = np.arange(len(activities)) - plt.xticks(tick_marks, activities, rotation=45) + plt.xticks(tick_marks, activities, rotation=45, ha="right") plt.yticks(tick_marks, activities) - plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') - - -# Compute confusion matrix -cm = confusion_matrix(target_label_hip_test, predicted, labels=activities) -np.set_printoptions(precision=2) -# print('Confusion matrix, without normalization') -# print(cm) -# plt.figure(figsize=(10,10)) -# plot_confusion_matrix(cm) - -# Normalize the confusion matrix by row (i.e by the number of samples -# in each class) -cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] -print('Normalized confusion matrix for Sensor located at Dominant Hip position') -print(cm_normalized) -plt.figure(figsize=(10,10)) -plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Sensor located at Dominant Hip position') -plt.savefig('confusion_matrix.png') -plt.show() -print "--------------------------------------------------------------------------------------" -print "\n" - -# ## 10Fold Cross validation - -# In[107]: - -# Reading all 33 subject files -dominant_hip_file = pd.read_csv('sensor_based_files_33/dominant_Hip_train.csv') -train_data_hip = dominant_hip_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_hip = dominant_hip_file[['Activity']].values.ravel() - - -# In[108]: -# -# print train_data_hip.shape -# print target_label_hip.shape - - -# In[109]: -print "Beginning 10-Fold cross validation on all the 33 subjects for sensor at Hip position..." -clf_hip_cross_val = RandomForestClassifier(n_estimators=200, criterion='gini') -scores = cross_val_score(clf_hip_cross_val, train_data_hip,target_label_hip,cv=10) -print "Scores for each fold:" -print scores -print "---------------------------------------------------------------------------------" -print "\n" -print ("Accuracy for 10Fold cross validation using Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - - -# # Sensor Located at Ankle -# ## Reading Training files - -# In[111]: - -dominant_ankle_file = pd.read_csv('sensor_based_files_train/dominant_Ankle_train.csv') -train_data_ankle = dominant_ankle_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_ankle = dominant_ankle_file[['Activity']].values.ravel() - - -# In[112]: -print "Training data dimensions for sensor at dominant_ankle position" -print train_data_ankle.shape -print target_label_ankle.shape -print "----------------------------------------------------------------------------" -print "\n" - - -# ## Reading Test files - -# In[114]: - -dominant_ankle_file_test = pd.read_csv('sensor_based_files_test/dominant_Ankle_test.csv') -test_data_ankle = dominant_ankle_file_test[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_ankle_test = dominant_ankle_file_test[['Activity']].values.ravel() - - -# In[115]: -print "Testing data dimensions for sensor at dominant_ankle position" -print test_data_ankle.shape -print target_label_ankle_test.shape -print "----------------------------------------------------------------------------" -print "\n" - -# ## Using Random Forest - -# In[119]: - -# clf_ankle = RandomForestClassifier(n_estimators=100, criterion='entropy') -# clf_ankle.fit(train_data_ankle, target_label_ankle) -# predicted = clf_ankle.predict(test_data_ankle) -rfc_ankle = RandomForestClassifier(n_estimators=100, criterion='entropy',n_jobs=-1) -param_grid = { - 'n_estimators': [50, 100, 200], - 'criterion': ['entropy', 'gini'] -} -rfc_ankle_gs = GridSearchCV(rfc_ankle, param_grid=param_grid) -rfc_ankle_gs.fit(train_data_ankle, target_label_ankle) -predicted = rfc_ankle_gs.predict(test_data_ankle) -# print "Best parameters to be used for training the model",rfc_ankle_gs.best_params_ - -# In[120]: - -# Evaluation -print "Classification report for sensor at Ankle position" -print metrics.classification_report(target_label_ankle_test, predicted) -print metrics.confusion_matrix(target_label_ankle_test, predicted) -print "F-Score",metrics.f1_score(target_label_ankle_test, predicted) - - -# In[121]: - -activities = ['sitting:-legs-straight','cycling:-70-rpm_-50-watts_-.7-kg','walking:-natural','lying:-on-back'] -def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(activities)) - plt.xticks(tick_marks, activities, rotation=45) - plt.yticks(tick_marks, activities) plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') - - -# Compute confusion matrix -cm = confusion_matrix(target_label_ankle_test, predicted, labels=activities) -np.set_printoptions(precision=2) -# print('Confusion matrix, without normalization') -# print(cm) -# plt.figure(figsize=(10,10)) -# plot_confusion_matrix(cm) - -# Normalize the confusion matrix by row (i.e by the number of samples -# in each class) -cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] -print('Normalized confusion matrix for Sensor located at Dominant Ankle position') -print(cm_normalized) -plt.figure(figsize=(10,10)) -plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix for Sensor located at Dominant Ankle position') -plt.savefig('confusion_matrix.png') -plt.show() -print "--------------------------------------------------------------------------------------" -print "\n" - -# ## 10Fold Cross Validation - -# In[122]: - -# Reading all 33 subject files -dominant_ankle_file = pd.read_csv('sensor_based_files_33/dominant_Ankle_train.csv') -train_data_ankle = dominant_ankle_file[['MeanSM','StDevSM','MdnSM','belowPer25SM','belowPer75SM','TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15']].values -target_label_ankle = dominant_ankle_file[['Activity']].values.ravel() - - -# In[123]: - -# print train_data_ankle.shape -# print target_label_ankle.shape - - -# In[124]: -print "Beginning 10-Fold cross validation on all the 33 subjects for sensor at Ankle position..." -clf_ankle_cross_val = RandomForestClassifier(n_estimators=100, criterion='entropy') -scores = cross_val_score(clf_ankle_cross_val, train_data_ankle,target_label_ankle,cv=10) -print "Scores for each fold:" -print scores -print "---------------------------------------------------------------------------------" -print "\n" -print ("Accuracy for 10Fold cross validation using Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) - - - + plt.savefig(filename) + plt.close() + + +def train_random_forest(sensor_name, train_path, test_path): + print(f"\n===== Sensor: {sensor_name} =====") + X_train, y_train, X_test, y_test = load_data(train_path, test_path) + if X_train is None: + return + + print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}") + + rfc = RandomForestClassifier(n_jobs=-1) + param_grid = { + 'n_estimators': [50, 100, 200], + 'criterion': ['entropy', 'gini'] + } + + grid_search = GridSearchCV(rfc, param_grid, cv=3) + grid_search.fit(X_train, y_train) + predicted = grid_search.predict(X_test) + + print(f"Best params for {sensor_name}: {grid_search.best_params_}") + print(metrics.classification_report(y_test, predicted)) + print("Confusion matrix:\n", metrics.confusion_matrix(y_test, predicted)) + + cm = confusion_matrix(y_test, predicted, labels=np.unique(y_test)) + cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + + out_dir = "results" + os.makedirs(out_dir, exist_ok=True) + plot_confusion_matrix( + cm_norm, + activities=np.unique(y_test), + title=f"Normalized CM - {sensor_name}", + filename=f"{out_dir}/conf_matrix_{sensor_name}.png" + ) + + f1 = metrics.f1_score(y_test, predicted, average="weighted") + print(f"Weighted F1 Score for {sensor_name}: {f1:.3f}") + + return grid_search.best_estimator_ + + +def cross_validation(sensor_name, data_path): + print(f"\n===== Cross Validation: {sensor_name} =====") + df = pd.read_csv(data_path) + features = [ + 'MeanSM', 'StDevSM', 'MdnSM', 'belowPer25SM', 'belowPer75SM', + 'TotPower_0.3_15', 'FirsDomFre_0.3_15', 'PowFirsDomFre_0.3_15', + 'SecDomFre_0.3_15', 'PowSecDomFre_0.3_15', + 'FirsDomFre_0.6_2.5', 'PowFirsDomFre_0.6_2.5', + 'FirsDomFre_per_TotPower_0.3_15' + ] + X = df[features].values + y = df['Activity'].values + + clf = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1) + scores = cross_val_score(clf, X, y, cv=10) + print("Scores for each fold:", scores) + print(f"Mean Accuracy: {scores.mean():.2f} ± {scores.std() * 2:.2f}") + + +if __name__ == "__main__": + sensors = { + "Wrist": ("sensor_based_files_train/dominant_wrist_train.csv", + "sensor_based_files_test/dominant_wrist_test.csv", + "sensor_based_files_33/dominant_wrist_train.csv"), + "UpperArm": ("sensor_based_files_train/dominant_Upper_Arm_train.csv", + "sensor_based_files_test/dominant_Upper_Arm_test.csv", + "sensor_based_files_33/dominant_Upper_Arm_train.csv"), + "Thigh": ("sensor_based_files_train/dominant_Thigh_train.csv", + "sensor_based_files_test/dominant_Thigh_test.csv", + "sensor_based_files_33/dominant_Thigh_train.csv"), + "Hip": ("sensor_based_files_train/dominant_Hip_train.csv", + "sensor_based_files_test/dominant_Hip_test.csv", + "sensor_based_files_33/dominant_Hip_train.csv"), + "Ankle": ("sensor_based_files_train/dominant_Ankle_train.csv", + "sensor_based_files_test/dominant_Ankle_test.csv", + "sensor_based_files_33/dominant_Ankle_train.csv") + } + + for name, (train_p, test_p, cross_p) in sensors.items(): + train_random_forest(name, train_p, test_p) + cross_validation(name, cross_p)