Merge branch 'dev' of github.com:Pezz89/QMUL_Final_Project into dev

This commit is contained in:
2017-09-21 10:00:49 +01:00
6 changed files with 86 additions and 30 deletions
+6 -5
View File
@@ -23,6 +23,7 @@ to the user's path and is accessible from the commandline)
- G++ >= 4.9.X ([sudo] apt-get install g++-4.9 | brew install gcc@4.9)
- Python 2.7.11 ([sudo] apt-get install python | brew install python)
- Pip (Python's package manager - not always included with python distribution)
- multitail (Not essential, used for viewing logs)
(It is likely that the code will also run on versions other than those stated;
however, this is not guaranteed.)
@@ -57,7 +58,7 @@ The included scripts are:
run_demo.sh
Complete run of system from start to finish: Trains a model on the
demo_dataset, optimizes for to evaluation, picks 3 features and scores
demo_dataset, optimizes for 2 evaluations, picks 3 features and scores
performance using metrics described in the report.
run_optimized_model_demo.sh
@@ -74,13 +75,13 @@ Train a model from scratch on the full dataset. This will take a considerable
amount of time and requires significant computing power. This demonstrates the
method used for development of the final algorithm.
view_logs
Opens the log files created when running the program. It is advised that this
is run during particle swarm optimisation, as this is when threaded logging occurs.
./src/main.py --help
The underlying interface used for training, optimization and scoring of models.
Running the help flag displays a list of all arguments available to the user.
This can be used for the generation of new models on any dataset (however, it
is highly recommended that the full Physionet dataset is used for best results:
https://physionet.org/physiobank/database/challenge/2016/)
Although this script is fully functional, with a documented argument parser, it
was not intended for use by anyone other than the author. As such, errors as a
result of unexpected user input are likely and may not be handled gracefully.
+2 -2
View File
@@ -38,6 +38,7 @@ x = ['A', 'B', 'C', 'D', 'E', 'F']
plt.xticks(np.arange(6), x)
plt.title('Leave-one-out Specificity');
plt.tight_layout()
plt.show()
########################################################################
fig = plt.figure() # create a plot figure
@@ -69,9 +70,8 @@ e = np.array([0.0293, 0.0267, 0.0208, 0.0280, 0.0226, 0.0214, 0.0229, 0.0206, 0.
plt.errorbar(x, y, yerr=e, fmt='o', color='black', ecolor='darkgray', elinewidth=3, capsize=0)
plt.xticks(x)
plt.xticks(x)
plt.xticks(np.arange(10), x)
plt.tight_layout()
plt.show()
plt.title('10-fold Specificity');
fig.set_size_inches(9*1.3, 3*1.3*3)
+67 -21
View File
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import division
################################################################################
@@ -12,16 +13,16 @@ import sys
import multiprocessing
import six
import textwrap
import warnings
from tabulate import tabulate
################################################################################
# Scikit-Learn imports
################################################################################
from sklearn.model_selection import cross_val_score, GroupKFold, train_test_split, StratifiedKFold, GroupShuffleSplit, StratifiedShuffleSplit, LeaveOneGroupOut
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, StratifiedShuffleSplit, LeaveOneGroupOut
from sklearn.metrics.scorer import make_scorer
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
# Potential classifier models
@@ -43,6 +44,13 @@ from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
################################################################################
# Resampling library
################################################################################
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
################################################################################
# Particle swarm optimization library
################################################################################
@@ -61,6 +69,9 @@ from multiscorer import MultiScorer
logger = logging.getLogger(__name__)
logging.getLogger("imblearn").setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore", category=UserWarning, module='imblearn')
# Generate random seeds to ensure reproducible performance
random_state = np.random.RandomState(42)
np.random.seed(42)
@@ -154,8 +165,8 @@ def buildModel(
# Create sklearn pipe using a ClusterCentroids under-sampler, an imputer to
# handle NaN values, a scaler for ensuring all values are in the range of 0-1
# and the final stacking classifier
pipe_components = [("imputer", preprocessing.Imputer()), ("scaler", preprocessing.MinMaxScaler()), ("model", StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))]
pipe = Pipeline(pipe_components)
#pipe_components = [("resampler", SMOTE()), ("imputer", preprocessing.Imputer()), ("scaler", preprocessing.MinMaxScaler()), ("model", StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))]
pipe = make_pipeline(ClusterCentroids(), preprocessing.Imputer(), preprocessing.MinMaxScaler(), StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))
return pipe
@@ -301,8 +312,9 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
# Score model on hidden test set using custom Physionet metric
finalScore = physionetScorer(model, test_features, test_classifications)
physionetScorer(model, test_features, test_classifications)
results = physionetScorer.get_results()
finalScore = results['score'][0]
sens = results['sensitivity'][0]
spec = results['specificity'][0]
@@ -317,16 +329,33 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
logging.info("--------------------------------------------------------------------------------------------")
logo = LeaveOneGroupOut()
skf = StratifiedKFold(n_splits=10, random_state=42)
skf = RepeatedStratifiedKFold(n_splits=10, random_state=42, n_repeats=10)
# Evaluate model using leave-one-out and stratified 10-fold cross-validation
logo_scores = cross_val_score(model, features, classifications, groups, physionetScorer2, logo)
results = physionetScorer2.get_results().copy()
logo_sens = results['sensitivity']
logo_sens = np.append(logo_sens, np.mean(logo_sens))
logo_spec = results['specificity']
logo_spec = np.append(logo_spec, np.mean(logo_spec))
logo_scores = np.append(logo_scores, np.mean(logo_scores))
for i in xrange(10):
cross_val_score(model, features, classifications, groups, physionetScorer2, logo)
logo_results = physionetScorer2.get_results().copy()
logo_scores = np.array(logo_results['score']).reshape((10, 6)).mean(axis=0)
logo_scores_std = np.array(logo_results['score']).reshape((10, 6)).std(axis=0)
logo_scores_mean = np.mean(logo_scores)
logo_scores_stdd = np.std(logo_scores)
logo_scores = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_scores, logo_scores_std)]
logo_scores.append(u"{:.4f}±{:.4f}".format(logo_scores_mean, logo_scores_stdd))
logo_sens = np.array(logo_results['sensitivity']).reshape((10, 6)).mean(axis=0)
logo_sens_std = np.array(logo_results['sensitivity']).reshape((10, 6)).std(axis=0)
logo_sens_mean = np.mean(logo_sens)
logo_sens_stdd = np.std(logo_sens)
logo_sens = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_sens, logo_sens_std)]
logo_sens.append(u"{:.4f}±{:.4f}".format(logo_sens_mean, logo_sens_stdd))
logo_spec = np.array(logo_results['specificity']).reshape((10, 6)).mean(axis=0)
logo_spec_std = np.array(logo_results['specificity']).reshape((10, 6)).std(axis=0)
logo_spec_mean = np.mean(logo_spec)
logo_spec_stdd = np.std(logo_spec)
logo_spec = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_spec, logo_spec_std)]
logo_spec.append(u"{:.4f}±{:.4f}".format(logo_spec_mean, logo_spec_stdd))
# Pretty print results to logger
@@ -344,13 +373,31 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
for line in logo_table.split('\n'):
logging.info(line.ljust(92))
skf_scores = cross_val_score(model, features, classifications, groups, physionetScorer3, skf)
logging.debug("Running 10 repeats of 10-fold stratified cross-validation...".ljust(92))
cross_val_score(model, features, classifications, groups, physionetScorer3, skf)
skf_results = physionetScorer3.get_results().copy()
skf_sens = skf_results['sensitivity']
skf_sens = np.append(skf_sens, np.mean(skf_sens))
skf_spec = skf_results['specificity']
skf_spec = np.append(skf_spec, np.mean(skf_spec))
skf_scores = np.append(skf_scores, np.mean(skf_scores))
skf_scores = np.array(skf_results['score']).reshape((10, 10)).mean(axis=0)
logging.info("Stratified K-fold cross-validation score: {}".format(np.mean(skf_scores)).ljust(92))
skf_scores_std = np.array(skf_results['score']).reshape((10, 10)).std(axis=0)
skf_scores_mean = np.mean(skf_scores)
skf_scores_stdd = np.std(skf_scores)
skf_scores = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_scores, skf_scores_std)]
skf_scores.append(u"{:.4f}±{:.4f}".format(skf_scores_mean, skf_scores_stdd))
skf_sens = np.array(skf_results['sensitivity']).reshape((10, 10)).mean(axis=0)
skf_sens_std = np.array(skf_results['sensitivity']).reshape((10, 10)).std(axis=0)
skf_sens_mean = np.mean(skf_sens)
skf_sens_stdd = np.std(skf_sens)
skf_sens = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_sens, skf_sens_std)]
skf_sens.append(u"{:.4f}±{:.4f}".format(skf_sens_mean, skf_sens_stdd))
skf_spec = np.array(skf_results['specificity']).reshape((10, 10)).mean(axis=0)
skf_spec_std = np.array(skf_results['specificity']).reshape((10, 10)).std(axis=0)
skf_spec_mean = np.mean(skf_spec)
skf_spec_stdd = np.std(skf_spec)
skf_spec = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_spec, skf_spec_std)]
skf_spec.append(u"{:.4f}±{:.4f}".format(skf_spec_mean, skf_spec_stdd))
table_header = list(np.arange(1, 11))
table_header.append('Mean')
@@ -361,7 +408,6 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
row3 = list(skf_sens)
row3.insert(0, 'Sensitivity')
skf_table = tabulate([row1, row2, row3], headers=table_header, tablefmt='grid', floatfmt=".4f")
logging.info("Stratified K-fold cross-validation score: {}".format(np.mean(skf_scores)).ljust(92))
for line in skf_table.split('\n'):
logging.info(line.ljust(92))
@@ -386,7 +432,7 @@ def group_train_test_split(features, classifications, groups):
g_group = groups[groups == i]
try:
train_inds, test_inds = gss.split(g_feat, g_class, g_group).next()
train_inds, test_inds = gss.split(g_feat, g_class['class']).next()
except ValueError:
raise ValueError("A database in the dataset has too few samples, at least 3 should be provided per sub-database")
+3
View File
@@ -536,6 +536,7 @@ def generateFeatures(dataFilepaths, output_dir, filename=None, parallelize=True,
if filename:
pathops.dir_must_exist(output_dir)
outputFile = os.path.join(output_dir, filename)
if not reanalyse:
try:
logger.debug("Attempting to load previously generated features from file: {0}".format(os.path.relpath(outputFile)))
@@ -553,6 +554,8 @@ def generateFeatures(dataFilepaths, output_dir, filename=None, parallelize=True,
args = []
# Find all files that are in the current dataset that have not been
# processed previously
if not dataFilepaths:
raise ValueError("No files found in dataset")
for pcgData in dataFilepaths:
if pcgData['name'] not in features.index:
args.append((pcgData['name'],pcgData['audio'],pcgData['seg']))
+4 -1
View File
@@ -239,7 +239,7 @@ python-pydown == 0.1.0
# optunity/bin/examples/python/sklearn/svc_structured.py: 6,11
# optunity/bin/examples/python/sklearn/svr.py: 7,12
# optunity/docs/examples/python/sklearn/svc.py: 3
scikit_learn == 0.18.2
scikit_learn == 0.19.0
# generateFeatures.py: 29
# mlxtend/mlxtend/classifier/multilayerperceptron.py: 11
@@ -270,3 +270,6 @@ tabulate == 0.7.7
# optunity/bin/examples/python/theano/logistic_regression.py: 2,3
theano == 0.10.0b1
imbalanced-learn==0.2.1
+4 -1
View File
@@ -11,7 +11,10 @@ def groupResample(features, classification, mix=0.5):
groups = generateGroups(features)
resampledFeatures = []
resampledClassifications = []
groupCount = np.max(groups)+1
try:
groupCount = np.max(groups)+1
except:
raise ValueError("No sample features were generated/loaded from file...")
clusters = []
for i in xrange(groupCount):