Merge branch 'dev' of github.com:Pezz89/QMUL_Final_Project into dev

2017-09-21 10:00:49 +01:00
parent dfd5fc7465 b6ad0c0977
commit 45c93dfbbf
6 changed files with 86 additions and 30 deletions
@@ -23,6 +23,7 @@ to the user's path and is accessible from the commandline)
 - G++ >= 4.9.X                  ([sudo] apt-get install g++-4.9 | brew install gcc@4.9)
 - Python 2.7.11                 ([sudo] apt-get install python | brew install python)
 - Pip (Python's package manager - not always included with python distribution)
+- multitail                     (Not essential, used for viewing logs)

 (It is likely that the code will run on versions other than those stated; however,
 this is not guaranteed)
@@ -57,7 +58,7 @@ The included scripts are:

 run_demo.sh
 Complete run of system from start to finish: Trains a model on the
-demo_dataset, optimizes for to evaluation, picks 3 features and scores
+demo_dataset, optimizes for 2 evaluations, picks 3 features and scores
 performance using metrics described in the report.

 run_optimized_model_demo.sh 
@@ -74,13 +75,13 @@ Train a model from scratch on the full dataset. This will take a considerable
 amount of time and requires significant computing power. This demonstrates the
 method used for development of the final algorithm.

+view_logs
+Opens the log files created when running the program. It is advised that this is
+run during particle swarm optimisation, as this is when threaded logging occurs.
+
 ./src/main.py --help
 The underlying interface used for training, optimization and scoring of models.
 Running the help flag displays a list of all arguments available to the user.
 This can be used for the generation of new models on any dataset (however, it
 is highly recommended that the full Physionet dataset is used for best results:
 https://physionet.org/physiobank/database/challenge/2016/)
-
-Although this script is fully functional, with documented argument parser, it
-was not intended for use by anyone other than the author. As such, errors as a
-result of unexpected user input are likely and may not be handled gracefully
@@ -38,6 +38,7 @@ x = ['A', 'B', 'C', 'D', 'E', 'F']
 plt.xticks(np.arange(6), x)
 plt.title('Leave-one-out Specificity');
 plt.tight_layout()
+plt.show()

 ########################################################################
 fig = plt.figure() # create a plot figure
@@ -69,9 +70,8 @@ e = np.array([0.0293, 0.0267, 0.0208, 0.0280, 0.0226, 0.0214, 0.0229, 0.0206, 0.

 plt.errorbar(x, y, yerr=e, fmt='o', color='black', ecolor='darkgray', elinewidth=3, capsize=0)
 plt.xticks(x)
-plt.xticks(x)
+plt.xticks(np.arange(10), x)
 plt.tight_layout()

-plt.show()
 plt.title('10-fold Specificity');
 fig.set_size_inches(9*1.3, 3*1.3*3)
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import division

 ################################################################################
@@ -12,16 +13,16 @@ import sys
 import multiprocessing
 import six
 import textwrap
+import warnings
 from tabulate import tabulate

 ################################################################################
 # Scikit-Learn imports
 ################################################################################
-from sklearn.model_selection import cross_val_score, GroupKFold, train_test_split, StratifiedKFold, GroupShuffleSplit, StratifiedShuffleSplit, LeaveOneGroupOut
+from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, StratifiedShuffleSplit, LeaveOneGroupOut
 from sklearn.metrics.scorer import make_scorer
 from sklearn import preprocessing
 from sklearn.externals import joblib
-from sklearn.pipeline import Pipeline

 # Potential classifier models

@@ -43,6 +44,13 @@ from mlxtend.classifier import StackingCVClassifier

 from mlxtend.feature_selection import SequentialFeatureSelector as SFS

+################################################################################
+# Resampling library
+################################################################################
+from imblearn.pipeline import make_pipeline
+from imblearn.over_sampling import SMOTE
+from imblearn.under_sampling import ClusterCentroids
+
 ################################################################################
 # Particle swarm optimization library
 ################################################################################
@@ -61,6 +69,9 @@ from multiscorer import MultiScorer


 logger = logging.getLogger(__name__)
+logging.getLogger("imblearn").setLevel(logging.CRITICAL)
+
+warnings.filterwarnings("ignore", category=UserWarning, module='imblearn')
 # Generate random seeds to ensure reproducible performance
 random_state = np.random.RandomState(42)
 np.random.seed(42)
@@ -154,8 +165,8 @@ def buildModel(
    # Create an imblearn pipeline using a ClusterCentroids under-sampler, an
    # imputer to handle NaN values, a scaler for ensuring all values are in the
    # range of 0-1 and the final stacking classifier
-    pipe_components = [("imputer", preprocessing.Imputer()), ("scaler", preprocessing.MinMaxScaler()), ("model", StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))]
-    pipe = Pipeline(pipe_components)
+    #pipe_components = [("resampler", SMOTE()), ("imputer", preprocessing.Imputer()), ("scaler", preprocessing.MinMaxScaler()), ("model", StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))]
+    pipe = make_pipeline(ClusterCentroids(), preprocessing.Imputer(), preprocessing.MinMaxScaler(), StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, use_probas=True))

    return pipe

@@ -301,8 +312,9 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_


    # Score model on hidden test set using custom Physionet metric
-    finalScore = physionetScorer(model, test_features, test_classifications)
+    physionetScorer(model, test_features, test_classifications)
    results = physionetScorer.get_results()
+    finalScore = results['score'][0]
    sens = results['sensitivity'][0]
    spec = results['specificity'][0]

@@ -317,16 +329,33 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
    logging.info("--------------------------------------------------------------------------------------------")

    logo = LeaveOneGroupOut()
-    skf = StratifiedKFold(n_splits=10, random_state=42)
+    skf = RepeatedStratifiedKFold(n_splits=10, random_state=42, n_repeats=10)

    # Evaluate model using leave-one-out and stratified 10-fold cross-validation
-    logo_scores = cross_val_score(model, features, classifications, groups, physionetScorer2, logo)
-    results = physionetScorer2.get_results().copy()
-    logo_sens = results['sensitivity']
-    logo_sens = np.append(logo_sens, np.mean(logo_sens))
-    logo_spec = results['specificity']
-    logo_spec = np.append(logo_spec, np.mean(logo_spec))
-    logo_scores = np.append(logo_scores, np.mean(logo_scores))
+    for i in xrange(10):
+        cross_val_score(model, features, classifications, groups, physionetScorer2, logo)
+    logo_results = physionetScorer2.get_results().copy()
+
+    logo_scores = np.array(logo_results['score']).reshape((10, 6)).mean(axis=0)
+    logo_scores_std = np.array(logo_results['score']).reshape((10, 6)).std(axis=0)
+    logo_scores_mean = np.mean(logo_scores)
+    logo_scores_stdd = np.std(logo_scores)
+    logo_scores = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_scores, logo_scores_std)]
+    logo_scores.append(u"{:.4f}±{:.4f}".format(logo_scores_mean, logo_scores_stdd))
+
+    logo_sens = np.array(logo_results['sensitivity']).reshape((10, 6)).mean(axis=0)
+    logo_sens_std = np.array(logo_results['sensitivity']).reshape((10, 6)).std(axis=0)
+    logo_sens_mean = np.mean(logo_sens)
+    logo_sens_stdd = np.std(logo_sens)
+    logo_sens = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_sens, logo_sens_std)]
+    logo_sens.append(u"{:.4f}±{:.4f}".format(logo_sens_mean, logo_sens_stdd))
+
+    logo_spec = np.array(logo_results['specificity']).reshape((10, 6)).mean(axis=0)
+    logo_spec_std = np.array(logo_results['specificity']).reshape((10, 6)).std(axis=0)
+    logo_spec_mean = np.mean(logo_spec)
+    logo_spec_stdd = np.std(logo_spec)
+    logo_spec = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(logo_spec, logo_spec_std)]
+    logo_spec.append(u"{:.4f}±{:.4f}".format(logo_spec_mean, logo_spec_stdd))


    # Pretty print results to logger
@@ -344,13 +373,31 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
    for line in logo_table.split('\n'):
        logging.info(line.ljust(92))

-    skf_scores = cross_val_score(model, features, classifications, groups, physionetScorer3, skf)
+    logging.debug("Running 10 repeats of 10-fold stratified cross-validation...".ljust(92))
+    cross_val_score(model, features, classifications, groups, physionetScorer3, skf)
    skf_results = physionetScorer3.get_results().copy()
-    skf_sens = skf_results['sensitivity']
-    skf_sens = np.append(skf_sens, np.mean(skf_sens))
-    skf_spec = skf_results['specificity']
-    skf_spec = np.append(skf_spec, np.mean(skf_spec))
-    skf_scores = np.append(skf_scores, np.mean(skf_scores))
+
+    skf_scores = np.array(skf_results['score']).reshape((10, 10)).mean(axis=0)
+    logging.info("Stratified K-fold cross-validation score:    {}".format(np.mean(skf_scores)).ljust(92))
+    skf_scores_std = np.array(skf_results['score']).reshape((10, 10)).std(axis=0)
+    skf_scores_mean = np.mean(skf_scores)
+    skf_scores_stdd = np.std(skf_scores)
+    skf_scores = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_scores, skf_scores_std)]
+    skf_scores.append(u"{:.4f}±{:.4f}".format(skf_scores_mean, skf_scores_stdd))
+
+    skf_sens = np.array(skf_results['sensitivity']).reshape((10, 10)).mean(axis=0)
+    skf_sens_std = np.array(skf_results['sensitivity']).reshape((10, 10)).std(axis=0)
+    skf_sens_mean = np.mean(skf_sens)
+    skf_sens_stdd = np.std(skf_sens)
+    skf_sens = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_sens, skf_sens_std)]
+    skf_sens.append(u"{:.4f}±{:.4f}".format(skf_sens_mean, skf_sens_stdd))
+
+    skf_spec = np.array(skf_results['specificity']).reshape((10, 10)).mean(axis=0)
+    skf_spec_std = np.array(skf_results['specificity']).reshape((10, 10)).std(axis=0)
+    skf_spec_mean = np.mean(skf_spec)
+    skf_spec_stdd = np.std(skf_spec)
+    skf_spec = [u"{0:.4f}±{1:.4f}".format(x, y) for x, y in zip(skf_spec, skf_spec_std)]
+    skf_spec.append(u"{:.4f}±{:.4f}".format(skf_spec_mean, skf_spec_stdd))

    table_header = list(np.arange(1, 11))
    table_header.append('Mean')
@@ -361,7 +408,6 @@ def scoreOptimizedModel(features, classifications, groups, train_features, test_
    row3 = list(skf_sens)
    row3.insert(0, 'Sensitivity')
    skf_table = tabulate([row1, row2, row3], headers=table_header, tablefmt='grid', floatfmt=".4f")
-    logging.info("Stratified K-fold cross-validation score:    {}".format(np.mean(skf_scores)).ljust(92))
    for line in skf_table.split('\n'):
        logging.info(line.ljust(92))

@@ -386,7 +432,7 @@ def group_train_test_split(features, classifications, groups):
        g_group = groups[groups == i]

        try:
-            train_inds, test_inds = gss.split(g_feat, g_class, g_group).next()
+            train_inds, test_inds = gss.split(g_feat, g_class['class']).next()
        except ValueError:
            raise ValueError("A database in the dataset has too few samples, at least 3 should be provided per sub-database")

@@ -536,6 +536,7 @@ def generateFeatures(dataFilepaths, output_dir, filename=None, parallelize=True,
    if filename:
        pathops.dir_must_exist(output_dir)
        outputFile = os.path.join(output_dir, filename)
+
        if not reanalyse:
            try:
                logger.debug("Attempting to load previously generated features from file: {0}".format(os.path.relpath(outputFile)))
@@ -553,6 +554,8 @@ def generateFeatures(dataFilepaths, output_dir, filename=None, parallelize=True,
    args = []
    # Find all files that are in the current dataset that have not been
    # processed previously
+    if not dataFilepaths:
+        raise ValueError("No files found in dataset")
    for pcgData in dataFilepaths:
        if pcgData['name'] not in features.index:
            args.append((pcgData['name'],pcgData['audio'],pcgData['seg']))
@@ -239,7 +239,7 @@ python-pydown == 0.1.0
 # optunity/bin/examples/python/sklearn/svc_structured.py: 6,11
 # optunity/bin/examples/python/sklearn/svr.py: 7,12
 # optunity/docs/examples/python/sklearn/svc.py: 3
-scikit_learn == 0.18.2
+scikit_learn == 0.19.0

 # generateFeatures.py: 29
 # mlxtend/mlxtend/classifier/multilayerperceptron.py: 11
@@ -270,3 +270,6 @@ tabulate == 0.7.7

 # optunity/bin/examples/python/theano/logistic_regression.py: 2,3
 theano == 0.10.0b1
+
+
+imbalanced-learn==0.2.1
@@ -11,7 +11,10 @@ def groupResample(features, classification, mix=0.5):
    groups = generateGroups(features)
    resampledFeatures = []
    resampledClassifications = []
-    groupCount = np.max(groups)+1
+    try:
+        groupCount = np.max(groups)+1
+    except:
+        raise ValueError("No sample features were generated/loaded from file...")

    clusters = []
    for i in xrange(groupCount):