Multiprocessing over features rather than CV folds in Sequential Feature Selection (addressing #191) (#193)

2017-05-19 01:07:28 +03:00
parent 89a2a0e61c
commit 1b0decf71f
4 changed files with 66 additions and 42 deletions
@@ -9,6 +9,7 @@ The CHANGELOG for the current development version is available at
 ### Version 0.6.1 (TBD)


+
 ##### Downloads

 - [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.6.1.zip)
@@ -24,6 +25,8 @@ The CHANGELOG for the current development version is available at
 - `plot_decision_regions` now supports plotting decision regions for more than 2 training features. (via [James Bourbeau](https://github.com/jrbourbeau)).


+- Parallel execution in `mlxtend.feature_selection.SequentialFeatureSelector` and `mlxtend.feature_selection.ExhaustiveFeatureSelector` is now performed over the candidate feature subsets instead of over the cross-validation folds, which makes better use of multi-core machines when the number of features is large ([#193](https://github.com/rasbt/mlxtend/pull/193), via [@whalebot-helmsman](https://github.com/whalebot-helmsman)).
+
 ##### Bug Fixes

 - `SequentialFeatureSelector` now correctly accepts a `None` argument for the `scoring` parameter to infer the default scoring metric from scikit-learn classifiers and regressors.
@@ -20,6 +20,21 @@ from sklearn.base import BaseEstimator
 from sklearn.base import MetaEstimatorMixin
 from ..externals.name_estimators import _name_estimators
 from sklearn.model_selection import cross_val_score
+from sklearn.externals.joblib import Parallel, delayed
+
+
+def _calc_score(selector, X, y, indices):
+    if selector.cv:
+        scores = cross_val_score(selector.est_,
+                                 X[:, indices], y,
+                                 cv=selector.cv,
+                                 scoring=selector.scorer,
+                                 n_jobs=1,
+                                 pre_dispatch=selector.pre_dispatch)
+    else:
+        selector.est_.fit(X[:, indices], y)
+        scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
+    return indices, scores


 class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -51,10 +66,11 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
        otherwise.
        No cross-validation if cv is None, False, or 0.
    n_jobs : int (default: 1)
-        The number of CPUs to use for cross validation. -1 means 'all CPUs'.
+        The number of CPUs to use for evaluating different feature subsets
+        in parallel. -1 means 'all CPUs'.
    pre_dispatch : int, or string (default: '2*n_jobs')
        Controls the number of jobs that get dispatched
-        during parallel execution in cross_val_score.
+        during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
        Reducing this number can be useful to avoid an explosion of
        memory consumption when more jobs get dispatched than CPUs can process.
        This parameter can be:
@@ -147,8 +163,12 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):

        self.subsets_ = {}
        all_comb = len(candidates)
-        for iteration, c in enumerate(candidates):
-            cv_scores = self._calc_score(X=X, y=y, indices=c)
+        n_jobs = min(self.n_jobs, all_comb)
+        parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
+        work = enumerate(parallel(delayed(_calc_score)(self, X, y, c)
+                                  for c in candidates))
+
+        for iteration, (c, cv_scores) in work:

            self.subsets_[iteration] = {'feature_idx': c,
                                        'cv_scores': cv_scores,
@@ -173,19 +193,6 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
        self.fitted = True
        return self

-    def _calc_score(self, X, y, indices):
-        if self.cv:
-            scores = cross_val_score(self.est_,
-                                     X[:, indices], y,
-                                     cv=self.cv,
-                                     scoring=self.scorer,
-                                     n_jobs=self.n_jobs,
-                                     pre_dispatch=self.pre_dispatch)
-        else:
-            self.est_.fit(X[:, indices], y)
-            scores = np.array([self.scorer(self.est_, X[:, indices], y)])
-        return scores
-
    def transform(self, X):
        """Return the best selected features from X.

@@ -21,6 +21,21 @@ from sklearn.base import BaseEstimator
 from sklearn.base import MetaEstimatorMixin
 from ..externals.name_estimators import _name_estimators
 from sklearn.model_selection import cross_val_score
+from sklearn.externals.joblib import Parallel, delayed
+
+
+def _calc_score(selector, X, y, indices):
+    if selector.cv:
+        scores = cross_val_score(selector.est_,
+                                 X[:, indices], y,
+                                 cv=selector.cv,
+                                 scoring=selector.scorer,
+                                 n_jobs=1,
+                                 pre_dispatch=selector.pre_dispatch)
+    else:
+        selector.est_.fit(X[:, indices], y)
+        scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
+    return indices, scores


 class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -69,10 +84,11 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
        exclusion/inclusion if floating=True and
        algorithm gets stuck in cycles.
    n_jobs : int (default: 1)
-        The number of CPUs to use for cross validation. -1 means 'all CPUs'.
+        The number of CPUs to use for evaluating different feature subsets
+        in parallel. -1 means 'all CPUs'.
    pre_dispatch : int, or string (default: '2*n_jobs')
        Controls the number of jobs that get dispatched
-        during parallel execution in cross_val_score.
+        during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
        Reducing this number can be useful to avoid an explosion of
        memory consumption when more jobs get dispatched than CPUs can process.
        This parameter can be:
@@ -222,7 +238,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
                k_to_select = self.k_features[0]
            k_idx = tuple(range(X.shape[1]))
            k = len(k_idx)
-            k_score = self._calc_score(X, y, k_idx)
+            k_idx, k_score = _calc_score(self, X, y, k_idx)
            self.subsets_[k] = {
                'feature_idx': k_idx,
                'cv_scores': k_score,
@@ -325,19 +341,6 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
            stuck = True
        return stuck

-    def _calc_score(self, X, y, indices):
-        if self.cv:
-            scores = cross_val_score(self.est_,
-                                     X[:, indices], y,
-                                     cv=self.cv,
-                                     scoring=self.scorer,
-                                     n_jobs=self.n_jobs,
-                                     pre_dispatch=self.pre_dispatch)
-        else:
-            self.est_.fit(X[:, indices], y)
-            scores = np.array([self.scorer(self.est_, X[:, indices], y)])
-        return scores
-
    def _inclusion(self, orig_set, subset, X, y):
        all_avg_scores = []
        all_cv_scores = []
@@ -345,12 +348,19 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
        res = (None, None, None)
        remaining = orig_set - subset
        if remaining:
-            for feature in remaining:
-                new_subset = tuple(subset | {feature})
-                cv_scores = self._calc_score(X, y, new_subset)
+            features = len(remaining)
+            n_jobs = min(self.n_jobs, features)
+            parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
+                                pre_dispatch=self.pre_dispatch)
+            work = parallel(delayed(_calc_score)
+                            (self, X, y, tuple(subset | {feature}))
+                            for feature in remaining)
+
+            for new_subset, cv_scores in work:
                all_avg_scores.append(cv_scores.mean())
                all_cv_scores.append(cv_scores)
                all_subsets.append(new_subset)
+
            best = np.argmax(all_avg_scores)
            res = (all_subsets[best],
                   all_avg_scores[best],
@@ -364,13 +374,19 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
            all_avg_scores = []
            all_cv_scores = []
            all_subsets = []
-            for p in combinations(feature_set, r=n - 1):
-                if fixed_feature and fixed_feature not in set(p):
-                    continue
-                cv_scores = self._calc_score(X, y, p)
+            features = n
+            n_jobs = min(self.n_jobs, features)
+            parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
+                                pre_dispatch=self.pre_dispatch)
+            work = parallel(delayed(_calc_score)(self, X, y, p)
+                            for p in combinations(feature_set, r=n - 1)
+                            if not fixed_feature or fixed_feature in set(p))
+
+            for p, cv_scores in work:
                all_avg_scores.append(cv_scores.mean())
                all_cv_scores.append(cv_scores)
                all_subsets.append(p)
+
            best = np.argmax(all_avg_scores)
            res = (all_subsets[best],
                   all_avg_scores[best],
@@ -3,5 +3,3 @@ numpy>=1.10.4
 pandas>=0.17.1
 scikit-learn>=0.18
 matplotlib>=1.5.1
-
-