Multiprocessing over features rather than CV folds in Sequential Feature Selection (addressing #191) (#193)
This commit is contained in:
committed by
Sebastian Raschka
parent
89a2a0e61c
commit
1b0decf71f
@@ -9,6 +9,7 @@ The CHANGELOG for the current development version is available at
|
||||
### Version 0.6.1 (TBD)
|
||||
|
||||
|
||||
|
||||
##### Downloads
|
||||
|
||||
- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.6.1.zip)
|
||||
@@ -24,6 +25,8 @@ The CHANGELOG for the current development version is available at
|
||||
- `plot_decision_regions` now supports plotting decision regions for more than 2 training features. (via [James Bourbeau](https://github.com/jrbourbeau)).
|
||||
|
||||
|
||||
- Parallel execution in `mlxtend.feature_selection.SequentialFeatureSelector` and `mlxtend.feature_selection.ExhaustiveFeatureSelector` is now performed over different feature subsets instead of the different cross-validation folds to better utilize machines with multiple processors if the number of features is large ([#193](https://github.com/rasbt/mlxtend/pull/193), via [@whalebot-helmsman](https://github.com/whalebot-helmsman)).
|
||||
|
||||
##### Bug Fixes
|
||||
|
||||
- `SequentialFeatureSelector` now correctly accepts a `None` argument for the `scoring` parameter to infer the default scoring metric from scikit-learn classifiers and regressors.
|
||||
|
||||
@@ -20,6 +20,21 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.base import MetaEstimatorMixin
|
||||
from ..externals.name_estimators import _name_estimators
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.externals.joblib import Parallel, delayed
|
||||
|
||||
|
||||
def _calc_score(selector, X, y, indices):
    """Score one candidate feature subset for a feature selector.

    This function is called through ``delayed(_calc_score)`` inside a
    ``Parallel`` run, one call per candidate subset.

    Parameters
    ----------
    selector : object
        Selector whose ``cv``, ``est_``, ``scorer`` and ``pre_dispatch``
        attributes are read here.
    X : array, indexable as ``X[:, indices]``
        Feature matrix (presumably [n_samples, n_features] -- confirm
        against the caller).
    y : array-like
        Target values, passed through to the estimator and the scorer.
    indices : tuple of int
        Column indices of the feature subset to evaluate.

    Returns
    -------
    indices : tuple of int
        The `indices` argument, returned unchanged so the caller can pair
        every result with the subset that produced it.
    scores : numpy array
        The result of ``cross_val_score`` if ``selector.cv`` is truthy;
        otherwise a one-element array holding the scorer's result on the
        same data the estimator was fitted on.

    """
    # Slice the candidate columns once; both branches reuse the result
    # instead of re-indexing X for every call.
    X_subset = X[:, indices]

    if selector.cv:
        # Cross-validation is kept serial (n_jobs=1): the caller
        # parallelizes over feature subsets instead of over folds.
        scores = cross_val_score(selector.est_,
                                 X_subset, y,
                                 cv=selector.cv,
                                 scoring=selector.scorer,
                                 n_jobs=1,
                                 pre_dispatch=selector.pre_dispatch)
    else:
        # No cross-validation: fit and score on the same data.
        selector.est_.fit(X_subset, y)
        scores = np.array([selector.scorer(selector.est_, X_subset, y)])

    return indices, scores
|
||||
|
||||
|
||||
class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
@@ -51,10 +66,11 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
otherwise.
|
||||
No cross-validation if cv is None, False, or 0.
|
||||
n_jobs : int (default: 1)
|
||||
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
|
||||
The number of CPUs to use for evaluating different feature subsets
|
||||
in parallel. -1 means 'all CPUs'.
|
||||
pre_dispatch : int, or string (default: '2*n_jobs')
|
||||
Controls the number of jobs that get dispatched
|
||||
during parallel execution in cross_val_score.
|
||||
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
|
||||
Reducing this number can be useful to avoid an explosion of
|
||||
memory consumption when more jobs get dispatched than CPUs can process.
|
||||
This parameter can be:
|
||||
@@ -147,8 +163,12 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
|
||||
self.subsets_ = {}
|
||||
all_comb = len(candidates)
|
||||
for iteration, c in enumerate(candidates):
|
||||
cv_scores = self._calc_score(X=X, y=y, indices=c)
|
||||
n_jobs = min(self.n_jobs, all_comb)
|
||||
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
|
||||
work = enumerate(parallel(delayed(_calc_score)(self, X, y, c)
|
||||
for c in candidates))
|
||||
|
||||
for iteration, (c, cv_scores) in work:
|
||||
|
||||
self.subsets_[iteration] = {'feature_idx': c,
|
||||
'cv_scores': cv_scores,
|
||||
@@ -173,19 +193,6 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
self.fitted = True
|
||||
return self
|
||||
|
||||
def _calc_score(self, X, y, indices):
    """Score the feature subset `indices` with this selector's estimator.

    Parameters
    ----------
    X : array, indexable as ``X[:, indices]``
        Feature matrix (presumably [n_samples, n_features] -- confirm
        against the caller).
    y : array-like
        Target values, passed through to the estimator and the scorer.
    indices : tuple of int
        Column indices of the feature subset to evaluate.

    Returns
    -------
    scores : numpy array
        The result of ``cross_val_score`` if ``self.cv`` is truthy;
        otherwise a one-element array holding the scorer's result on the
        same data the estimator was fitted on.

    """
    # Slice the candidate columns once; both branches reuse the result
    # instead of re-indexing X for every call.
    X_subset = X[:, indices]

    if self.cv:
        # Parallelism (if any) is applied across the CV folds here.
        scores = cross_val_score(self.est_,
                                 X_subset, y,
                                 cv=self.cv,
                                 scoring=self.scorer,
                                 n_jobs=self.n_jobs,
                                 pre_dispatch=self.pre_dispatch)
    else:
        # No cross-validation: fit and score on the same data.
        self.est_.fit(X_subset, y)
        scores = np.array([self.scorer(self.est_, X_subset, y)])

    return scores
|
||||
|
||||
def transform(self, X):
|
||||
"""Return the best selected features from X.
|
||||
|
||||
|
||||
@@ -21,6 +21,21 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.base import MetaEstimatorMixin
|
||||
from ..externals.name_estimators import _name_estimators
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.externals.joblib import Parallel, delayed
|
||||
|
||||
|
||||
def _calc_score(selector, X, y, indices):
    """Score one candidate feature subset for a feature selector.

    This function is called directly and through ``delayed(_calc_score)``
    inside a ``Parallel`` run, one call per candidate subset.

    Parameters
    ----------
    selector : object
        Selector whose ``cv``, ``est_``, ``scorer`` and ``pre_dispatch``
        attributes are read here.
    X : array, indexable as ``X[:, indices]``
        Feature matrix (presumably [n_samples, n_features] -- confirm
        against the caller).
    y : array-like
        Target values, passed through to the estimator and the scorer.
    indices : tuple of int
        Column indices of the feature subset to evaluate.

    Returns
    -------
    indices : tuple of int
        The `indices` argument, returned unchanged so the caller can pair
        every result with the subset that produced it.
    scores : numpy array
        The result of ``cross_val_score`` if ``selector.cv`` is truthy;
        otherwise a one-element array holding the scorer's result on the
        same data the estimator was fitted on.

    """
    # Slice the candidate columns once; both branches reuse the result
    # instead of re-indexing X for every call.
    X_subset = X[:, indices]

    if selector.cv:
        # Cross-validation is kept serial (n_jobs=1): the caller
        # parallelizes over feature subsets instead of over folds.
        scores = cross_val_score(selector.est_,
                                 X_subset, y,
                                 cv=selector.cv,
                                 scoring=selector.scorer,
                                 n_jobs=1,
                                 pre_dispatch=selector.pre_dispatch)
    else:
        # No cross-validation: fit and score on the same data.
        selector.est_.fit(X_subset, y)
        scores = np.array([selector.scorer(selector.est_, X_subset, y)])

    return indices, scores
|
||||
|
||||
|
||||
class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
@@ -69,10 +84,11 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
exclusion/inclusion if floating=True and
|
||||
algorithm gets stuck in cycles.
|
||||
n_jobs : int (default: 1)
|
||||
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
|
||||
The number of CPUs to use for evaluating different feature subsets
|
||||
in parallel. -1 means 'all CPUs'.
|
||||
pre_dispatch : int, or string (default: '2*n_jobs')
|
||||
Controls the number of jobs that get dispatched
|
||||
during parallel execution in cross_val_score.
|
||||
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
|
||||
Reducing this number can be useful to avoid an explosion of
|
||||
memory consumption when more jobs get dispatched than CPUs can process.
|
||||
This parameter can be:
|
||||
@@ -222,7 +238,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
k_to_select = self.k_features[0]
|
||||
k_idx = tuple(range(X.shape[1]))
|
||||
k = len(k_idx)
|
||||
k_score = self._calc_score(X, y, k_idx)
|
||||
k_idx, k_score = _calc_score(self, X, y, k_idx)
|
||||
self.subsets_[k] = {
|
||||
'feature_idx': k_idx,
|
||||
'cv_scores': k_score,
|
||||
@@ -325,19 +341,6 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
stuck = True
|
||||
return stuck
|
||||
|
||||
def _calc_score(self, X, y, indices):
    """Score the feature subset `indices` with this selector's estimator.

    Parameters
    ----------
    X : array, indexable as ``X[:, indices]``
        Feature matrix (presumably [n_samples, n_features] -- confirm
        against the caller).
    y : array-like
        Target values, passed through to the estimator and the scorer.
    indices : tuple of int
        Column indices of the feature subset to evaluate.

    Returns
    -------
    scores : numpy array
        The result of ``cross_val_score`` if ``self.cv`` is truthy;
        otherwise a one-element array holding the scorer's result on the
        same data the estimator was fitted on.

    """
    # Slice the candidate columns once; both branches reuse the result
    # instead of re-indexing X for every call.
    X_subset = X[:, indices]

    if self.cv:
        # Parallelism (if any) is applied across the CV folds here.
        scores = cross_val_score(self.est_,
                                 X_subset, y,
                                 cv=self.cv,
                                 scoring=self.scorer,
                                 n_jobs=self.n_jobs,
                                 pre_dispatch=self.pre_dispatch)
    else:
        # No cross-validation: fit and score on the same data.
        self.est_.fit(X_subset, y)
        scores = np.array([self.scorer(self.est_, X_subset, y)])

    return scores
|
||||
|
||||
def _inclusion(self, orig_set, subset, X, y):
|
||||
all_avg_scores = []
|
||||
all_cv_scores = []
|
||||
@@ -345,12 +348,19 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
res = (None, None, None)
|
||||
remaining = orig_set - subset
|
||||
if remaining:
|
||||
for feature in remaining:
|
||||
new_subset = tuple(subset | {feature})
|
||||
cv_scores = self._calc_score(X, y, new_subset)
|
||||
features = len(remaining)
|
||||
n_jobs = min(self.n_jobs, features)
|
||||
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
|
||||
pre_dispatch=self.pre_dispatch)
|
||||
work = parallel(delayed(_calc_score)
|
||||
(self, X, y, tuple(subset | {feature}))
|
||||
for feature in remaining)
|
||||
|
||||
for new_subset, cv_scores in work:
|
||||
all_avg_scores.append(cv_scores.mean())
|
||||
all_cv_scores.append(cv_scores)
|
||||
all_subsets.append(new_subset)
|
||||
|
||||
best = np.argmax(all_avg_scores)
|
||||
res = (all_subsets[best],
|
||||
all_avg_scores[best],
|
||||
@@ -364,13 +374,19 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
all_avg_scores = []
|
||||
all_cv_scores = []
|
||||
all_subsets = []
|
||||
for p in combinations(feature_set, r=n - 1):
|
||||
if fixed_feature and fixed_feature not in set(p):
|
||||
continue
|
||||
cv_scores = self._calc_score(X, y, p)
|
||||
features = n
|
||||
n_jobs = min(self.n_jobs, features)
|
||||
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
|
||||
pre_dispatch=self.pre_dispatch)
|
||||
work = parallel(delayed(_calc_score)(self, X, y, p)
|
||||
for p in combinations(feature_set, r=n - 1)
|
||||
if not fixed_feature or fixed_feature in set(p))
|
||||
|
||||
for p, cv_scores in work:
|
||||
all_avg_scores.append(cv_scores.mean())
|
||||
all_cv_scores.append(cv_scores)
|
||||
all_subsets.append(p)
|
||||
|
||||
best = np.argmax(all_avg_scores)
|
||||
res = (all_subsets[best],
|
||||
all_avg_scores[best],
|
||||
|
||||
@@ -3,5 +3,3 @@ numpy>=1.10.4
|
||||
pandas>=0.17.1
|
||||
scikit-learn>=0.18
|
||||
matplotlib>=1.5.1
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user