Move to nanmean instead of mean (#211)

This commit is contained in:
mrkaiser
2017-07-07 13:16:02 -04:00
committed by Sebastian Raschka
parent f28bc8bc0f
commit d1fdeea614
3 changed files with 83 additions and 23 deletions
+1 -1
View File
@@ -20,7 +20,7 @@ The CHANGELOG for the current development version is available at
##### Changes
- /
- `SFS` now uses `np.nanmean` instead of the regular mean so that scorers that may return `np.nan` are supported
##### Bug Fixes
@@ -10,7 +10,6 @@ import datetime
import numpy as np
import scipy as sp
import scipy.stats
import warnings
import sys
from copy import deepcopy
from itertools import combinations
@@ -242,7 +241,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
'avg_score': k_score.mean()
'avg_score': np.nanmean(k_score)
}
print(' k_to_select', k_to_select)
best_subset = None
@@ -359,7 +358,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
for feature in remaining)
for new_subset, cv_scores in work:
all_avg_scores.append(cv_scores.mean())
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(new_subset)
@@ -385,7 +384,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
if not fixed_feature or fixed_feature in set(p))
for p, cv_scores in work:
all_avg_scores.append(cv_scores.mean())
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(p)
@@ -5,22 +5,34 @@
# License: BSD 3 clause
import sys
import numpy as np
from numpy import nan
from numpy.testing import assert_almost_equal
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import SoftmaxRegression
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.utils import assert_raises
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from mlxtend.classifier import SoftmaxRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.utils import assert_raises
def dict_compare_utility(d1, d2):
def nan_roc_auc_score(y_true, y_score, average='macro', sample_weight=None):
    """ROC AUC score that is NaN unless `y_true` has exactly two classes.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_score : array-like
        Target scores, forwarded unchanged to `roc_auc_score`.
    average : str (default: 'macro')
        Forwarded unchanged to `roc_auc_score`.
    sample_weight : array-like or None (default: None)
        Forwarded unchanged to `roc_auc_score`.

    Returns
    -------
    score : float
        `np.nan` if the number of distinct values in `y_true` is not 2,
        otherwise the result of `roc_auc_score`.

    """
    n_distinct_labels = len(np.unique(y_true))
    if n_distinct_labels == 2:
        return roc_auc_score(y_true, y_score,
                             average=average, sample_weight=sample_weight)
    return np.nan
def dict_compare_utility(d1, d2, decimal=3):
assert d1.keys() == d2.keys(), "%s != %s" % (d1, d2)
for i in d1:
err_msg = ("d1[%s]['feature_idx']"
@@ -28,12 +40,12 @@ def dict_compare_utility(d1, d2):
assert d1[i]['feature_idx'] == d1[i]["feature_idx"], err_msg
assert_almost_equal(d1[i]['avg_score'],
d2[i]['avg_score'],
decimal=3,
decimal=decimal,
err_msg=("d1[%s]['avg_score']"
" != d2[%s]['avg_score']" % (i, i)))
assert_almost_equal(d1[i]['cv_scores'],
d2[i]['cv_scores'],
decimal=3,
decimal=decimal,
err_msg=("d1[%s]['cv_scores']"
" != d2[%s]['cv_scores']" % (i, i)))
@@ -46,7 +58,7 @@ def test_run_default():
sfs = SFS(estimator=knn,
verbose=0)
sfs.fit(X, y)
assert sfs.k_feature_idx_ == (3, )
assert sfs.k_feature_idx_ == (3,)
def test_kfeatures_type_1():
@@ -191,6 +203,56 @@ def test_knn_cv3():
dict_compare_utility(d1=expect, d2=sfs1.subsets_)
def test_knn_rbf_groupkfold():
    """Check that SFS copes with a scorer that returns NaN for a CV fold.

    The iris samples are relabelled and split into three groups:
    samples of class 0 go to 'attribute_A' with label 0, all other
    samples are randomly assigned to 'attribute_B' or 'attribute_C'
    and receive a random 0/1 label. The pre-computed GroupKFold splits
    and a scorer built from `nan_roc_auc_score` are then passed to SFS,
    and the resulting `subsets_` are compared against reference values.
    """
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    iris = load_iris()
    X = iris.data
    forest = RandomForestClassifier(n_estimators=100)

    # NOTE(review): neither the random draws below nor the forest are
    # seeded in this function, so results vary between runs; this is
    # presumably why the final comparison uses decimal=1. Seeding would
    # require regenerating the expected values.
    groups = []
    y_new = []
    for target in iris['target']:
        if target == 0:
            groups.append('attribute_A')
            y_new.append(0)
        if target == 1 or target == 2:
            # First draw picks the group, second draw picks the label.
            throw = np.random.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = np.random.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)

    # Compare by value: `is` on an int literal tests identity, not equality.
    y_new_bool = [item == 1 for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.488, nan, 0.51]), 'avg_score': 0.499047,
            'feature_idx': (3,)},
        2: {'cv_scores': np.array([0.54563, nan, 0.585]), 'avg_score': 0.56531,
            'feature_idx': (2, 3)},
        3: {'cv_scores': np.array([0.48875661, nan, 0.515]),
            'avg_score': 0.50187,
            'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d1=expect, d2=sfs1.subsets_, decimal=1)
def test_knn_option_sfs():
iris = load_iris()
X = iris.data
@@ -452,7 +514,7 @@ def test_clone_params_pass():
verbose=0,
n_jobs=1)
sfs1 = sfs1.fit(X, y)
assert(sfs1.k_feature_idx_ == (1, 3))
assert (sfs1.k_feature_idx_ == (1, 3))
def test_transform_not_fitted():
@@ -528,7 +590,6 @@ def test_keyboard_interrupt():
def test_gridsearch():
iris = load_iris()
X = iris.data
y = iris.target
@@ -544,9 +605,9 @@ def test_gridsearch():
('knn', knn)])
param_grid = [
{'sfs__k_features': [1, 2, 3, 4],
'sfs__estimator__n_neighbors': [1, 2, 3, 4]}
]
{'sfs__k_features': [1, 2, 3, 4],
'sfs__estimator__n_neighbors': [1, 2, 3, 4]}
]
gs = GridSearchCV(estimator=pipe,
param_grid=param_grid,