flake8 checks
This commit is contained in:
@@ -4,9 +4,24 @@ doc-warnings: yes
|
||||
|
||||
ignore-paths:
|
||||
- mlxtend/data
|
||||
- mlxtend/externals
|
||||
|
||||
ignore-patterns:
|
||||
- ^example/doc_.*\.py$
|
||||
- (^|/)docs(/|$)
|
||||
- __init__.py
|
||||
|
||||
pylint:
|
||||
options:
|
||||
dummy-variables-rgx: _$|.+_$|dummy_.+
|
||||
disable:
|
||||
- missing-docstring
|
||||
- protected-access
|
||||
- too-few-public-methods
|
||||
- too-many-arguments
|
||||
- too-many-instance-attributes
|
||||
- too-many-locals
|
||||
- too-many-public-methods
|
||||
- too-many-return-statements
|
||||
- too-many-statements
|
||||
- unpacking-non-sequence
|
||||
|
||||
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
|
||||
|
||||
|
||||
class Adaline(_BaseClassifier):
|
||||
|
||||
"""ADAptive LInear NEuron classifier.
|
||||
|
||||
Parameters
|
||||
@@ -94,9 +95,10 @@ class Adaline(_BaseClassifier):
|
||||
self.thres_ = 0.5
|
||||
|
||||
if init_weights:
|
||||
self.w_ = self._init_weights(shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.w_ = self._init_weights(
|
||||
shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
|
||||
self.cost_ = []
|
||||
|
||||
@@ -125,7 +127,7 @@ class Adaline(_BaseClassifier):
|
||||
cost = self._sum_squared_error_cost(y, self._activation(X))
|
||||
self.cost_.append(cost)
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1, cost=cost)
|
||||
self._print_progress(epoch=i + 1, cost=cost)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
@@ -12,11 +12,17 @@ from time import time
|
||||
|
||||
|
||||
class _BaseClassifier(object):
|
||||
"""Parent Class Base Classifier"""
|
||||
|
||||
"""Parent Class Base Classifier
|
||||
|
||||
A base class that is inherited by
|
||||
classifier child classes.
|
||||
|
||||
"""
|
||||
def __init__(self, print_progress=0):
|
||||
self.print_progress = print_progress
|
||||
|
||||
def fit(self, X, y):
|
||||
def fit(self, X, y, init_weights=True):
|
||||
"""Learn weight coefficients from training data.
|
||||
|
||||
Parameters
|
||||
@@ -26,12 +32,18 @@ class _BaseClassifier(object):
|
||||
n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
init_weights : bool (default: True)
|
||||
Reinitialize weights
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
|
||||
"""
|
||||
if not (init_weights is None or isinstance(init_weights, bool)):
|
||||
raise AttributeError("init_weights must be True, False, or None")
|
||||
init_weights
|
||||
self._check_arrays(X=X, y=y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
|
||||
@@ -19,6 +19,7 @@ import numpy as np
|
||||
|
||||
|
||||
class EnsembleVoteClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
|
||||
|
||||
"""Soft Voting/Majority Rule classifier for scikit-learn estimators.
|
||||
|
||||
Parameters
|
||||
@@ -168,8 +169,7 @@ class EnsembleVoteClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
|
||||
else: # 'hard' voting
|
||||
predictions = self._predict(X)
|
||||
|
||||
maj = np.apply_along_axis(
|
||||
lambda x:
|
||||
maj = np.apply_along_axis(lambda x:
|
||||
np.argmax(np.bincount(x,
|
||||
weights=self.weights)),
|
||||
axis=1,
|
||||
|
||||
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
|
||||
|
||||
|
||||
class LogisticRegression(_BaseClassifier):
|
||||
|
||||
"""Logistic regression classifier.
|
||||
|
||||
Parameters
|
||||
@@ -51,7 +52,7 @@ class LogisticRegression(_BaseClassifier):
|
||||
epoch.
|
||||
|
||||
"""
|
||||
def __init__(self, eta=0.01, epochs=50, regularization=None,
|
||||
def __init__(self, eta=0.01, epochs=50,
|
||||
l2_lambda=0.0, minibatches=1,
|
||||
random_seed=None, zero_init_weight=False,
|
||||
print_progress=0):
|
||||
@@ -88,9 +89,10 @@ class LogisticRegression(_BaseClassifier):
|
||||
raise ValueError('Supports only binary class labels 0 and 1')
|
||||
|
||||
if init_weights:
|
||||
self.w_ = self._init_weights(shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.w_ = self._init_weights(
|
||||
shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
|
||||
self.m_ = len(self.w_)
|
||||
self.cost_ = []
|
||||
@@ -117,7 +119,7 @@ class LogisticRegression(_BaseClassifier):
|
||||
cost = self._logit_cost(y, self._activation(X))
|
||||
self.cost_.append(cost)
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1, cost=cost)
|
||||
self._print_progress(epoch=i + 1, cost=cost)
|
||||
return self
|
||||
|
||||
def _predict(self, X):
|
||||
|
||||
@@ -9,11 +9,11 @@
|
||||
import numpy as np
|
||||
from .base import _BaseClassifier
|
||||
from scipy.special import expit
|
||||
import sys
|
||||
from time import time
|
||||
|
||||
|
||||
class NeuralNetMLP(_BaseClassifier):
|
||||
|
||||
""" Feedforward neural network / Multi-layer perceptron classifier.
|
||||
|
||||
Parameters
|
||||
@@ -122,11 +122,11 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
|
||||
def _initialize_weights(self):
|
||||
"""Initialize weights with small random numbers."""
|
||||
w1 = self._init_weights(shape=self.n_hidden*(self.n_features + 1),
|
||||
w1 = self._init_weights(shape=self.n_hidden * (self.n_features + 1),
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
w1 = w1.reshape(self.n_hidden, self.n_features + 1)
|
||||
w2 = self._init_weights(shape=self.n_output*(self.n_hidden + 1),
|
||||
w2 = self._init_weights(shape=self.n_output * (self.n_hidden + 1),
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
w2 = w2.reshape(self.n_output, self.n_hidden + 1)
|
||||
@@ -148,10 +148,10 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
def _add_bias_unit(self, X, how='column'):
|
||||
"""Add bias unit (column or row of 1s) to array at index 0."""
|
||||
if how == 'column':
|
||||
X_new = np.ones((X.shape[0], X.shape[1]+1))
|
||||
X_new = np.ones((X.shape[0], X.shape[1] + 1))
|
||||
X_new[:, 1:] = X
|
||||
elif how == 'row':
|
||||
X_new = np.ones((X.shape[0]+1, X.shape[1]))
|
||||
X_new = np.ones((X.shape[0] + 1, X.shape[1]))
|
||||
X_new[1:, :] = X
|
||||
else:
|
||||
raise AttributeError('how must be columns or row')
|
||||
@@ -193,12 +193,12 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
|
||||
def _L2_reg(self, lambda_, w1, w2):
|
||||
"""Compute L2-regularization cost."""
|
||||
return ((lambda_/2.0) * (np.sum(w1[:, 1:] ** 2) +
|
||||
return ((lambda_ / 2.0) * (np.sum(w1[:, 1:] ** 2) +
|
||||
np.sum(w2[:, 1:] ** 2)))
|
||||
|
||||
def _L1_reg(self, lambda_, w1, w2):
|
||||
"""Compute L1-regularization cost."""
|
||||
return ((lambda_/2.0) * (np.abs(w1[:, 1:]).sum() +
|
||||
return ((lambda_ / 2.0) * (np.abs(w1[:, 1:]).sum() +
|
||||
np.abs(w2[:, 1:]).sum()))
|
||||
|
||||
def _get_cost(self, y_enc, output, w1, w2):
|
||||
@@ -306,7 +306,7 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
for i in range(self.epochs):
|
||||
|
||||
# adaptive learning rate
|
||||
self.eta /= (1 + self.decrease_const*i)
|
||||
self.eta /= (1 + self.decrease_const * i)
|
||||
|
||||
if self.shuffle_epoch:
|
||||
idx = np.random.permutation(y_enc.shape[1])
|
||||
@@ -342,7 +342,7 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
delta_w1_prev, delta_w2_prev = delta_w1, delta_w2
|
||||
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1)
|
||||
self._print_progress(epoch=i + 1)
|
||||
|
||||
return self
|
||||
|
||||
@@ -364,7 +364,7 @@ class NeuralNetMLP(_BaseClassifier):
|
||||
a1, z2, a2, z3, a3 = self._feedforward(X,
|
||||
w1 - epsilon_ary1,
|
||||
w2)
|
||||
cost1 = self._get_cost(y_enc, a3, w1-epsilon_ary1, w2)
|
||||
cost1 = self._get_cost(y_enc, a3, w1 - epsilon_ary1, w2)
|
||||
a1, z2, a2, z3, a3 = self._feedforward(X,
|
||||
w1 + epsilon_ary1,
|
||||
w2)
|
||||
|
||||
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
|
||||
|
||||
|
||||
class Perceptron(_BaseClassifier):
|
||||
|
||||
"""Perceptron classifier.
|
||||
|
||||
Parameters
|
||||
@@ -82,9 +83,10 @@ class Perceptron(_BaseClassifier):
|
||||
' class labels {0, 1} or {-1, 1}.')
|
||||
|
||||
if init_weights:
|
||||
self.w_ = self._init_weights(shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.w_ = self._init_weights(
|
||||
shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
|
||||
self.cost_ = []
|
||||
|
||||
@@ -105,7 +107,7 @@ class Perceptron(_BaseClassifier):
|
||||
errors += int(update != 0.0)
|
||||
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1, cost=errors)
|
||||
self._print_progress(epoch=i + 1, cost=errors)
|
||||
self.cost_.append(errors)
|
||||
return self
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ from .base import _BaseClassifier
|
||||
|
||||
|
||||
class SoftmaxRegression(_BaseClassifier):
|
||||
|
||||
"""Logistic regression classifier.
|
||||
|
||||
Parameters
|
||||
@@ -72,11 +73,6 @@ class SoftmaxRegression(_BaseClassifier):
|
||||
mat[i, val] = 1
|
||||
return mat.astype(float)
|
||||
|
||||
def _init_bias(self, n_features, n_classes):
|
||||
w = np.zeros((n_features, n_classes))
|
||||
b = np.zeros(n_classes)
|
||||
return w, b
|
||||
|
||||
def _net_input(self, X, W, b):
|
||||
return (X.dot(W) + b)
|
||||
|
||||
@@ -113,13 +109,14 @@ class SoftmaxRegression(_BaseClassifier):
|
||||
if init_weights:
|
||||
self._n_classes = np.max(y) + 1
|
||||
self._n_features = X.shape[1]
|
||||
self.w_ = self._init_weights(shape=(self._n_features,
|
||||
self._n_classes),
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.b_ = self._init_weights(shape=self._n_classes,
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.w_ = self._init_weights(
|
||||
shape=(self._n_features, self._n_classes),
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.b_ = self._init_weights(
|
||||
shape=self._n_classes,
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.cost_ = []
|
||||
|
||||
n_idx = list(range(y.shape[0]))
|
||||
@@ -162,7 +159,7 @@ class SoftmaxRegression(_BaseClassifier):
|
||||
self.cost_.append(cost)
|
||||
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1, cost=cost)
|
||||
self._print_progress(epoch=i + 1, cost=cost)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ import numpy as np
|
||||
|
||||
|
||||
class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
|
||||
|
||||
"""A Stacking classifier for scikit-learn estimators for classification.
|
||||
|
||||
Parameters
|
||||
|
||||
@@ -31,7 +31,7 @@ def test_array_dimensions():
|
||||
|
||||
|
||||
def test_normal_equation():
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
ada = Adaline(epochs=30,
|
||||
eta=0.01,
|
||||
minibatches=None,
|
||||
@@ -42,7 +42,7 @@ def test_normal_equation():
|
||||
|
||||
|
||||
def test_gradient_descent():
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
ada = Adaline(epochs=30,
|
||||
eta=0.01,
|
||||
minibatches=1,
|
||||
@@ -53,7 +53,7 @@ def test_gradient_descent():
|
||||
|
||||
|
||||
def test_refit_weights():
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
ada = Adaline(epochs=15,
|
||||
eta=0.01,
|
||||
minibatches=1,
|
||||
@@ -65,7 +65,7 @@ def test_refit_weights():
|
||||
|
||||
|
||||
def test_standardized_iris_data_with_zero_weights():
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
ada = Adaline(epochs=30,
|
||||
eta=0.01,
|
||||
minibatches=1,
|
||||
@@ -77,7 +77,7 @@ def test_standardized_iris_data_with_zero_weights():
|
||||
|
||||
|
||||
def test_stochastic_gradient_descent():
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
|
||||
ada = Adaline(epochs=30,
|
||||
eta=0.01,
|
||||
minibatches=len(y),
|
||||
@@ -98,7 +98,7 @@ def test_ary_persistency_in_shuffling():
|
||||
|
||||
|
||||
def test_0_1_class():
|
||||
t1 = np.array([0.51, -0.04, 0.51])
|
||||
t1 = np.array([0.51, -0.04, 0.51])
|
||||
ada = Adaline(epochs=30,
|
||||
eta=0.01,
|
||||
minibatches=1,
|
||||
|
||||
@@ -9,8 +9,6 @@ from mlxtend.data import iris_data
|
||||
import numpy as np
|
||||
|
||||
|
||||
#### Binary
|
||||
|
||||
X, y = iris_data()
|
||||
X = X[:, [0, 3]] # sepal length and petal width
|
||||
X_bin = X[0:100] # class 0 and class 1
|
||||
@@ -77,8 +75,6 @@ def test_multi_logistic_regression_gd_weights():
|
||||
|
||||
|
||||
def test_multi_logistic_regression_gd_acc():
|
||||
t = np.array([[-0.17, -2.86, 3.51],
|
||||
[-4.85, 2.0, 0.35]])
|
||||
lr = SoftmaxRegression(epochs=200,
|
||||
eta=0.005,
|
||||
minibatches=1,
|
||||
|
||||
@@ -62,7 +62,7 @@ def plot_decision_regions(X, y, clf,
|
||||
|
||||
if not y.dtype == int:
|
||||
y = y.astype(int)
|
||||
|
||||
|
||||
# check if test data is provided
|
||||
plot_testdata = True
|
||||
if not isinstance(X_highlight, np.ndarray):
|
||||
@@ -146,7 +146,7 @@ def plot_decision_regions(X, y, clf,
|
||||
X_highlight[:, 1],
|
||||
c='',
|
||||
alpha=1.0,
|
||||
linewidth=1,
|
||||
linewidths=1,
|
||||
marker='o',
|
||||
s=80)
|
||||
else:
|
||||
@@ -154,7 +154,7 @@ def plot_decision_regions(X, y, clf,
|
||||
[0 for i in X_highlight],
|
||||
c='',
|
||||
alpha=1.0,
|
||||
linewidth=1,
|
||||
linewidths=1,
|
||||
marker='o',
|
||||
s=80)
|
||||
|
||||
|
||||
@@ -120,7 +120,7 @@ def plot_learning_curves(X_train, y_train,
|
||||
plt.ylabel('Performance ({})'.format(scoring))
|
||||
if print_model:
|
||||
plt.title('Learning Curves\n\n{}\n'.format(model))
|
||||
plt.legend(loc='best', numpoints=1)
|
||||
plt.legend(loc=legend_loc, numpoints=1)
|
||||
plt.xlim([0, 110])
|
||||
max_y = max(max(test_errors), max(training_errors))
|
||||
min_y = min(min(test_errors), min(training_errors))
|
||||
|
||||
@@ -145,9 +145,9 @@ def scoring(y_target, y_predicted, metric='error',
|
||||
elif metric == 'f1':
|
||||
pre = float(tp) / (tp + fp)
|
||||
rec = float(tp) / (fn + tp)
|
||||
res = 2.0 * (pre * rec)/(pre + rec)
|
||||
res = 2.0 * (pre * rec) / (pre + rec)
|
||||
elif metric == 'matthews_corr_coef':
|
||||
res = float(tp*tn - fp*fn)
|
||||
res = res / np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
|
||||
res = float(tp * tn - fp * fn)
|
||||
res = res / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
|
||||
|
||||
return res
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
from mlxtend.evaluate import scoring
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_metric_argument():
|
||||
"Test exception is raised when user provides invalid metric argument"
|
||||
try:
|
||||
|
||||
@@ -8,8 +8,10 @@
|
||||
|
||||
|
||||
class ColumnSelector(object):
|
||||
""" A feature selector for scikit-learn's Pipeline class that returns
|
||||
specified columns from a numpy array.
|
||||
"""Select specific columns from a data set.
|
||||
|
||||
A feature selector for scikit-learn's Pipeline class that returns
|
||||
specified columns from a numpy array.
|
||||
|
||||
"""
|
||||
def __init__(self, cols):
|
||||
|
||||
@@ -20,10 +20,10 @@ from sklearn.base import BaseEstimator
|
||||
from sklearn.base import MetaEstimatorMixin
|
||||
from sklearn.cross_validation import cross_val_score
|
||||
from ..externals.name_estimators import _name_estimators
|
||||
from ..externals import six
|
||||
|
||||
|
||||
class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
|
||||
"""Sequential Feature Selection for Classification and Regression.
|
||||
|
||||
Parameters
|
||||
@@ -133,29 +133,29 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
prev_subset = set(k_idx)
|
||||
if self.forward:
|
||||
k_idx, k_score, cv_scores = \
|
||||
self._inclusion(orig_set=orig_set,
|
||||
subset=prev_subset,
|
||||
X=X, y=y)
|
||||
self._inclusion(orig_set=orig_set,
|
||||
subset=prev_subset,
|
||||
X=X, y=y)
|
||||
else:
|
||||
k_idx, k_score, cv_scores = \
|
||||
self._exclusion(feature_set=prev_subset, X=X, y=y)
|
||||
self._exclusion(feature_set=prev_subset, X=X, y=y)
|
||||
|
||||
if self.floating and not self._is_stuck(sdq):
|
||||
(new_feature,) = set(k_idx) ^ prev_subset
|
||||
if self.forward:
|
||||
k_idx_c, k_score_c, cv_scores_c = \
|
||||
self._exclusion(feature_set=k_idx,
|
||||
fixed_feature=new_feature,
|
||||
X=X, y=y)
|
||||
self._exclusion(feature_set=k_idx,
|
||||
fixed_feature=new_feature,
|
||||
X=X, y=y)
|
||||
else:
|
||||
k_idx_c, k_score_c, cv_scores_c = \
|
||||
self._inclusion(orig_set=orig_set - {new_feature},
|
||||
subset=set(k_idx),
|
||||
X=X, y=y)
|
||||
self._inclusion(orig_set=orig_set - {new_feature},
|
||||
subset=set(k_idx),
|
||||
X=X, y=y)
|
||||
|
||||
if k_score_c and k_score_c > k_score:
|
||||
k_idx, k_score, cv_scores = \
|
||||
k_idx_c, k_score_c, cv_scores_c
|
||||
k_idx_c, k_score_c, cv_scores_c
|
||||
|
||||
k = len(k_idx)
|
||||
# floating can lead to multiple same-sized subsets
|
||||
@@ -168,7 +168,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
|
||||
if self.print_progress:
|
||||
sys.stderr.write('\rFeatures: %d/%d' % (
|
||||
len(k_idx), self.k_features))
|
||||
len(k_idx), self.k_features))
|
||||
sys.stderr.flush()
|
||||
|
||||
self.k_feature_idx_ = k_idx
|
||||
@@ -221,7 +221,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
|
||||
all_avg_scores = []
|
||||
all_cv_scores = []
|
||||
all_subsets = []
|
||||
for p in combinations(feature_set, r=n-1):
|
||||
for p in combinations(feature_set, r=n - 1):
|
||||
if fixed_feature and fixed_feature not in set(p):
|
||||
continue
|
||||
cv_scores = self._calc_score(X, y, p)
|
||||
|
||||
@@ -14,7 +14,7 @@ from itertools import cycle
|
||||
|
||||
|
||||
def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
|
||||
alpha=0.5, lw=2, legend=True, where='post', grid=True,
|
||||
alpha=0.5, lw=2, where='post', grid=True,
|
||||
count_label='Count',
|
||||
xlim='auto', ylim='auto', invert_axes=False,
|
||||
legend_loc='best', ax=None):
|
||||
@@ -37,8 +37,6 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
|
||||
Transparency level from 0.0 to 1.0.
|
||||
lw : int or float (default: 2)
|
||||
Linewidth parameter.
|
||||
legend : bool (default: True)
|
||||
Plots legend if True.
|
||||
where : {'post', 'pre', 'mid'} (default: 'post')
|
||||
Starting location of the steps.
|
||||
grid : bool (default: `True`)
|
||||
@@ -74,7 +72,7 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
|
||||
color_gen = cycle(colors)
|
||||
marker_gen = cycle(markers)
|
||||
linestyle_gen = cycle(linestyles.split(','))
|
||||
r = range(1, len(df_temp.index)+1)
|
||||
r = range(1, len(df_temp.index) + 1)
|
||||
labels = df_temp.columns
|
||||
|
||||
x_data = df_temp
|
||||
@@ -99,13 +97,13 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
|
||||
ax.set_ylim, ax.set_xlim = ax.set_xlim, ax.set_ylim
|
||||
|
||||
if ylim == 'auto':
|
||||
ax.set_ylim([np.min(y_data)-1, np.max(y_data)+1])
|
||||
ax.set_ylim([np.min(y_data) - 1, np.max(y_data) + 1])
|
||||
else:
|
||||
ax.set_ylim(ylim)
|
||||
|
||||
if xlim == 'auto':
|
||||
df_min, df_max = np.min(x_data.min()), np.max(x_data.max())
|
||||
ax.set_xlim([df_min-1, df_max+1])
|
||||
ax.set_xlim([df_min - 1, df_max + 1])
|
||||
|
||||
else:
|
||||
ax.set_xlim(xlim)
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
|
||||
class DenseTransformer(object):
|
||||
|
||||
"""Convert a sparse matrix into a dense matrix."""
|
||||
|
||||
def __init__(self, some_param=True):
|
||||
|
||||
@@ -11,6 +11,7 @@ from .transformer import TransformerObj
|
||||
|
||||
|
||||
class MeanCenterer(TransformerObj):
|
||||
|
||||
"""Column centering of vectors and matrices.
|
||||
|
||||
Attributes
|
||||
|
||||
@@ -22,7 +22,7 @@ def one_hot(y, num_labels='auto', dtype='float'):
|
||||
|
||||
Returns
|
||||
----------
|
||||
onehot : numpy.ndarray, shape = [n_classlabels]
|
||||
ary : numpy.ndarray, shape = [n_samples, n_classlabels]
|
||||
One-hot encoded array, where each sample is represented as
|
||||
a row vector in the returned array.
|
||||
|
||||
@@ -41,5 +41,11 @@ def one_hot(y, num_labels='auto', dtype='float'):
|
||||
else:
|
||||
uniq = num_labels
|
||||
if uniq == 1:
|
||||
return np.array([[0.]], dtype=dtype)
|
||||
return (np.arange(uniq) == yt[:, None]).astype(dtype)
|
||||
ary = np.array([[0.]], dtype=dtype)
|
||||
|
||||
else:
|
||||
ary = np.zeros((len(y), uniq))
|
||||
for i, val in enumerate(y):
|
||||
ary[i, val] = 1
|
||||
|
||||
return ary.astype(dtype)
|
||||
|
||||
@@ -48,8 +48,8 @@ def minmax_scaling(array, columns, min_val=0, max_val=1):
|
||||
ary_newt[:, columns] = numerator / denominator
|
||||
|
||||
if not min_val == 0 and not max_val == 1:
|
||||
ary_newt[:, columns] = (ary_newt[:, columns] *
|
||||
(max_val - min_val) + min_val)
|
||||
ary_newt[:, columns] = (ary_newt[:, columns] *
|
||||
(max_val - min_val) + min_val)
|
||||
|
||||
return ary_newt[:, columns]
|
||||
|
||||
|
||||
@@ -46,13 +46,13 @@ def test_list():
|
||||
@raises(AttributeError)
|
||||
def test_multidim_list():
|
||||
y = [[0, 1, 2, 3, 4, 2]]
|
||||
out = one_hot(y)
|
||||
one_hot(y)
|
||||
|
||||
|
||||
@raises(AttributeError)
|
||||
def test_multidim_array():
|
||||
y = np.array([[0], [1], [2], [3], [4], [2]])
|
||||
out = one_hot(y)
|
||||
one_hot(y)
|
||||
|
||||
|
||||
def test_oneclass():
|
||||
@@ -60,7 +60,7 @@ def test_oneclass():
|
||||
np.array([[0.]], dtype='float'))
|
||||
|
||||
|
||||
def test_list():
|
||||
def test_list_morelabels():
|
||||
y = [0, 1]
|
||||
expect = np.array([[1., 0., 0.],
|
||||
[0., 1., 0.]], dtype='float')
|
||||
|
||||
@@ -13,7 +13,8 @@ import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
def plot_linear_regression(X, y, model=LinearRegression(), corr_func='pearsonr',
|
||||
def plot_linear_regression(X, y, model=LinearRegression(),
|
||||
corr_func='pearsonr',
|
||||
scattercolor='blue', fit_style='k--', legend=True,
|
||||
xlim='auto'):
|
||||
"""Plot a linear regression line fit.
|
||||
|
||||
@@ -12,11 +12,17 @@ from time import time
|
||||
|
||||
|
||||
class _BaseRegressor(object):
|
||||
"""Parent Class Base Regressor"""
|
||||
|
||||
"""Parent Class Base Regressor
|
||||
|
||||
A base class that is inherited by
|
||||
regressor child classes.
|
||||
|
||||
"""
|
||||
def __init__(self, print_progress=0):
|
||||
self.print_progress = print_progress
|
||||
|
||||
def fit(self, X, y):
|
||||
def fit(self, X, y, init_weights=True):
|
||||
"""Learn weight coefficients from training data.
|
||||
|
||||
Parameters
|
||||
@@ -26,12 +32,18 @@ class _BaseRegressor(object):
|
||||
n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
init_weights : bool (default: True)
|
||||
Reinitialize weights
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
|
||||
"""
|
||||
if not (init_weights is None or isinstance(init_weights, bool)):
|
||||
raise AttributeError("init_weights must be True, False, or None")
|
||||
init_weights
|
||||
self._check_arrays(X=X, y=y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
@@ -58,8 +70,8 @@ class _BaseRegressor(object):
|
||||
|
||||
def _shuffle(self, arrays):
|
||||
"""Shuffle arrays in unison."""
|
||||
r = np.random.permutation(len(y))
|
||||
return [ary[r] for r in arrays]
|
||||
r = np.random.permutation(len(arrays[0]))
|
||||
return [ary[r] for ary in arrays]
|
||||
|
||||
def _print_progress(self, epoch, cost=None, time_interval=10):
|
||||
if self.print_progress > 0:
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
from sys import stderr
|
||||
from time import time
|
||||
from .base import _BaseRegressor
|
||||
|
||||
@@ -19,10 +18,9 @@ from .base import _BaseRegressor
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class LinearRegression(_BaseRegressor):
|
||||
|
||||
""" Ordinary least squares linear regression.
|
||||
|
||||
Parameters
|
||||
@@ -93,9 +91,10 @@ class LinearRegression(_BaseRegressor):
|
||||
|
||||
# initialize weights
|
||||
if init_weights:
|
||||
self.w_ = self._init_weights(shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
self.w_ = self._init_weights(
|
||||
shape=1 + X.shape[1],
|
||||
zero_init_weight=self.zero_init_weight,
|
||||
seed=self.random_seed)
|
||||
|
||||
self.cost_ = []
|
||||
|
||||
@@ -124,7 +123,7 @@ class LinearRegression(_BaseRegressor):
|
||||
cost = self._sum_squared_error_cost(y, self.activation(X))
|
||||
self.cost_.append(cost)
|
||||
if self.print_progress:
|
||||
self._print_progress(epoch=i+1, cost=cost)
|
||||
self._print_progress(epoch=i + 1, cost=cost)
|
||||
|
||||
return self
|
||||
|
||||
@@ -135,11 +134,6 @@ class LinearRegression(_BaseRegressor):
|
||||
w = np.dot(z, np.dot(Xb.T, y))
|
||||
return w
|
||||
|
||||
def _shuffle(self, X, y):
|
||||
"""Unison shuffling."""
|
||||
r = np.random.permutation(len(y))
|
||||
return X[r], y[r]
|
||||
|
||||
def net_input(self, X):
|
||||
"""Compute the linear net input."""
|
||||
return np.dot(X, self.w_[1:]) + self.w_[0]
|
||||
|
||||
@@ -18,6 +18,7 @@ import numpy as np
|
||||
|
||||
|
||||
class StackingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
|
||||
|
||||
"""A Stacking regressor for scikit-learn estimators for regression.
|
||||
|
||||
Parameters
|
||||
|
||||
@@ -4,12 +4,10 @@
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from mlxtend.data import boston_housing_data
|
||||
from mlxtend.regressor import StackingRegressor
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.svm import SVR
|
||||
from mlxtend.data import boston_housing_data
|
||||
import numpy as np
|
||||
from sklearn.grid_search import GridSearchCV
|
||||
from numpy.testing import assert_almost_equal
|
||||
@@ -31,7 +29,7 @@ def test_different_models():
|
||||
svr_rbf = SVR(kernel='rbf')
|
||||
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
|
||||
meta_regressor=svr_rbf)
|
||||
y_pred = stregr.fit(X1, y).predict(X1)
|
||||
stregr.fit(X1, y).predict(X1)
|
||||
mse = 0.214
|
||||
got = np.mean((stregr.predict(X1) - y) ** 2)
|
||||
assert round(got, 3) == mse
|
||||
@@ -44,7 +42,7 @@ def test_multivariate():
|
||||
svr_rbf = SVR(kernel='rbf')
|
||||
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
|
||||
meta_regressor=svr_rbf)
|
||||
y_pred = stregr.fit(X2, y).predict(X2)
|
||||
stregr.fit(X2, y).predict(X2)
|
||||
mse = 0.218
|
||||
got = np.mean((stregr.predict(X2) - y) ** 2)
|
||||
print(got)
|
||||
@@ -129,4 +127,5 @@ def test_get_coeff_fail():
|
||||
stregr = StackingRegressor(regressors=[ridge, lr],
|
||||
meta_regressor=svr_rbf)
|
||||
stregr = stregr.fit(X1, y)
|
||||
got = stregr.coef_
|
||||
r = stregr.coef_
|
||||
assert r
|
||||
|
||||
@@ -14,4 +14,4 @@ from .tokenizer import tokenizer_words_and_emoticons
|
||||
from .tokenizer import tokenizer_emoticons
|
||||
|
||||
__all__ = ["generalize_names", "generalize_names_duplcheck",
|
||||
"tokenizer_words_and_emoticons", "tokenizer_emoticons"]
|
||||
"tokenizer_words_and_emoticons", "tokenizer_emoticons"]
|
||||
|
||||
+30
-19
@@ -17,6 +17,7 @@ if sys.version_info <= (3, 0):
|
||||
" with Python 2.x,"
|
||||
" due to its unicode intricacies")
|
||||
|
||||
|
||||
def generalize_names(name, output_sep=' ', firstname_output_letters=1):
|
||||
"""Generalize a person's first and last name.
|
||||
|
||||
@@ -48,15 +49,14 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
|
||||
exc = ['van der ', 'de ', 'van ', 'von ', 'di ']
|
||||
for e in exc:
|
||||
if name.startswith(e):
|
||||
repl = e.replace(' ','')
|
||||
name = (repl + name[len(e)-1:].strip())
|
||||
repl = e.replace(' ', '')
|
||||
name = (repl + name[len(e) - 1:].strip())
|
||||
|
||||
exc = [' van der ', ' de ', ' van ', ' von ', ' di ',
|
||||
', van der ', ', de', ', van ', ', von ', ', di ']
|
||||
', van der ', ', de', ', van ', ', von ', ', di ']
|
||||
|
||||
for e in exc:
|
||||
name = name.replace(e, ' '+e.replace(' ', ''))
|
||||
|
||||
name = name.replace(e, ' ' + e.replace(' ', ''))
|
||||
|
||||
if ',' in name:
|
||||
last, first = first, last
|
||||
@@ -71,12 +71,15 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
|
||||
if sys.version_info.major == 2:
|
||||
name = name.decode('utf-8')
|
||||
|
||||
name = ''.join(x for x in unicodedata.normalize('NFKD', name) if x in string.ascii_letters+' ')
|
||||
name = ''.join(x for x in unicodedata.normalize('NFKD', name)
|
||||
if x in string.ascii_letters + ' ')
|
||||
|
||||
# get first and last name if applicable
|
||||
m = re.match('(?P<first>\w+)\W+(?P<last>\w+)', name)
|
||||
if m:
|
||||
output = '%s%s%s' % (m.group(last), output_sep, m.group(first)[:firstname_output_letters])
|
||||
output = '%s%s%s' % (m.group(last),
|
||||
output_sep,
|
||||
m.group(first)[:firstname_output_letters])
|
||||
else:
|
||||
output = name
|
||||
|
||||
@@ -87,21 +90,24 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
|
||||
def generalize_names_duplcheck(df, col_name):
|
||||
""" Generalizes names and removes duplicates.
|
||||
|
||||
Description : Applies mlxtend.text.generalize_names to a DataFrame with 1 first name letter
|
||||
by default and uses more first name letters if duplicates are detected.
|
||||
Description : Applies mlxtend.text.generalize_names to a DataFrame
|
||||
with 1 first name letter by default
|
||||
and uses more first name letters if duplicates are detected.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : `pandas.DataFrame`
|
||||
DataFrame that contains a column where generalize_names should be applied.
|
||||
|
||||
DataFrame that contains a column where
|
||||
generalize_names should be applied.
|
||||
col_name : `str`
|
||||
Name of the DataFrame column where `generalize_names` function should be applied to.
|
||||
Name of the DataFrame column where `generalize_names`
|
||||
function should be applied to.
|
||||
|
||||
Returns
|
||||
----------
|
||||
df_new : `pandas.DataFrame`
|
||||
New DataFrame object where generalize_names function has been applied without duplicates.
|
||||
New DataFrame object where generalize_names function has
|
||||
been applied without duplicates.
|
||||
|
||||
"""
|
||||
df_new = df.copy()
|
||||
@@ -110,15 +116,20 @@ def generalize_names_duplcheck(df, col_name):
|
||||
|
||||
df_new[col_name] = df_new[col_name].apply(generalize_names)
|
||||
|
||||
dupl = list(df_new[df_new.duplicated(subset=col_name, take_last=True)].index) + \
|
||||
list(df_new[df_new.duplicated(subset=col_name, take_last=False)].index)
|
||||
dupl = (list(df_new[df_new.duplicated(subset=col_name,
|
||||
take_last=True)].index) +
|
||||
list(df_new[df_new.duplicated(subset=col_name,
|
||||
take_last=False)].index))
|
||||
|
||||
firstname_letters = 2
|
||||
while len(dupl) > 0:
|
||||
for idx in dupl:
|
||||
df_new.loc[idx, col_name] = generalize_names(df.loc[idx, col_name],
|
||||
firstname_output_letters=firstname_letters)
|
||||
dupl = list(df_new[df_new.duplicated(subset=col_name, take_last=True)].index) + \
|
||||
list(df_new[df_new.duplicated(subset=col_name, take_last=False)].index)
|
||||
df_new.loc[idx, col_name] = generalize_names(
|
||||
df.loc[idx, col_name],
|
||||
firstname_output_letters=firstname_letters)
|
||||
dupl = (list(df_new[df_new.duplicated(subset=col_name,
|
||||
take_last=True)].index) +
|
||||
list(df_new[df_new.duplicated(subset=col_name,
|
||||
take_last=False)].index))
|
||||
firstname_letters += 1
|
||||
return df_new
|
||||
|
||||
@@ -6,6 +6,7 @@ if sys.version_info < (3, 0):
|
||||
|
||||
from mlxtend.text import generalize_names
|
||||
|
||||
|
||||
def test_generalize_names():
|
||||
|
||||
assert(generalize_names("Samuel Eto'o") == 'etoo s')
|
||||
@@ -13,18 +14,25 @@ def test_generalize_names():
|
||||
assert(generalize_names("Eto'o, Samuel") == 'etoo s')
|
||||
assert(generalize_names('Xavi') == 'xavi')
|
||||
assert(generalize_names('Yaya Toure') == 'toure y')
|
||||
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
|
||||
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
|
||||
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
|
||||
assert(generalize_names('Jose Angel Pozo') == 'pozo j')
|
||||
assert(generalize_names('Jose Pozo') == 'pozo j')
|
||||
assert(generalize_names('Pozo, Jose Angel', firstname_output_letters=2) == 'pozo jo')
|
||||
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=2) == 'etoo sa')
|
||||
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=0) == 'etoo')
|
||||
assert(generalize_names('Pozo, Jose Angel', firstname_output_letters=2) ==
|
||||
'pozo jo')
|
||||
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=2) ==
|
||||
'etoo sa')
|
||||
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=0) ==
|
||||
'etoo')
|
||||
assert(generalize_names("Eto'o, Samuel", output_sep=', ') == 'etoo, s')
|
||||
assert(generalize_names("Eto'o, Samuel", output_sep=', ') == 'etoo, s')
|
||||
|
||||
assert(generalize_names("van Persie, Robin", output_sep=', ') == 'vanpersie, r')
|
||||
assert(generalize_names("Robin van Persie", output_sep=', ') == 'vanpersie, r')
|
||||
assert(generalize_names("Rafael van der Vaart", output_sep=', ') == 'vandervaart, r')
|
||||
assert(generalize_names("van der Vaart, Rafael", output_sep=', ') == 'vandervaart, r')
|
||||
assert(generalize_names("van Persie, Robin", output_sep=', ') ==
|
||||
'vanpersie, r')
|
||||
assert(generalize_names("Robin van Persie", output_sep=', ') ==
|
||||
'vanpersie, r')
|
||||
assert(generalize_names("Rafael van der Vaart", output_sep=', ') ==
|
||||
'vandervaart, r')
|
||||
assert(generalize_names("van der Vaart, Rafael", output_sep=', ') ==
|
||||
'vandervaart, r')
|
||||
assert(generalize_names("Ben Hamer") == 'hamer b')
|
||||
|
||||
@@ -9,18 +9,17 @@ from mlxtend.text import generalize_names_duplcheck
|
||||
from mlxtend.text import generalize_names
|
||||
from io import StringIO
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
|
||||
def test_generalize_names_duplcheck():
|
||||
|
||||
|
||||
df = pd.read_csv(StringIO(csv))
|
||||
|
||||
# duplicates before
|
||||
dupl = any(df['Name'].apply(generalize_names).duplicated())
|
||||
assert(dupl==True)
|
||||
assert dupl is True
|
||||
|
||||
# no duplicates
|
||||
df_new = generalize_names_duplcheck(df=df, col_name='Name')
|
||||
no_dupl = any(df_new['Name'].duplicated())
|
||||
assert(no_dupl==False)
|
||||
assert no_dupl is False
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from mlxtend.text import tokenizer_words_and_emoticons
|
||||
from mlxtend.text import tokenizer_emoticons
|
||||
|
||||
def test_tokenizer_words_and_emoticons():
|
||||
assert(tokenizer_words_and_emoticons('</a>This :) is :( a test :-)!') == ['this', 'is', 'a', 'test', ':)', ':(', ':-)'])
|
||||
|
||||
def test_tokenizer_words_and_emoticons():
|
||||
assert(tokenizer_emoticons('</a>This :) is :( a test :-)!') == [':)', ':(', ':-)'])
|
||||
def test_tokenizer_words_and_emoticons_1():
|
||||
assert(tokenizer_words_and_emoticons('</a>This :) is :( a test :-)!') ==
|
||||
['this', 'is', 'a', 'test', ':)', ':(', ':-)'])
|
||||
|
||||
|
||||
def test_tokenizer_words_and_emoticons_2():
|
||||
assert(tokenizer_emoticons('</a>This :) is :( a test :-)!') ==
|
||||
[':)', ':(', ':-)'])
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def tokenizer_words_and_emoticons(text):
|
||||
"""Convert text to lowercase words and emoticons.
|
||||
|
||||
|
||||
@@ -10,7 +10,8 @@ import time
|
||||
import sys
|
||||
|
||||
|
||||
class Counter():
|
||||
class Counter(object):
|
||||
|
||||
"""Class to display the progress of for-loop iterators.
|
||||
|
||||
Parameters
|
||||
|
||||
Reference in New Issue
Block a user