flake8 checks

This commit is contained in:
rasbt
2016-03-11 20:52:21 -05:00
parent 60cf92c9f9
commit d85cce0645
35 changed files with 213 additions and 147 deletions
+15
View File
@@ -4,9 +4,24 @@ doc-warnings: yes
ignore-paths:
- mlxtend/data
- mlxtend/externals
ignore-patterns:
- ^example/doc_.*\.py$
- (^|/)docs(/|$)
- __init__.py
pylint:
options:
dummy-variables-rgx: _$|.+_$|dummy_.+
disable:
- missing-docstring
- protected-access
- too-few-public-methods
- too-many-arguments
- too-many-instance-attributes
- too-many-locals
- too-many-public-methods
- too-many-return-statements
- too-many-statements
- unpacking-non-sequence
+6 -4
View File
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
class Adaline(_BaseClassifier):
"""ADAptive LInear NEuron classifier.
Parameters
@@ -94,9 +95,10 @@ class Adaline(_BaseClassifier):
self.thres_ = 0.5
if init_weights:
self.w_ = self._init_weights(shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.w_ = self._init_weights(
shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.cost_ = []
@@ -125,7 +127,7 @@ class Adaline(_BaseClassifier):
cost = self._sum_squared_error_cost(y, self._activation(X))
self.cost_.append(cost)
if self.print_progress:
self._print_progress(epoch=i+1, cost=cost)
self._print_progress(epoch=i + 1, cost=cost)
return self
+14 -2
View File
@@ -12,11 +12,17 @@ from time import time
class _BaseClassifier(object):
"""Parent Class Base Classifier"""
"""Parent Class Base Classifier
A base class that is inherited by
classifier child classes.
"""
def __init__(self, print_progress=0):
self.print_progress = print_progress
def fit(self, X, y):
def fit(self, X, y, init_weights=True):
"""Learn weight coefficients from training data.
Parameters
@@ -26,12 +32,18 @@ class _BaseClassifier(object):
n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
init_weights : bool (default: True)
Reinitialize weights
Returns
-------
self : object
"""
if not (init_weights is None or isinstance(init_weights, bool)):
raise AttributeError("init_weights must be True, False, or None")
init_weights
self._check_arrays(X=X, y=y)
return self
def predict(self, X):
+2 -2
View File
@@ -19,6 +19,7 @@ import numpy as np
class EnsembleVoteClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
"""Soft Voting/Majority Rule classifier for scikit-learn estimators.
Parameters
@@ -168,8 +169,7 @@ class EnsembleVoteClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
else: # 'hard' voting
predictions = self._predict(X)
maj = np.apply_along_axis(
lambda x:
maj = np.apply_along_axis(lambda x:
np.argmax(np.bincount(x,
weights=self.weights)),
axis=1,
+7 -5
View File
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
class LogisticRegression(_BaseClassifier):
"""Logistic regression classifier.
Parameters
@@ -51,7 +52,7 @@ class LogisticRegression(_BaseClassifier):
epoch.
"""
def __init__(self, eta=0.01, epochs=50, regularization=None,
def __init__(self, eta=0.01, epochs=50,
l2_lambda=0.0, minibatches=1,
random_seed=None, zero_init_weight=False,
print_progress=0):
@@ -88,9 +89,10 @@ class LogisticRegression(_BaseClassifier):
raise ValueError('Supports only binary class labels 0 and 1')
if init_weights:
self.w_ = self._init_weights(shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.w_ = self._init_weights(
shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.m_ = len(self.w_)
self.cost_ = []
@@ -117,7 +119,7 @@ class LogisticRegression(_BaseClassifier):
cost = self._logit_cost(y, self._activation(X))
self.cost_.append(cost)
if self.print_progress:
self._print_progress(epoch=i+1, cost=cost)
self._print_progress(epoch=i + 1, cost=cost)
return self
def _predict(self, X):
+10 -10
View File
@@ -9,11 +9,11 @@
import numpy as np
from .base import _BaseClassifier
from scipy.special import expit
import sys
from time import time
class NeuralNetMLP(_BaseClassifier):
""" Feedforward neural network / Multi-layer perceptron classifier.
Parameters
@@ -122,11 +122,11 @@ class NeuralNetMLP(_BaseClassifier):
def _initialize_weights(self):
"""Initialize weights with small random numbers."""
w1 = self._init_weights(shape=self.n_hidden*(self.n_features + 1),
w1 = self._init_weights(shape=self.n_hidden * (self.n_features + 1),
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
w1 = w1.reshape(self.n_hidden, self.n_features + 1)
w2 = self._init_weights(shape=self.n_output*(self.n_hidden + 1),
w2 = self._init_weights(shape=self.n_output * (self.n_hidden + 1),
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
w2 = w2.reshape(self.n_output, self.n_hidden + 1)
@@ -148,10 +148,10 @@ class NeuralNetMLP(_BaseClassifier):
def _add_bias_unit(self, X, how='column'):
"""Add bias unit (column or row of 1s) to array at index 0."""
if how == 'column':
X_new = np.ones((X.shape[0], X.shape[1]+1))
X_new = np.ones((X.shape[0], X.shape[1] + 1))
X_new[:, 1:] = X
elif how == 'row':
X_new = np.ones((X.shape[0]+1, X.shape[1]))
X_new = np.ones((X.shape[0] + 1, X.shape[1]))
X_new[1:, :] = X
else:
raise AttributeError('how must be columns or row')
@@ -193,12 +193,12 @@ class NeuralNetMLP(_BaseClassifier):
def _L2_reg(self, lambda_, w1, w2):
"""Compute L2-regularization cost."""
return ((lambda_/2.0) * (np.sum(w1[:, 1:] ** 2) +
return ((lambda_ / 2.0) * (np.sum(w1[:, 1:] ** 2) +
np.sum(w2[:, 1:] ** 2)))
def _L1_reg(self, lambda_, w1, w2):
"""Compute L1-regularization cost."""
return ((lambda_/2.0) * (np.abs(w1[:, 1:]).sum() +
return ((lambda_ / 2.0) * (np.abs(w1[:, 1:]).sum() +
np.abs(w2[:, 1:]).sum()))
def _get_cost(self, y_enc, output, w1, w2):
@@ -306,7 +306,7 @@ class NeuralNetMLP(_BaseClassifier):
for i in range(self.epochs):
# adaptive learning rate
self.eta /= (1 + self.decrease_const*i)
self.eta /= (1 + self.decrease_const * i)
if self.shuffle_epoch:
idx = np.random.permutation(y_enc.shape[1])
@@ -342,7 +342,7 @@ class NeuralNetMLP(_BaseClassifier):
delta_w1_prev, delta_w2_prev = delta_w1, delta_w2
if self.print_progress:
self._print_progress(epoch=i+1)
self._print_progress(epoch=i + 1)
return self
@@ -364,7 +364,7 @@ class NeuralNetMLP(_BaseClassifier):
a1, z2, a2, z3, a3 = self._feedforward(X,
w1 - epsilon_ary1,
w2)
cost1 = self._get_cost(y_enc, a3, w1-epsilon_ary1, w2)
cost1 = self._get_cost(y_enc, a3, w1 - epsilon_ary1, w2)
a1, z2, a2, z3, a3 = self._feedforward(X,
w1 + epsilon_ary1,
w2)
+6 -4
View File
@@ -12,6 +12,7 @@ from .base import _BaseClassifier
class Perceptron(_BaseClassifier):
"""Perceptron classifier.
Parameters
@@ -82,9 +83,10 @@ class Perceptron(_BaseClassifier):
' class labels {0, 1} or {-1, 1}.')
if init_weights:
self.w_ = self._init_weights(shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.w_ = self._init_weights(
shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.cost_ = []
@@ -105,7 +107,7 @@ class Perceptron(_BaseClassifier):
errors += int(update != 0.0)
if self.print_progress:
self._print_progress(epoch=i+1, cost=errors)
self._print_progress(epoch=i + 1, cost=errors)
self.cost_.append(errors)
return self
+10 -13
View File
@@ -14,6 +14,7 @@ from .base import _BaseClassifier
class SoftmaxRegression(_BaseClassifier):
"""Softmax regression classifier.
Parameters
@@ -72,11 +73,6 @@ class SoftmaxRegression(_BaseClassifier):
mat[i, val] = 1
return mat.astype(float)
def _init_bias(self, n_features, n_classes):
w = np.zeros((n_features, n_classes))
b = np.zeros(n_classes)
return w, b
def _net_input(self, X, W, b):
return (X.dot(W) + b)
@@ -113,13 +109,14 @@ class SoftmaxRegression(_BaseClassifier):
if init_weights:
self._n_classes = np.max(y) + 1
self._n_features = X.shape[1]
self.w_ = self._init_weights(shape=(self._n_features,
self._n_classes),
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.b_ = self._init_weights(shape=self._n_classes,
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.w_ = self._init_weights(
shape=(self._n_features, self._n_classes),
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.b_ = self._init_weights(
shape=self._n_classes,
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.cost_ = []
n_idx = list(range(y.shape[0]))
@@ -162,7 +159,7 @@ class SoftmaxRegression(_BaseClassifier):
self.cost_.append(cost)
if self.print_progress:
self._print_progress(epoch=i+1, cost=cost)
self._print_progress(epoch=i + 1, cost=cost)
return self
@@ -18,6 +18,7 @@ import numpy as np
class StackingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
"""A Stacking classifier for scikit-learn estimators for classification.
Parameters
+6 -6
View File
@@ -31,7 +31,7 @@ def test_array_dimensions():
def test_normal_equation():
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
ada = Adaline(epochs=30,
eta=0.01,
minibatches=None,
@@ -42,7 +42,7 @@ def test_normal_equation():
def test_gradient_descent():
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
ada = Adaline(epochs=30,
eta=0.01,
minibatches=1,
@@ -53,7 +53,7 @@ def test_gradient_descent():
def test_refit_weights():
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
ada = Adaline(epochs=15,
eta=0.01,
minibatches=1,
@@ -65,7 +65,7 @@ def test_refit_weights():
def test_standardized_iris_data_with_zero_weights():
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
ada = Adaline(epochs=30,
eta=0.01,
minibatches=1,
@@ -77,7 +77,7 @@ def test_standardized_iris_data_with_zero_weights():
def test_stochastic_gradient_descent():
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
t1 = np.array([-5.21e-16, -7.86e-02, 1.02e+00])
ada = Adaline(epochs=30,
eta=0.01,
minibatches=len(y),
@@ -98,7 +98,7 @@ def test_ary_persistency_in_shuffling():
def test_0_1_class():
t1 = np.array([0.51, -0.04, 0.51])
t1 = np.array([0.51, -0.04, 0.51])
ada = Adaline(epochs=30,
eta=0.01,
minibatches=1,
@@ -9,8 +9,6 @@ from mlxtend.data import iris_data
import numpy as np
#### Binary
X, y = iris_data()
X = X[:, [0, 3]] # sepal length and petal width
X_bin = X[0:100] # class 0 and class 1
@@ -77,8 +75,6 @@ def test_multi_logistic_regression_gd_weights():
def test_multi_logistic_regression_gd_acc():
t = np.array([[-0.17, -2.86, 3.51],
[-4.85, 2.0, 0.35]])
lr = SoftmaxRegression(epochs=200,
eta=0.005,
minibatches=1,
+3 -3
View File
@@ -62,7 +62,7 @@ def plot_decision_regions(X, y, clf,
if not y.dtype == int:
y = y.astype(int)
# check if test data is provided
plot_testdata = True
if not isinstance(X_highlight, np.ndarray):
@@ -146,7 +146,7 @@ def plot_decision_regions(X, y, clf,
X_highlight[:, 1],
c='',
alpha=1.0,
linewidth=1,
linewidths=1,
marker='o',
s=80)
else:
@@ -154,7 +154,7 @@ def plot_decision_regions(X, y, clf,
[0 for i in X_highlight],
c='',
alpha=1.0,
linewidth=1,
linewidths=1,
marker='o',
s=80)
+1 -1
View File
@@ -120,7 +120,7 @@ def plot_learning_curves(X_train, y_train,
plt.ylabel('Performance ({})'.format(scoring))
if print_model:
plt.title('Learning Curves\n\n{}\n'.format(model))
plt.legend(loc='best', numpoints=1)
plt.legend(loc=legend_loc, numpoints=1)
plt.xlim([0, 110])
max_y = max(max(test_errors), max(training_errors))
min_y = min(min(test_errors), min(training_errors))
+3 -3
View File
@@ -145,9 +145,9 @@ def scoring(y_target, y_predicted, metric='error',
elif metric == 'f1':
pre = float(tp) / (tp + fp)
rec = float(tp) / (fn + tp)
res = 2.0 * (pre * rec)/(pre + rec)
res = 2.0 * (pre * rec) / (pre + rec)
elif metric == 'matthews_corr_coef':
res = float(tp*tn - fp*fn)
res = res / np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
res = float(tp * tn - fp * fn)
res = res / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
return res
+1
View File
@@ -7,6 +7,7 @@
from mlxtend.evaluate import scoring
import numpy as np
def test_metric_argument():
"Test exception is raised when user provides invalid metric argument"
try:
+4 -2
View File
@@ -8,8 +8,10 @@
class ColumnSelector(object):
""" A feature selector for scikit-learn's Pipeline class that returns
specified columns from a numpy array.
"""Select specific columns from a data set.
A feature selector for scikit-learn's Pipeline class that returns
specified columns from a numpy array.
"""
def __init__(self, cols):
@@ -20,10 +20,10 @@ from sklearn.base import BaseEstimator
from sklearn.base import MetaEstimatorMixin
from sklearn.cross_validation import cross_val_score
from ..externals.name_estimators import _name_estimators
from ..externals import six
class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
"""Sequential Feature Selection for Classification and Regression.
Parameters
@@ -133,29 +133,29 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
prev_subset = set(k_idx)
if self.forward:
k_idx, k_score, cv_scores = \
self._inclusion(orig_set=orig_set,
subset=prev_subset,
X=X, y=y)
self._inclusion(orig_set=orig_set,
subset=prev_subset,
X=X, y=y)
else:
k_idx, k_score, cv_scores = \
self._exclusion(feature_set=prev_subset, X=X, y=y)
self._exclusion(feature_set=prev_subset, X=X, y=y)
if self.floating and not self._is_stuck(sdq):
(new_feature,) = set(k_idx) ^ prev_subset
if self.forward:
k_idx_c, k_score_c, cv_scores_c = \
self._exclusion(feature_set=k_idx,
fixed_feature=new_feature,
X=X, y=y)
self._exclusion(feature_set=k_idx,
fixed_feature=new_feature,
X=X, y=y)
else:
k_idx_c, k_score_c, cv_scores_c = \
self._inclusion(orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X, y=y)
self._inclusion(orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X, y=y)
if k_score_c and k_score_c > k_score:
k_idx, k_score, cv_scores = \
k_idx_c, k_score_c, cv_scores_c
k_idx_c, k_score_c, cv_scores_c
k = len(k_idx)
# floating can lead to multiple same-sized subsets
@@ -168,7 +168,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
if self.print_progress:
sys.stderr.write('\rFeatures: %d/%d' % (
len(k_idx), self.k_features))
len(k_idx), self.k_features))
sys.stderr.flush()
self.k_feature_idx_ = k_idx
@@ -221,7 +221,7 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
for p in combinations(feature_set, r=n-1):
for p in combinations(feature_set, r=n - 1):
if fixed_feature and fixed_feature not in set(p):
continue
cv_scores = self._calc_score(X, y, p)
+4 -6
View File
@@ -14,7 +14,7 @@ from itertools import cycle
def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
alpha=0.5, lw=2, legend=True, where='post', grid=True,
alpha=0.5, lw=2, where='post', grid=True,
count_label='Count',
xlim='auto', ylim='auto', invert_axes=False,
legend_loc='best', ax=None):
@@ -37,8 +37,6 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
Transparency level from 0.0 to 1.0.
lw : int or float (default: 2)
Linewidth parameter.
legend : bool (default: True)
Plots legend if True.
where : {'post', 'pre', 'mid'} (default: 'post')
Starting location of the steps.
grid : bool (default: `True`)
@@ -74,7 +72,7 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
color_gen = cycle(colors)
marker_gen = cycle(markers)
linestyle_gen = cycle(linestyles.split(','))
r = range(1, len(df_temp.index)+1)
r = range(1, len(df_temp.index) + 1)
labels = df_temp.columns
x_data = df_temp
@@ -99,13 +97,13 @@ def enrichment_plot(df, colors='bgrkcy', markers=' ', linestyles='-',
ax.set_ylim, ax.set_xlim = ax.set_xlim, ax.set_ylim
if ylim == 'auto':
ax.set_ylim([np.min(y_data)-1, np.max(y_data)+1])
ax.set_ylim([np.min(y_data) - 1, np.max(y_data) + 1])
else:
ax.set_ylim(ylim)
if xlim == 'auto':
df_min, df_max = np.min(x_data.min()), np.max(x_data.max())
ax.set_xlim([df_min-1, df_max+1])
ax.set_xlim([df_min - 1, df_max + 1])
else:
ax.set_xlim(xlim)
@@ -8,6 +8,7 @@
class DenseTransformer(object):
"""Convert a sparse matrix into a dense matrix."""
def __init__(self, some_param=True):
+1
View File
@@ -11,6 +11,7 @@ from .transformer import TransformerObj
class MeanCenterer(TransformerObj):
"""Column centering of vectors and matrices.
Attributes
+9 -3
View File
@@ -22,7 +22,7 @@ def one_hot(y, num_labels='auto', dtype='float'):
Returns
----------
onehot : numpy.ndarray, shape = [n_classlabels]
ary : numpy.ndarray, shape = [n_classlabels]
One-hot encoded array, where each sample is represented as
a row vector in the returned array.
@@ -41,5 +41,11 @@ def one_hot(y, num_labels='auto', dtype='float'):
else:
uniq = num_labels
if uniq == 1:
return np.array([[0.]], dtype=dtype)
return (np.arange(uniq) == yt[:, None]).astype(dtype)
ary = np.array([[0.]], dtype=dtype)
else:
ary = np.zeros((len(y), uniq))
for i, val in enumerate(y):
ary[i, val] = 1
return ary.astype(dtype)
+2 -2
View File
@@ -48,8 +48,8 @@ def minmax_scaling(array, columns, min_val=0, max_val=1):
ary_newt[:, columns] = numerator / denominator
if not min_val == 0 and not max_val == 1:
ary_newt[:, columns] = (ary_newt[:, columns] *
(max_val - min_val) + min_val)
ary_newt[:, columns] = (ary_newt[:, columns] *
(max_val - min_val) + min_val)
return ary_newt[:, columns]
+3 -3
View File
@@ -46,13 +46,13 @@ def test_list():
@raises(AttributeError)
def test_multidim_list():
y = [[0, 1, 2, 3, 4, 2]]
out = one_hot(y)
one_hot(y)
@raises(AttributeError)
def test_multidim_array():
y = np.array([[0], [1], [2], [3], [4], [2]])
out = one_hot(y)
one_hot(y)
def test_oneclass():
@@ -60,7 +60,7 @@ def test_oneclass():
np.array([[0.]], dtype='float'))
def test_list():
def test_list_morelabels():
y = [0, 1]
expect = np.array([[1., 0., 0.],
[0., 1., 0.]], dtype='float')
@@ -13,7 +13,8 @@ import matplotlib.pyplot as plt
import numpy as np
def plot_linear_regression(X, y, model=LinearRegression(), corr_func='pearsonr',
def plot_linear_regression(X, y, model=LinearRegression(),
corr_func='pearsonr',
scattercolor='blue', fit_style='k--', legend=True,
xlim='auto'):
"""Plot a linear regression line fit.
+16 -4
View File
@@ -12,11 +12,17 @@ from time import time
class _BaseRegressor(object):
"""Parent Class Base Regressor"""
"""Parent Class Base Regressor
A base class that is inherited by
regressor child classes.
"""
def __init__(self, print_progress=0):
self.print_progress = print_progress
def fit(self, X, y):
def fit(self, X, y, init_weights=True):
"""Learn weight coefficients from training data.
Parameters
@@ -26,12 +32,18 @@ class _BaseRegressor(object):
n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
init_weights : bool (default: True)
Reinitialize weights
Returns
-------
self : object
"""
if not (init_weights is None or isinstance(init_weights, bool)):
raise AttributeError("init_weights must be True, False, or None")
init_weights
self._check_arrays(X=X, y=y)
return self
def predict(self, X):
@@ -58,8 +70,8 @@ class _BaseRegressor(object):
def _shuffle(self, arrays):
"""Shuffle arrays in unison."""
r = np.random.permutation(len(y))
return [ary[r] for r in arrays]
r = np.random.permutation(len(arrays[0]))
return [ary[r] for ary in arrays]
def _print_progress(self, epoch, cost=None, time_interval=10):
if self.print_progress > 0:
+6 -12
View File
@@ -7,7 +7,6 @@
# License: BSD 3 clause
import numpy as np
from sys import stderr
from time import time
from .base import _BaseRegressor
@@ -19,10 +18,9 @@ from .base import _BaseRegressor
#
# License: BSD 3 clause
import numpy as np
class LinearRegression(_BaseRegressor):
""" Ordinary least squares linear regression.
Parameters
@@ -93,9 +91,10 @@ class LinearRegression(_BaseRegressor):
# initialize weights
if init_weights:
self.w_ = self._init_weights(shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.w_ = self._init_weights(
shape=1 + X.shape[1],
zero_init_weight=self.zero_init_weight,
seed=self.random_seed)
self.cost_ = []
@@ -124,7 +123,7 @@ class LinearRegression(_BaseRegressor):
cost = self._sum_squared_error_cost(y, self.activation(X))
self.cost_.append(cost)
if self.print_progress:
self._print_progress(epoch=i+1, cost=cost)
self._print_progress(epoch=i + 1, cost=cost)
return self
@@ -135,11 +134,6 @@ class LinearRegression(_BaseRegressor):
w = np.dot(z, np.dot(Xb.T, y))
return w
def _shuffle(self, X, y):
"""Unison shuffling."""
r = np.random.permutation(len(y))
return X[r], y[r]
def net_input(self, X):
"""Compute the linear net input."""
return np.dot(X, self.w_[1:]) + self.w_[0]
+1
View File
@@ -18,6 +18,7 @@ import numpy as np
class StackingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
"""A Stacking regressor for scikit-learn estimators for regression.
Parameters
@@ -4,12 +4,10 @@
#
# License: BSD 3 clause
from mlxtend.data import boston_housing_data
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from mlxtend.data import boston_housing_data
import numpy as np
from sklearn.grid_search import GridSearchCV
from numpy.testing import assert_almost_equal
@@ -31,7 +29,7 @@ def test_different_models():
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
meta_regressor=svr_rbf)
y_pred = stregr.fit(X1, y).predict(X1)
stregr.fit(X1, y).predict(X1)
mse = 0.214
got = np.mean((stregr.predict(X1) - y) ** 2)
assert round(got, 3) == mse
@@ -44,7 +42,7 @@ def test_multivariate():
svr_rbf = SVR(kernel='rbf')
stregr = StackingRegressor(regressors=[svr_lin, lr, ridge],
meta_regressor=svr_rbf)
y_pred = stregr.fit(X2, y).predict(X2)
stregr.fit(X2, y).predict(X2)
mse = 0.218
got = np.mean((stregr.predict(X2) - y) ** 2)
print(got)
@@ -129,4 +127,5 @@ def test_get_coeff_fail():
stregr = StackingRegressor(regressors=[ridge, lr],
meta_regressor=svr_rbf)
stregr = stregr.fit(X1, y)
got = stregr.coef_
r = stregr.coef_
assert r
+1 -1
View File
@@ -14,4 +14,4 @@ from .tokenizer import tokenizer_words_and_emoticons
from .tokenizer import tokenizer_emoticons
__all__ = ["generalize_names", "generalize_names_duplcheck",
"tokenizer_words_and_emoticons", "tokenizer_emoticons"]
"tokenizer_words_and_emoticons", "tokenizer_emoticons"]
+30 -19
View File
@@ -17,6 +17,7 @@ if sys.version_info <= (3, 0):
" with Python 2.x,"
" due to its unicode intricacies")
def generalize_names(name, output_sep=' ', firstname_output_letters=1):
"""Generalize a person's first and last name.
@@ -48,15 +49,14 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
exc = ['van der ', 'de ', 'van ', 'von ', 'di ']
for e in exc:
if name.startswith(e):
repl = e.replace(' ','')
name = (repl + name[len(e)-1:].strip())
repl = e.replace(' ', '')
name = (repl + name[len(e) - 1:].strip())
exc = [' van der ', ' de ', ' van ', ' von ', ' di ',
', van der ', ', de', ', van ', ', von ', ', di ']
', van der ', ', de', ', van ', ', von ', ', di ']
for e in exc:
name = name.replace(e, ' '+e.replace(' ', ''))
name = name.replace(e, ' ' + e.replace(' ', ''))
if ',' in name:
last, first = first, last
@@ -71,12 +71,15 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
if sys.version_info.major == 2:
name = name.decode('utf-8')
name = ''.join(x for x in unicodedata.normalize('NFKD', name) if x in string.ascii_letters+' ')
name = ''.join(x for x in unicodedata.normalize('NFKD', name)
if x in string.ascii_letters + ' ')
# get first and last name if applicable
m = re.match('(?P<first>\w+)\W+(?P<last>\w+)', name)
if m:
output = '%s%s%s' % (m.group(last), output_sep, m.group(first)[:firstname_output_letters])
output = '%s%s%s' % (m.group(last),
output_sep,
m.group(first)[:firstname_output_letters])
else:
output = name
@@ -87,21 +90,24 @@ def generalize_names(name, output_sep=' ', firstname_output_letters=1):
def generalize_names_duplcheck(df, col_name):
""" Generalizes names and removes duplicates.
Description : Applies mlxtend.text.generalize_names to a DataFrame with 1 first name letter
by default and uses more first name letters if duplicates are detected.
Description : Applies mlxtend.text.generalize_names to a DataFrame
with 1 first name letter by default
and uses more first name letters if duplicates are detected.
Parameters
----------
df : `pandas.DataFrame`
DataFrame that contains a column where generalize_names should be applied.
DataFrame that contains a column where
generalize_names should be applied.
col_name : `str`
Name of the DataFrame column where `generalize_names` function should be applied to.
Name of the DataFrame column to which the `generalize_names`
function should be applied.
Returns
----------
df_new : `pandas.DataFrame`
New DataFrame object where generalize_names function has been applied without duplicates.
New DataFrame object where generalize_names function has
been applied without duplicates.
"""
df_new = df.copy()
@@ -110,15 +116,20 @@ def generalize_names_duplcheck(df, col_name):
df_new[col_name] = df_new[col_name].apply(generalize_names)
dupl = list(df_new[df_new.duplicated(subset=col_name, take_last=True)].index) + \
list(df_new[df_new.duplicated(subset=col_name, take_last=False)].index)
dupl = (list(df_new[df_new.duplicated(subset=col_name,
take_last=True)].index) +
list(df_new[df_new.duplicated(subset=col_name,
take_last=False)].index))
firstname_letters = 2
while len(dupl) > 0:
for idx in dupl:
df_new.loc[idx, col_name] = generalize_names(df.loc[idx, col_name],
firstname_output_letters=firstname_letters)
dupl = list(df_new[df_new.duplicated(subset=col_name, take_last=True)].index) + \
list(df_new[df_new.duplicated(subset=col_name, take_last=False)].index)
df_new.loc[idx, col_name] = generalize_names(
df.loc[idx, col_name],
firstname_output_letters=firstname_letters)
dupl = (list(df_new[df_new.duplicated(subset=col_name,
take_last=True)].index) +
list(df_new[df_new.duplicated(subset=col_name,
take_last=False)].index))
firstname_letters += 1
return df_new
+16 -8
View File
@@ -6,6 +6,7 @@ if sys.version_info < (3, 0):
from mlxtend.text import generalize_names
def test_generalize_names():
assert(generalize_names("Samuel Eto'o") == 'etoo s')
@@ -13,18 +14,25 @@ def test_generalize_names():
assert(generalize_names("Eto'o, Samuel") == 'etoo s')
assert(generalize_names('Xavi') == 'xavi')
assert(generalize_names('Yaya Toure') == 'toure y')
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
assert(generalize_names('Pozo, Jose Angel') == 'pozo j')
assert(generalize_names('Jose Angel Pozo') == 'pozo j')
assert(generalize_names('Jose Pozo') == 'pozo j')
assert(generalize_names('Pozo, Jose Angel', firstname_output_letters=2) == 'pozo jo')
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=2) == 'etoo sa')
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=0) == 'etoo')
assert(generalize_names('Pozo, Jose Angel', firstname_output_letters=2) ==
'pozo jo')
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=2) ==
'etoo sa')
assert(generalize_names("Eto'o, Samuel", firstname_output_letters=0) ==
'etoo')
assert(generalize_names("Eto'o, Samuel", output_sep=', ') == 'etoo, s')
assert(generalize_names("Eto'o, Samuel", output_sep=', ') == 'etoo, s')
assert(generalize_names("van Persie, Robin", output_sep=', ') == 'vanpersie, r')
assert(generalize_names("Robin van Persie", output_sep=', ') == 'vanpersie, r')
assert(generalize_names("Rafael van der Vaart", output_sep=', ') == 'vandervaart, r')
assert(generalize_names("van der Vaart, Rafael", output_sep=', ') == 'vandervaart, r')
assert(generalize_names("van Persie, Robin", output_sep=', ') ==
'vanpersie, r')
assert(generalize_names("Robin van Persie", output_sep=', ') ==
'vanpersie, r')
assert(generalize_names("Rafael van der Vaart", output_sep=', ') ==
'vandervaart, r')
assert(generalize_names("van der Vaart, Rafael", output_sep=', ') ==
'vandervaart, r')
assert(generalize_names("Ben Hamer") == 'hamer b')
@@ -9,18 +9,17 @@ from mlxtend.text import generalize_names_duplcheck
from mlxtend.text import generalize_names
from io import StringIO
import pandas as pd
import os
def test_generalize_names_duplcheck():
df = pd.read_csv(StringIO(csv))
# duplicates before
dupl = any(df['Name'].apply(generalize_names).duplicated())
assert(dupl==True)
assert dupl is True
# no duplicates
df_new = generalize_names_duplcheck(df=df, col_name='Name')
no_dupl = any(df_new['Name'].duplicated())
assert(no_dupl==False)
assert no_dupl is False
+8 -4
View File
@@ -1,8 +1,12 @@
from mlxtend.text import tokenizer_words_and_emoticons
from mlxtend.text import tokenizer_emoticons
def test_tokenizer_words_and_emoticons():
assert(tokenizer_words_and_emoticons('</a>This :) is :( a test :-)!') == ['this', 'is', 'a', 'test', ':)', ':(', ':-)'])
def test_tokenizer_words_and_emoticons():
assert(tokenizer_emoticons('</a>This :) is :( a test :-)!') == [':)', ':(', ':-)'])
def test_tokenizer_words_and_emoticons_1():
assert(tokenizer_words_and_emoticons('</a>This :) is :( a test :-)!') ==
['this', 'is', 'a', 'test', ':)', ':(', ':-)'])
def test_tokenizer_words_and_emoticons_2():
assert(tokenizer_emoticons('</a>This :) is :( a test :-)!') ==
[':)', ':(', ':-)'])
+1 -1
View File
@@ -6,9 +6,9 @@
#
# License: BSD 3 clause
import re
def tokenizer_words_and_emoticons(text):
"""Convert text to lowercase words and emoticons.
+2 -1
View File
@@ -10,7 +10,8 @@ import time
import sys
class Counter():
class Counter(object):
"""Class to display the progress of for-loop iterators.
Parameters