-
Notifications
You must be signed in to change notification settings - Fork 418
Feature generator #126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature generator #126
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
def gen_features(columns, classes=None):
    """Generate a feature definition list which can be passed
    into DataFrameMapper.

    Params:

    columns     a list of column names to generate features for.

    classes     a list of classes for each feature, a list of dictionaries
                with transformer class and init parameters, or None.

                If a list of classes (or any other zero-argument
                callables) is provided, then each of them is instantiated
                with default arguments. Example:

                    classes = [StandardScaler, LabelBinarizer]

                If a list of dictionaries is provided, then each of them
                should have a 'class' key with the transformer class. All
                other keys are passed into the 'class' value constructor.
                Example:

                    classes = [
                        {'class': StandardScaler, 'with_mean': False},
                        {'class': LabelBinarizer}
                    ]

                Both forms may be mixed in one list. Entries that are
                neither dictionaries nor callables (e.g. None) are skipped.

                If None is given, or no entry produces a transformer
                (e.g. ``classes=[None]``), each feature is left as is.

    Returns a list of ``(column, transformers)`` tuples, one per column and
    in the order of ``columns``; ``transformers`` is either a list of freshly
    created transformer objects or None.

    Raises KeyError if a dictionary definition has no 'class' key.
    """
    if classes is None:
        return [(column, None) for column in columns]

    # Materialize once: ``classes`` is walked again for every column, so a
    # one-shot iterator would otherwise be exhausted after the first column.
    definitions = list(classes)

    feature_defs = []

    for column in columns:
        # Transformers are built per column on purpose, so that no instance
        # is shared between two features.
        feature_transformers = []

        for definition in definitions:
            if isinstance(definition, dict):
                # Work on a copy so the caller's dictionary keeps 'class'.
                params = definition.copy()
                klass = params.pop('class')
                feature_transformers.append(klass(**params))
            elif callable(definition):
                # ``callable`` rather than ``isinstance(definition, type)``:
                # the latter rejects Python 2 old-style classes and factory
                # functions, which would then be dropped silently.
                feature_transformers.append(definition())

        feature_defs.append((column, feature_transformers or None))

    return feature_defs
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from collections import Counter | ||
|
||
import pytest | ||
import numpy as np | ||
from pandas import DataFrame | ||
from numpy.testing import assert_array_equal | ||
|
||
from sklearn_pandas import DataFrameMapper | ||
from sklearn_pandas.features_generator import gen_features | ||
|
||
|
||
class MockClass(object):
    """Minimal stand-in for a transformer class.

    It only stores its two init arguments as attributes, so tests can check
    which arguments an instance was created with.
    """

    def __init__(self, value=1, name='class'):
        self.value = value
        self.name = name
|
||
|
||
class MockTransformer(object):
    """Toy transformer exposing ``fit``/``transform``.

    ``fit`` remembers the most frequent value of its input and ``transform``
    replaces every element with that value.
    """

    def __init__(self):
        # Set by fit(); None until then.
        self.most_common_ = None

    def fit(self, X, y=None):
        """Store the most frequent value of ``X`` and return ``self``.

        ``y`` is accepted but not used. An empty ``X`` raises ValueError
        because there is no single most common element to unpack.
        """
        [(value, _)] = Counter(X).most_common(1)
        self.most_common_ = value
        return self

    def transform(self, X, y=None):
        """Return an array of ``len(X)`` copies of the stored value."""
        return np.asarray([self.most_common_] * len(X))
|
||
|
||
@pytest.fixture
def simple_dataset():
    """Five-row DataFrame with three integer columns.

    The most frequent value is 1 in 'feat1' and 2 in 'feat2'; 'feat3' holds
    the distinct values 1..5.
    """
    return DataFrame({
        'feat1': [1, 2, 1, 3, 1],
        'feat2': [1, 2, 2, 2, 3],
        'feat3': [1, 2, 3, 4, 5],
    })
|
||
|
||
# NOTE(review): single-value parametrize; the PR discussion suggests moving
# the column list into the function body instead.
@pytest.mark.parametrize('columns', [['colA', 'colB', 'colC']])
def test_generate_features_with_default_parameters(columns):
    """
    Tests generating features from classes with default init arguments.
    """
    feature_defs = gen_features(columns=columns, classes=[MockClass])
    assert len(feature_defs) == len(columns)

    # Every requested column must appear exactly once in the definitions.
    feature_dict = dict(feature_defs)
    assert columns == sorted(feature_dict)

    # Default init arguments of MockClass: nothing was overridden, so every
    # generated transformer must carry exactly these attribute values.
    expected = {'value': 1, 'name': 'class'}
    for transformers in feature_dict.values():
        for obj in transformers:
            assert_attributes(obj, **expected)
|
||
|
||
def test_generate_features_with_several_classes():
    """
    Tests generating features pipeline with different transformers parameters.
    """
    feature_defs = gen_features(
        columns=['colA', 'colB', 'colC'],
        classes=[
            {'class': MockClass},
            {'class': MockClass, 'name': 'mockA'},
            {'class': MockClass, 'name': 'mockB', 'value': None}
        ]
    )

    # Transformers keep the order of the definitions; keys that were not
    # given fall back to the MockClass defaults (value=1, name='class').
    for transformers in dict(feature_defs).values():
        assert_attributes(transformers[0], name='class', value=1)
        assert_attributes(transformers[1], name='mockA', value=1)
        assert_attributes(transformers[2], name='mockB', value=None)
|
||
|
||
def test_generate_features_with_none_transformers():
    """
    Tests generating "dummy" feature definition which doesn't apply any
    transformation.
    """
    # A list holding only None yields no transformers, so each column is
    # expected to map to None rather than to an empty list.
    feature_defs = gen_features(
        columns=['colA', 'colB', 'colC'], classes=[None])

    expected = [('colA', None),
                ('colB', None),
                ('colC', None)]

    assert feature_defs == expected
|
||
|
||
def test_compatibility_with_data_frame_mapper(simple_dataset):
    """
    Tests compatibility of generated feature definition with DataFrameMapper.
    """
    features_defs = gen_features(
        columns=['feat1', 'feat2'],
        classes=[MockTransformer])
    # A hand-written pass-through definition mixed in with generated ones.
    features_defs.append(('feat3', None))

    mapper = DataFrameMapper(features_defs)
    X = mapper.fit_transform(simple_dataset)
    # MockTransformer replaces a column with its most frequent value
    # (1 for feat1, 2 for feat2); feat3 is left untouched.
    expected = np.asarray([
        [1, 2, 1],
        [1, 2, 2],
        [1, 2, 3],
        [1, 2, 4],
        [1, 2, 5]
    ])

    assert_array_equal(X, expected)
|
||
|
||
def assert_attributes(obj, **attrs):
    """Assert that ``obj`` has every given attribute with the given value.

    Each keyword argument names an attribute and its expected value. Raises
    AssertionError on the first mismatch and AttributeError if ``obj`` lacks
    the attribute.
    """
    for attr, value in attrs.items():
        actual = getattr(obj, attr)
        # Name the attribute in the message so a failure is diagnosable
        # without re-running under a debugger.
        assert actual == value, (
            "attribute %r: expected %r, got %r" % (attr, value, actual))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd write "with default arguments. Example:". Otherwise it is a bit confusing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it.