Skip to content

Feature generator #126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 22, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sklearn_pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .dataframe_mapper import DataFrameMapper # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
from .categorical_imputer import CategoricalImputer # NOQA
from .features_generator import gen_features # NOQA
50 changes: 50 additions & 0 deletions sklearn_pandas/features_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
def gen_features(columns, classes=None):
    """Generates a feature definition list which can be passed
    into DataFrameMapper

    Params:

    columns     a list of column names to generate features for.

    classes     a list of classes for each feature, a list of dictionaries
                with transformer class and init parameters, or None.

                If a list of classes is provided, then each of them is
                instantiated with default arguments. Example:

                    classes = [StandardScaler, LabelBinarizer]

                If a list of dictionaries is provided, then each of them
                should have a 'class' key with the transformer class. All
                other keys are passed into the 'class' value constructor.
                Example:

                    classes = [
                        {'class': StandardScaler, 'with_mean': False},
                        {'class': LabelBinarizer}
                    ]

                If None is given, then each feature is left as is. None
                entries inside the list are ignored as well.

    Returns a list of (column, transformers) tuples, where transformers is
    a list of freshly created transformer instances, or None when no
    transformer applies to the column.
    """
    if classes is None:
        return [(column, None) for column in columns]

    feature_defs = []

    for column in columns:
        feature_transformers = []

        for definition in classes:
            if definition is None:
                # An explicit None means "no transformer"; nothing to build.
                continue

            if isinstance(definition, dict):
                # Copy first so popping 'class' does not mutate the caller's
                # dictionary, which is reused for every column.
                params = definition.copy()
                klass = params.pop('class')
                feature_transformers.append(klass(**params))
            else:
                # Call anything else with no arguments. A check against
                # ``type`` would wrongly skip Python 2 old-style classes
                # (isinstance(A, type) is False for them) and factory
                # functions that return a ready transformer.
                feature_transformers.append(definition())

        # DataFrameMapper expects None rather than an empty list for
        # columns that are passed through untouched.
        feature_defs.append((column, feature_transformers or None))

    return feature_defs
117 changes: 117 additions & 0 deletions tests/test_features_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from collections import Counter

import pytest
import numpy as np
from pandas import DataFrame
from numpy.testing import assert_array_equal

from sklearn_pandas import DataFrameMapper
from sklearn_pandas.features_generator import gen_features


class MockClass(object):
    """Simple class that stores its init arguments as attributes.

    Used as the class handed to ``gen_features`` so the tests can inspect
    which arguments each generated instance was built with.
    """

    def __init__(self, value=1, name='class'):
        # Keep both constructor arguments available for later inspection.
        self.name = name
        self.value = value


class MockTransformer(object):
    """Transformer stub that maps every input value to the most frequent
    value observed during ``fit``.
    """

    def __init__(self):
        # Populated by fit(); stays None until then.
        self.most_common_ = None

    def fit(self, X, y=None):
        """Remember the most frequent value of ``X`` and return ``self``."""
        # Unpacking exactly one (value, count) pair; y is accepted but unused.
        ((winner, _count),) = Counter(X).most_common(1)
        self.most_common_ = winner
        return self

    def transform(self, X, y=None):
        """Return an array of ``len(X)`` copies of the remembered value."""
        repeated = [self.most_common_ for _ in range(len(X))]
        return np.asarray(repeated)


@pytest.fixture
def simple_dataset():
    """Provide a five-row DataFrame with three integer feature columns."""
    column_values = {
        'feat1': [1, 2, 1, 3, 1],
        'feat2': [1, 2, 2, 2, 3],
        'feat3': [1, 2, 3, 4, 5],
    }
    return DataFrame(column_values)


def test_generate_features_with_default_parameters():
    """
    Tests generating features from classes with default init arguments.
    """
    columns = ['colA', 'colB', 'colC']
    feature_defs = gen_features(columns=columns, classes=[MockClass])
    assert len(feature_defs) == len(columns)

    feature_dict = dict(feature_defs)
    # Compare the keys explicitly: every requested column must get exactly
    # one definition, regardless of ordering.
    assert sorted(columns) == sorted(feature_dict.keys())

    # default init arguments for MockClass
    expected = {'value': 1, 'name': 'class'}
    for transformers in feature_dict.values():
        for obj in transformers:
            assert_attributes(obj, **expected)


def test_generate_features_with_several_classes():
    """
    Tests generating a features pipeline from transformer definitions that
    carry different init parameters.
    """
    definitions = [
        {'class': MockClass},
        {'class': MockClass, 'name': 'mockA'},
        {'class': MockClass, 'name': 'mockB', 'value': None},
    ]
    feature_defs = gen_features(
        columns=['colA', 'colB', 'colC'],
        classes=definitions,
    )

    # One expectation per definition, in the order they were declared.
    expected_attrs = [
        {'name': 'class', 'value': 1},
        {'name': 'mockA', 'value': 1},
        {'name': 'mockB', 'value': None},
    ]

    for transformers in dict(feature_defs).values():
        for position, attrs in enumerate(expected_attrs):
            assert_attributes(transformers[position], **attrs)


def test_generate_features_with_none_transformers():
    """
    Tests generating a "dummy" feature definition which doesn't apply any
    transformation.
    """
    columns = ['colA', 'colB', 'colC']
    feature_defs = gen_features(columns=columns, classes=[None])

    # Every column must be paired with None, in the order it was given.
    assert feature_defs == [(name, None) for name in columns]


def test_compatibility_with_data_frame_mapper(simple_dataset):
    """
    Tests that a generated feature definition can be consumed by
    DataFrameMapper.
    """
    generated = gen_features(
        columns=['feat1', 'feat2'],
        classes=[MockTransformer])
    # feat3 is added by hand with no transformer attached.
    features_defs = generated + [('feat3', None)]

    mapper = DataFrameMapper(features_defs)
    X = mapper.fit_transform(simple_dataset)

    # First two columns are constant; the third runs from 1 to 5.
    expected = np.asarray([[1, 2, feat3] for feat3 in [1, 2, 3, 4, 5]])

    assert_array_equal(X, expected)


def assert_attributes(obj, **attrs):
    """Assert that ``obj`` exposes every attribute named in ``attrs`` with a
    value equal to the one given.
    """
    for name in attrs:
        assert getattr(obj, name) == attrs[name]