Skip to content

Feature generator #126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ in a list::
[ 0. ],
[ 1.22474487]])


Columns that don't need any transformation
******************************************

Expand Down Expand Up @@ -282,6 +283,59 @@ passing it as the ``default`` argument to the mapper:
Using ``default=False`` (the default) drops unselected columns. Using
``default=None`` passes the unselected columns through unchanged.


Same transformer for multiple columns
*************************************

Sometimes it is required to apply the same transformation to several dataframe columns.
To simplify this process, the package provides ``gen_features`` function which accepts a list
of columns and feature transformer class (or list of classes), and generates a feature definition,
acceptable by ``DataFrameMapper``.

For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'.
To encode each of them, one could pass column names and the ``LabelEncoder`` transformer class
into the generator, and then use the returned definition as the ``features`` argument for ``DataFrameMapper``:

>>> from sklearn_pandas import gen_features
>>> feature_def = gen_features(
... columns=['col1', 'col2', 'col3'],
... classes=[sklearn.preprocessing.LabelEncoder]
... )
>>> feature_def
[('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
>>> mapper5 = DataFrameMapper(feature_def)
>>> data5 = pd.DataFrame({
... 'col1': ['yes', 'no', 'yes'],
... 'col2': [True, False, False],
... 'col3': ['one', 'two', 'three']
... })
>>> mapper5.fit_transform(data5)
array([[1, 1, 0],
[0, 0, 2],
[1, 0, 1]])

If it is required to override some of the transformer's parameters, then a dict with a 'class' key and
the transformer parameters should be provided. For example, consider a dataset with missing values.
Then the following code could be used to override the default imputing strategy:

>>> feature_def = gen_features(
... columns=[['col1'], ['col2'], ['col3']],
... classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
... )
>>> mapper6 = DataFrameMapper(feature_def)
>>> data6 = pd.DataFrame({
... 'col1': [None, 1, 1, 2, 3],
... 'col2': [True, False, None, None, True],
... 'col3': [0, 0, 0, None, None]
... })
>>> mapper6.fit_transform(data6)
array([[ 1., 1., 0.],
[ 1., 0., 0.],
[ 1., 1., 0.],
[ 2., 1., 0.],
[ 3., 1., 0.]])


Feature selection and other supervised transformations
******************************************************

Expand Down
1 change: 1 addition & 0 deletions sklearn_pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .dataframe_mapper import DataFrameMapper # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
from .categorical_imputer import CategoricalImputer # NOQA
from .features_generator import gen_features # NOQA
55 changes: 55 additions & 0 deletions sklearn_pandas/features_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
def gen_features(columns, classes=None):
    """Generates a feature definition list which can be passed
    into DataFrameMapper

    Params:

    columns     a list of column names to generate features for.

    classes     a list of classes for each feature, a list of dictionaries with
                transformer class and init parameters, or None.

                If list of classes is provided, then each of them is
                instantiated with default arguments. Example:

                    classes = [StandardScaler, LabelBinarizer]

                If list of dictionaries is provided, then each of them should
                have a 'class' key with transformer class. All other keys are
                passed into 'class' value constructor. Example:

                    classes = [
                        {'class': StandardScaler, 'with_mean': False},
                        {'class': LabelBinarizer}
                    ]

                If None value selected, then each feature left as is.

    """
    if classes is None:
        return [(column, None) for column in columns]

    # Drop None placeholders once, up front. (Previously this filter was
    # re-applied on every loop iteration, rebinding `classes` each time.)
    classes = [cls for cls in classes if cls is not None]
    if not classes:
        # Nothing left to instantiate: every column passes through untouched.
        return [(column, None) for column in columns]

    feature_defs = []
    for column in columns:
        # Build a fresh transformer instance list per column so columns
        # never share transformer state.
        feature_transformers = []
        for definition in classes:
            if isinstance(definition, dict):
                # Copy so the caller's dict is not mutated by pop().
                params = definition.copy()
                klass = params.pop('class')
                feature_transformers.append(klass(**params))
            else:
                feature_transformers.append(definition())

        feature_defs.append((column, feature_transformers))

    return feature_defs
118 changes: 118 additions & 0 deletions tests/test_features_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from collections import Counter

import pytest
import numpy as np
from pandas import DataFrame
from numpy.testing import assert_array_equal

from sklearn_pandas import DataFrameMapper
from sklearn_pandas.features_generator import gen_features


class MockClass(object):
    """Stand-in transformer class used to verify which constructor
    arguments ``gen_features`` passes through."""

    def __init__(self, value=1, name='class'):
        # Store the kwargs verbatim so tests can assert on them later.
        self.value, self.name = value, name


class MockTransformer(object):
    """Minimal transformer stub: memorizes the most frequent value seen
    during ``fit`` and emits it for every row in ``transform``."""

    def __init__(self):
        # Modal value of the training data, learned in fit().
        self.most_common_ = None

    def fit(self, X, y=None):
        counts = Counter(X)
        self.most_common_ = counts.most_common(1)[0][0]
        return self

    def transform(self, X, y=None):
        # One copy of the memorized mode per input row.
        return np.asarray([self.most_common_ for _ in X])


@pytest.fixture
def simple_dataset():
    """Small frame where feat1/feat2 have clear modes (1 and 2) and
    feat3 is simply 1..5."""
    data = {
        'feat1': [1, 2, 1, 3, 1],
        'feat2': [1, 2, 2, 2, 3],
        'feat3': [1, 2, 3, 4, 5],
    }
    return DataFrame(data)


def test_generate_features_with_default_parameters():
    """
    Tests generating features from classes with default init arguments.
    """
    columns = ['colA', 'colB', 'colC']
    feature_defs = gen_features(columns=columns, classes=[MockClass])
    assert len(feature_defs) == len(columns)

    feature_dict = dict(feature_defs)
    assert sorted(feature_dict) == columns

    # MockClass defaults, spelled out here for clarity.
    defaults = {'value': 1, 'name': 'class'}
    for transformers in feature_dict.values():
        for transformer in transformers:
            assert_attributes(transformer, **defaults)


def test_generate_features_with_several_classes():
    """
    Tests generating features pipeline with different transformers parameters.
    """
    specs = [
        {'class': MockClass},
        {'class': MockClass, 'name': 'mockA'},
        {'class': MockClass, 'name': 'mockB', 'value': None},
    ]
    feature_defs = gen_features(columns=['colA', 'colB', 'colC'],
                                classes=specs)

    # Expected (name, value) per transformer position, for every column.
    expected = [('class', 1), ('mockA', 1), ('mockB', None)]
    for transformers in dict(feature_defs).values():
        for transformer, (name, value) in zip(transformers, expected):
            assert_attributes(transformer, name=name, value=value)


def test_generate_features_with_none_only_transformers():
    """
    Tests generating "dummy" feature definition which doesn't apply any
    transformation.
    """
    columns = ['colA', 'colB', 'colC']
    feature_defs = gen_features(columns=columns, classes=[None])

    # A classes list of only None entries must leave every column as-is.
    assert feature_defs == [(column, None) for column in columns]


def test_compatibility_with_data_frame_mapper(simple_dataset):
    """
    Tests compatibility of generated feature definition with DataFrameMapper.
    """
    features_defs = gen_features(columns=['feat1', 'feat2'],
                                 classes=[MockTransformer])
    # Mix a generated definition with a hand-written pass-through one.
    features_defs.append(('feat3', None))

    mapper = DataFrameMapper(features_defs)
    transformed = mapper.fit_transform(simple_dataset)

    # feat1's mode is 1, feat2's mode is 2; feat3 passes through unchanged.
    expected = np.asarray([[1, 2, tail] for tail in [1, 2, 3, 4, 5]])
    assert_array_equal(transformed, expected)


def assert_attributes(obj, **attrs):
    """Assert that *obj* exposes every keyword as an attribute holding
    the given value."""
    for name in attrs:
        assert getattr(obj, name) == attrs[name]