Skip to content

Commit e7e71c0

Browse files
authored
Merge pull request #126 from devforfu/master
Feature generator
2 parents e9bfb56 + 2646c8b commit e7e71c0

File tree

4 files changed

+228
-0
lines changed

4 files changed

+228
-0
lines changed

README.rst

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ in a list::
240240
[ 0. ],
241241
[ 1.22474487]])
242242

243+
243244
Columns that don't need any transformation
244245
******************************************
245246

@@ -282,6 +283,59 @@ passing it as the ``default`` argument to the mapper:
282283
Using ``default=False`` (the default) drops unselected columns. Using
283284
``default=None`` passes the unselected columns through unchanged.
284285

286+
287+
Same transformer for multiple columns
288+
*****************************************
289+
290+
Sometimes it is required to apply the same transformation to several dataframe columns.
291+
To simplify this process, the package provides the ``gen_features`` function, which accepts a list
292+
of columns and a feature transformer class (or a list of classes), and generates a feature definition,
293+
acceptable by ``DataFrameMapper``.
294+
295+
For example, consider a dataset with three categorical columns: 'col1', 'col2', and 'col3'.
296+
To encode each of them, one could pass the column names and the ``LabelEncoder`` transformer class
297+
into the generator, and then use the returned definition as the ``features`` argument for ``DataFrameMapper``:
298+
299+
>>> from sklearn_pandas import gen_features
300+
>>> feature_def = gen_features(
301+
... columns=['col1', 'col2', 'col3'],
302+
... classes=[sklearn.preprocessing.LabelEncoder]
303+
... )
304+
>>> feature_def
305+
[('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
306+
>>> mapper5 = DataFrameMapper(feature_def)
307+
>>> data5 = pd.DataFrame({
308+
... 'col1': ['yes', 'no', 'yes'],
309+
... 'col2': [True, False, False],
310+
... 'col3': ['one', 'two', 'three']
311+
... })
312+
>>> mapper5.fit_transform(data5)
313+
array([[1, 1, 0],
314+
[0, 0, 2],
315+
[1, 0, 1]])
316+
317+
If some of the transformer parameters need to be overridden, then a dict with a 'class' key and
318+
transformer parameters should be provided. For example, consider a dataset with missing values.
319+
Then the following code could be used to override the default imputing strategy:
320+
321+
>>> feature_def = gen_features(
322+
... columns=[['col1'], ['col2'], ['col3']],
323+
... classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
324+
... )
325+
>>> mapper6 = DataFrameMapper(feature_def)
326+
>>> data6 = pd.DataFrame({
327+
... 'col1': [None, 1, 1, 2, 3],
328+
... 'col2': [True, False, None, None, True],
329+
... 'col3': [0, 0, 0, None, None]
330+
... })
331+
>>> mapper6.fit_transform(data6)
332+
array([[ 1., 1., 0.],
333+
[ 1., 0., 0.],
334+
[ 1., 1., 0.],
335+
[ 2., 1., 0.],
336+
[ 3., 1., 0.]])
337+
338+
285339
Feature selection and other supervised transformations
286340
******************************************************
287341

sklearn_pandas/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
from .dataframe_mapper import DataFrameMapper # NOQA
44
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
55
from .categorical_imputer import CategoricalImputer # NOQA
6+
from .features_generator import gen_features # NOQA

sklearn_pandas/features_generator.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
def gen_features(columns, classes=None):
    """Generates a feature definition list which can be passed
    into DataFrameMapper

    Params:

    columns     a list of column names to generate features for.

    classes     a list of classes for each feature, a list of dictionaries with
                transformer class and init parameters, or None.

                If list of classes is provided, then each of them is
                instantiated with default arguments. Example:

                    classes = [StandardScaler, LabelBinarizer]

                If list of dictionaries is provided, then each of them should
                have a 'class' key with transformer class. All other keys are
                passed into 'class' value constructor. Example:

                    classes = [
                        {'class': StandardScaler, 'with_mean': False},
                        {'class': LabelBinarizer}
                    ]

                None entries inside the list are ignored. If None value
                selected (or the list holds no non-None entries), then each
                feature left as is.

    Returns:

    A list of (column, transformers) tuples, one per entry of ``columns`` and
    in the same order. ``transformers`` is either None or a list of freshly
    created transformer instances; instances are never shared between columns.

    Raises:

    KeyError if a dictionary definition has no 'class' key.

    """
    if classes is None:
        return [(column, None) for column in columns]

    # The set of usable definitions does not depend on the column, so filter
    # it once up front instead of once per column (and without rebinding the
    # ``classes`` argument).
    definitions = [cls for cls in classes if cls is not None]
    if not definitions:
        return [(column, None) for column in columns]

    feature_defs = []

    for column in columns:
        # Instantiate per column so that every column owns its transformers.
        feature_transformers = []

        for definition in definitions:
            if isinstance(definition, dict):
                # Work on a copy: popping 'class' must not mutate the
                # caller's dictionary, which is reused for the next column.
                params = definition.copy()
                klass = params.pop('class')
                feature_transformers.append(klass(**params))
            else:
                feature_transformers.append(definition())

        feature_defs.append((column, feature_transformers))

    return feature_defs

tests/test_features_generator.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
from collections import Counter
2+
3+
import pytest
4+
import numpy as np
5+
from pandas import DataFrame
6+
from numpy.testing import assert_array_equal
7+
8+
from sklearn_pandas import DataFrameMapper
9+
from sklearn_pandas.features_generator import gen_features
10+
11+
12+
class MockClass(object):
    """Minimal stand-in for a transformer class.

    It only records the keyword arguments it was constructed with, so tests
    can check which init parameters the feature generator passed along.
    """

    def __init__(self, value=1, name='class'):
        # Keep both arguments verbatim for later inspection.
        self.name = name
        self.value = value
17+
18+
19+
class MockTransformer(object):
    """Toy transformer that replaces every value with the most common one.

    ``fit`` remembers the most frequent value seen in its input and
    ``transform`` returns an array of that value, one entry per input row.
    """

    def __init__(self):
        # Set by fit(); None until the transformer has been fitted.
        self.most_common_ = None

    def fit(self, X, y=None):
        """Store the most frequent value of X and return self."""
        top_entries = Counter(X).most_common(1)
        # Single-element unpacking: fails loudly when X is empty.
        ((winner, _),) = top_entries
        self.most_common_ = winner
        return self

    def transform(self, X, y=None):
        """Return an array of len(X) copies of the fitted value."""
        repeated = [self.most_common_ for _ in range(len(X))]
        return np.asarray(repeated)
31+
32+
33+
@pytest.fixture
def simple_dataset():
    """Small three-column integer frame used by the mapper compatibility test."""
    column_values = {
        'feat1': [1, 2, 1, 3, 1],
        'feat2': [1, 2, 2, 2, 3],
        'feat3': [1, 2, 3, 4, 5],
    }
    return DataFrame(column_values)
40+
41+
42+
def test_generate_features_with_default_parameters():
    """
    Checks that bare classes are instantiated with their default init
    arguments, once per requested column.
    """
    column_names = ['colA', 'colB', 'colC']
    generated = gen_features(columns=column_names, classes=[MockClass])
    assert len(generated) == len(column_names)

    by_column = dict(generated)
    assert column_names == sorted(by_column)

    # These mirror the defaults declared in MockClass.__init__.
    defaults = dict(value=1, name='class')
    for transformers in by_column.values():
        for transformer in transformers:
            assert_attributes(transformer, **defaults)
58+
59+
60+
def test_generate_features_with_several_classes():
    """
    Checks that dictionary definitions forward their extra keys to the
    transformer constructor, keeping the order of the definitions.
    """
    definitions = [
        {'class': MockClass},
        {'class': MockClass, 'name': 'mockA'},
        {'class': MockClass, 'name': 'mockB', 'value': None},
    ]
    feature_defs = gen_features(
        columns=['colA', 'colB', 'colC'], classes=definitions)

    # Expected attributes of the transformer at each pipeline position.
    expected_by_position = [
        {'name': 'class', 'value': 1},
        {'name': 'mockA', 'value': 1},
        {'name': 'mockB', 'value': None},
    ]

    for transformers in dict(feature_defs).values():
        for position, attrs in enumerate(expected_by_position):
            assert_attributes(transformers[position], **attrs)
77+
78+
79+
def test_generate_features_with_none_only_transformers():
    """
    Checks that a class list holding only None yields a "dummy" definition
    that leaves every column untransformed.
    """
    column_names = ['colA', 'colB', 'colC']
    feature_defs = gen_features(columns=column_names, classes=[None])

    expected = [(name, None) for name in column_names]

    assert feature_defs == expected
92+
93+
94+
def test_compatibility_with_data_frame_mapper(simple_dataset):
    """
    Checks that a generated feature definition can be fed straight into
    DataFrameMapper, also when mixed with a hand-written entry.
    """
    generated = gen_features(
        columns=['feat1', 'feat2'],
        classes=[MockTransformer])
    # 'feat3' gets no transformer, so it should come through unchanged.
    features_defs = generated + [('feat3', None)]

    mapper = DataFrameMapper(features_defs)
    X = mapper.fit_transform(simple_dataset)

    # feat1 -> most common value 1, feat2 -> most common value 2,
    # feat3 -> original values 1..5.
    expected = np.asarray([[1, 2, untouched] for untouched in range(1, 6)])

    assert_array_equal(X, expected)
114+
115+
116+
def assert_attributes(obj, **attrs):
    """Assert that ``obj`` has every named attribute with the given value.

    Each keyword argument is an attribute name mapped to its expected value.
    """
    for attr_name in attrs:
        expected_value = attrs[attr_name]
        actual_value = getattr(obj, attr_name)
        assert actual_value == expected_value

0 commit comments

Comments
 (0)