Skip to content

Commit 2646c8b

Browse files
committed
Updating README.rst and fixing features generator bug
1 parent 279545f commit 2646c8b

File tree

3 files changed

+78
-18
lines changed

3 files changed

+78
-18
lines changed

README.rst

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ in a list::
240240
[ 0. ],
241241
[ 1.22474487]])
242242

243+
243244
Columns that don't need any transformation
244245
******************************************
245246

@@ -282,6 +283,59 @@ passing it as the ``default`` argument to the mapper:
282283
Using ``default=False`` (the default) drops unselected columns. Using
283284
``default=None`` passes the unselected columns through unchanged.
284285

286+
287+
Same transformer for multiple columns
288+
*****************************************
289+
290+
Sometimes it is required to apply the same transformation to several dataframe columns.
291+
To simplify this process, the package provides the ``gen_features`` function, which accepts a list
292+
of columns and a feature transformer class (or list of classes), and generates a feature definition
293+
accepted by ``DataFrameMapper``.
294+
295+
For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'.
296+
To encode each of them, one could pass the column names and the ``LabelEncoder`` transformer class
297+
into the generator, and then use the returned definition as the ``features`` argument for ``DataFrameMapper``:
298+
299+
>>> from sklearn_pandas import gen_features
300+
>>> feature_def = gen_features(
301+
... columns=['col1', 'col2', 'col3'],
302+
... classes=[sklearn.preprocessing.LabelEncoder]
303+
... )
304+
>>> feature_def
305+
[('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
306+
>>> mapper5 = DataFrameMapper(feature_def)
307+
>>> data5 = pd.DataFrame({
308+
... 'col1': ['yes', 'no', 'yes'],
309+
... 'col2': [True, False, False],
310+
... 'col3': ['one', 'two', 'three']
311+
... })
312+
>>> mapper5.fit_transform(data5)
313+
array([[1, 1, 0],
314+
[0, 0, 2],
315+
[1, 0, 1]])
316+
317+
If some of the transformer parameters need to be overridden, then a dict with a 'class' key and
318+
transformer parameters should be provided. For example, consider a dataset with missing values.
319+
Then the following code could be used to override the default imputation strategy:
320+
321+
>>> feature_def = gen_features(
322+
... columns=[['col1'], ['col2'], ['col3']],
323+
... classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
324+
... )
325+
>>> mapper6 = DataFrameMapper(feature_def)
326+
>>> data6 = pd.DataFrame({
327+
... 'col1': [None, 1, 1, 2, 3],
328+
... 'col2': [True, False, None, None, True],
329+
... 'col3': [0, 0, 0, None, None]
330+
... })
331+
>>> mapper6.fit_transform(data6)
332+
array([[ 1., 1., 0.],
333+
[ 1., 0., 0.],
334+
[ 1., 1., 0.],
335+
[ 2., 1., 0.],
336+
[ 3., 1., 0.]])
337+
338+
285339
Feature selection and other supervised transformations
286340
******************************************************
287341

sklearn_pandas/features_generator.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@ def gen_features(columns, classes=None):
66
77
columns a list of column names to generate features for.
88
9-
classes a list of classes for each feature, a list dictionaries with
9+
classes a list of classes for each feature, a list of dictionaries with
1010
transformer class and init parameters, or None.
1111
1212
If list of classes is provided, then each of them is
13-
instantiated with default arguments:
13+
instantiated with default arguments. Example:
1414
1515
classes = [StandardScaler, LabelBinarizer]
1616
1717
If list of dictionaries is provided, then each of them should
1818
have a 'class' key with transformer class. All other keys are
19-
passed into 'class' value constructor:
19+
passed into 'class' value constructor. Example:
2020
2121
classes = [
2222
{'class': StandardScaler, 'with_mean': False},
@@ -34,17 +34,22 @@ def gen_features(columns, classes=None):
3434
for column in columns:
3535
feature_transformers = []
3636

37-
for definition in classes:
38-
if isinstance(definition, dict):
39-
params = definition.copy()
40-
klass = params.pop('class')
41-
feature_transformers.append(klass(**params))
42-
elif isinstance(definition, type):
43-
feature_transformers.append(definition())
37+
classes = [cls for cls in classes if cls is not None]
38+
if not classes:
39+
feature_defs.append((column, None))
4440

45-
if not feature_transformers:
46-
feature_transformers = None
41+
else:
42+
for definition in classes:
43+
if isinstance(definition, dict):
44+
params = definition.copy()
45+
klass = params.pop('class')
46+
feature_transformers.append(klass(**params))
47+
else:
48+
feature_transformers.append(definition())
4749

48-
feature_defs.append((column, feature_transformers))
50+
if not feature_transformers:
51+
feature_transformers = None
52+
53+
feature_defs.append((column, feature_transformers))
4954

5055
return feature_defs

tests/test_features_generator.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,18 @@ def simple_dataset():
3939
})
4040

4141

42-
@pytest.mark.parametrize('columns', [['colA', 'colB', 'colC']])
43-
def test_generate_features_with_default_parameters(columns):
42+
def test_generate_features_with_default_parameters():
4443
"""
4544
Tests generating features from classes with default init arguments.
4645
"""
46+
columns = ['colA', 'colB', 'colC']
4747
feature_defs = gen_features(columns=columns, classes=[MockClass])
4848
assert len(feature_defs) == len(columns)
4949

5050
feature_dict = dict(feature_defs)
51-
assert columns == sorted(feature_dict)
51+
assert columns == sorted(feature_dict.keys())
5252

53+
# default init arguments for MockClass for clarification.
5354
expected = {'value': 1, 'name': 'class'}
5455
for column, transformers in feature_dict.items():
5556
for obj in transformers:
@@ -75,9 +76,9 @@ def test_generate_features_with_several_classes():
7576
assert_attributes(transformers[2], name='mockB', value=None)
7677

7778

78-
def test_generate_features_with_none_transformers():
79+
def test_generate_features_with_none_only_transformers():
7980
"""
80-
Tests generating "dummy" feature definiton which doesn't apply any
81+
Tests generating "dummy" feature definition which doesn't apply any
8182
transformation.
8283
"""
8384
feature_defs = gen_features(

0 commit comments

Comments
 (0)