diff --git a/README.rst b/README.rst
index 9d80967..aa12056 100644
--- a/README.rst
+++ b/README.rst
@@ -240,6 +240,7 @@ in a list::
            [ 0.        ],
            [ 1.22474487]])
 
+
 Columns that don't need any transformation
 ******************************************
 
@@ -282,6 +283,59 @@ passing it as the ``default`` argument to the mapper:
 Using ``default=False`` (the default) drops unselected columns. Using
 ``default=None`` pass the unselected columns unchanged.
 
+
+Same transformer for multiple columns
+*************************************
+
+Sometimes it is necessary to apply the same transformation to several dataframe columns.
+To simplify this process, the package provides the ``gen_features`` function, which accepts a list
+of columns and a feature transformer class (or a list of classes) and generates a feature definition
+accepted by ``DataFrameMapper``.
+
+For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'.
+To encode each of them, one could pass the column names and the ``LabelEncoder`` transformer class
+into the generator, then use the returned definition as the ``features`` argument for ``DataFrameMapper``:
+
+    >>> from sklearn_pandas import gen_features
+    >>> feature_def = gen_features(
+    ...     columns=['col1', 'col2', 'col3'],
+    ...     classes=[sklearn.preprocessing.LabelEncoder]
+    ... )
+    >>> feature_def
+    [('col1', [LabelEncoder()]), ('col2', [LabelEncoder()]), ('col3', [LabelEncoder()])]
+    >>> mapper5 = DataFrameMapper(feature_def)
+    >>> data5 = pd.DataFrame({
+    ...     'col1': ['yes', 'no', 'yes'],
+    ...     'col2': [True, False, False],
+    ...     'col3': ['one', 'two', 'three']
+    ... })
+    >>> mapper5.fit_transform(data5)
+    array([[1, 1, 0],
+           [0, 0, 2],
+           [1, 0, 1]])
+
+To override some of the transformer parameters, provide a dict with a 'class' key and the
+transformer parameters. For example, consider a dataset with missing values. The following
+code overrides the default imputing strategy:
+
+    >>> feature_def = gen_features(
+    ...     columns=[['col1'], ['col2'], ['col3']],
+    ...     classes=[{'class': sklearn.preprocessing.Imputer, 'strategy': 'most_frequent'}]
+    ... )
+    >>> mapper6 = DataFrameMapper(feature_def)
+    >>> data6 = pd.DataFrame({
+    ...     'col1': [None, 1, 1, 2, 3],
+    ...     'col2': [True, False, None, None, True],
+    ...     'col3': [0, 0, 0, None, None]
+    ... })
+    >>> mapper6.fit_transform(data6)
+    array([[ 1.,  1.,  0.],
+           [ 1.,  0.,  0.],
+           [ 1.,  1.,  0.],
+           [ 2.,  1.,  0.],
+           [ 3.,  1.,  0.]])
+
+
 Feature selection and other supervised transformations
 *******************************************************
 
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index ca0f54e..6642c35 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -3,3 +3,4 @@
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
 from .categorical_imputer import CategoricalImputer  # NOQA
+from .features_generator import gen_features  # NOQA
diff --git a/sklearn_pandas/features_generator.py b/sklearn_pandas/features_generator.py
new file mode 100644
index 0000000..643ddd0
--- /dev/null
+++ b/sklearn_pandas/features_generator.py
@@ -0,0 +1,55 @@
+def gen_features(columns, classes=None):
+    """Generates a feature definition list which can be passed
+    into DataFrameMapper.
+
+    Params:
+
+    columns       a list of column names to generate features for.
+
+    classes       a list of classes for each feature, a list of dictionaries with
+                  transformer class and init parameters, or None.
+
+                  If a list of classes is provided, each of them is
+                  instantiated with default arguments. Example:
+
+                      classes = [StandardScaler, LabelBinarizer]
+
+                  If a list of dictionaries is provided, each of them should
+                  have a 'class' key with the transformer class. All other keys
+                  are passed to that class's constructor. Example:
+
+                      classes = [
+                          {'class': StandardScaler, 'with_mean': False},
+                          {'class': LabelBinarizer}
+                      ]
+
+                  If None is provided, each feature is left as is.
+
+    """
+    if classes is None:
+        return [(column, None) for column in columns]
+
+    feature_defs = []
+
+    for column in columns:
+        feature_transformers = []
+
+        classes = [cls for cls in classes if cls is not None]
+        if not classes:
+            feature_defs.append((column, None))
+
+        else:
+            for definition in classes:
+                if isinstance(definition, dict):
+                    params = definition.copy()
+                    klass = params.pop('class')
+                    feature_transformers.append(klass(**params))
+                else:
+                    feature_transformers.append(definition())
+
+            if not feature_transformers:
+                feature_transformers = None
+
+            feature_defs.append((column, feature_transformers))
+
+    return feature_defs
diff --git a/tests/test_features_generator.py b/tests/test_features_generator.py
new file mode 100644
index 0000000..aa56e82
--- /dev/null
+++ b/tests/test_features_generator.py
@@ -0,0 +1,118 @@
+from collections import Counter
+
+import pytest
+import numpy as np
+from pandas import DataFrame
+from numpy.testing import assert_array_equal
+
+from sklearn_pandas import DataFrameMapper
+from sklearn_pandas.features_generator import gen_features
+
+
+class MockClass(object):
+
+    def __init__(self, value=1, name='class'):
+        self.value = value
+        self.name = name
+
+
+class MockTransformer(object):
+
+    def __init__(self):
+        self.most_common_ = None
+
+    def fit(self, X, y=None):
+        [(value, _)] = Counter(X).most_common(1)
+        self.most_common_ = value
+        return self
+
+    def transform(self, X, y=None):
+        return np.asarray([self.most_common_] * len(X))
+
+
+@pytest.fixture
+def simple_dataset():
+    return DataFrame({
+        'feat1': [1, 2, 1, 3, 1],
+        'feat2': [1, 2, 2, 2, 3],
+        'feat3': [1, 2, 3, 4, 5],
+    })
+
+
+def test_generate_features_with_default_parameters():
+    """
+    Tests generating features from classes with default init arguments.
+    """
+    columns = ['colA', 'colB', 'colC']
+    feature_defs = gen_features(columns=columns, classes=[MockClass])
+    assert len(feature_defs) == len(columns)
+
+    feature_dict = dict(feature_defs)
+    assert columns == sorted(feature_dict.keys())
+
+    # Default init arguments for MockClass, spelled out for clarity.
+    expected = {'value': 1, 'name': 'class'}
+    for column, transformers in feature_dict.items():
+        for obj in transformers:
+            assert_attributes(obj, **expected)
+
+
+def test_generate_features_with_several_classes():
+    """
+    Tests generating a features pipeline with different transformer parameters.
+    """
+    feature_defs = gen_features(
+        columns=['colA', 'colB', 'colC'],
+        classes=[
+            {'class': MockClass},
+            {'class': MockClass, 'name': 'mockA'},
+            {'class': MockClass, 'name': 'mockB', 'value': None}
+        ]
+    )
+
+    for transformers in dict(feature_defs).values():
+        assert_attributes(transformers[0], name='class', value=1)
+        assert_attributes(transformers[1], name='mockA', value=1)
+        assert_attributes(transformers[2], name='mockB', value=None)
+
+
+def test_generate_features_with_none_only_transformers():
+    """
+    Tests generating a "dummy" feature definition which doesn't apply any
+    transformation.
+    """
+    feature_defs = gen_features(
+        columns=['colA', 'colB', 'colC'], classes=[None])
+
+    expected = [('colA', None),
+                ('colB', None),
+                ('colC', None)]
+
+    assert feature_defs == expected
+
+
+def test_compatibility_with_data_frame_mapper(simple_dataset):
+    """
+    Tests compatibility of generated feature definition with DataFrameMapper.
+    """
+    features_defs = gen_features(
+        columns=['feat1', 'feat2'],
+        classes=[MockTransformer])
+    features_defs.append(('feat3', None))
+
+    mapper = DataFrameMapper(features_defs)
+    X = mapper.fit_transform(simple_dataset)
+    expected = np.asarray([
+        [1, 2, 1],
+        [1, 2, 2],
+        [1, 2, 3],
+        [1, 2, 4],
+        [1, 2, 5]
+    ])
+
+    assert_array_equal(X, expected)
+
+
+def assert_attributes(obj, **attrs):
+    for attr, value in attrs.items():
+        assert getattr(obj, attr) == value
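
For readers trying out the new helper, here is a minimal sketch (not part of the diff above) of how passing several classes to ``gen_features`` chains transformers per column. It assumes a scikit-learn version that still ships ``sklearn.preprocessing.Imputer`` (the class used in the README examples above); the column names and data are made up for illustration::

    import pandas as pd
    import sklearn.preprocessing
    from sklearn_pandas import DataFrameMapper, gen_features

    # Each column gets an Imputer followed by a StandardScaler:
    # gen_features pairs every column with the full list of transformers.
    feature_def = gen_features(
        columns=[['col1'], ['col2']],
        classes=[
            {'class': sklearn.preprocessing.Imputer, 'strategy': 'mean'},
            {'class': sklearn.preprocessing.StandardScaler},
        ],
    )

    mapper = DataFrameMapper(feature_def)
    data = pd.DataFrame({
        'col1': [1.0, None, 3.0],
        'col2': [0.0, 2.0, None],
    })
    # Missing values are imputed with the column mean, then each column
    # is standardized to zero mean and unit variance.
    print(mapper.fit_transform(data))

Passing the columns as one-element lists (``['col1']``) keeps the selected input two-dimensional, which is what ``Imputer`` and ``StandardScaler`` expect.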