Merge pull request #85 from paulgb/transformers-input-df

dukebody · web-flow · commit c50565c34ccc · 2017-04-17T11:53:14.000+02:00
Add input_df init argument to pass df/series to transformers
diff --git a/README.rst b/README.rst
@@ -50,7 +50,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
 Load some Data
 **************
 
-Normally you'll read the data from a file, but for demonstration purposes I'll create a data frame from a Python dict::
+Normally you'll read the data from a file, but for demonstration purposes we'll create a data frame from a Python dict::
 
     >>> data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
     ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
@@ -116,6 +116,37 @@ the dataframe mapper. We can do so by inspecting the automatically generated
     ['pet_cat', 'pet_dog', 'pet_fish', 'children']
 
 
+Passing Series/DataFrames to the transformers
+*********************************************
+
+By default the transformers are passed a numpy array of the selected columns
+as input. This is because ``sklearn`` transformers are historically designed to
+work with numpy arrays, not with pandas dataframes, even though their basic
+indexing interfaces are similar.
+
+However, we can pass a dataframe/series to the transformers to handle custom
+cases by initializing the dataframe mapper with ``input_df=True``::
+
+    >>> from sklearn.base import TransformerMixin
+    >>> class DateEncoder(TransformerMixin):
+    ...    def fit(self, X, y=None):
+    ...        return self
+    ...
+    ...    def transform(self, X):
+    ...        dt = X.dt
+    ...        return pd.concat([dt.year, dt.month, dt.day], axis=1)
+    >>> dates_df = pd.DataFrame(
+    ...     {'dates': pd.date_range('2015-10-30', '2015-11-02')})
+    >>> mapper_dates = DataFrameMapper([
+    ...     ('dates', DateEncoder())
+    ... ], input_df=True)
+    >>> mapper_dates.fit_transform(dates_df)
+    array([[2015,   10,   30],
+           [2015,   10,   31],
+           [2015,   11,    1],
+           [2015,   11,    2]])
+
+
 Outputting a dataframe
 **********************
 
@@ -289,6 +320,8 @@ Development
 * Capture output columns generated names in ``transformed_names_`` attribute (#78).
 * Add ``CategoricalImputer`` that replaces null-like values with the mode
   for string-like columns.
+* Add ``input_df`` init argument to allow inputting a dataframe/series to the
+  transformers instead of a numpy array (#60).
 
 
 1.3.0 (2017-01-21)
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -33,7 +33,8 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
     sklearn transformation.
     """
 
-    def __init__(self, features, default=False, sparse=False, df_out=False):
+    def __init__(self, features, default=False, sparse=False, df_out=False,
+                 input_df=False):
         """
         Params:
 
@@ -57,6 +58,10 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
                     if there's multiple inputs, and the name concatenated with
                     '_1', '_2' etc if there's multiple outputs. NB: does not
                     work if *default* or *sparse* are true
+
+        input_df    If ``True`` pass the selected columns to the transformers
+                    as a pandas DataFrame or Series. Otherwise pass them as a
+                    numpy array. Defaults to ``False``.
         """
         if isinstance(features, list):
             features = [(columns, _build_transformer(transformers))
@@ -65,6 +70,7 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
         self.default = _build_transformer(default)
         self.sparse = sparse
         self.df_out = df_out
+        self.input_df = input_df
         self.transformed_names_ = []
 
         if (df_out and (sparse or default)):
@@ -108,6 +114,8 @@ def __setstate__(self, state):
         self.default = state.get('default', False)
         self.df_out = state.get('df_out', False)
 
+        self.input_df = state.get('input_df', False)
+
     def _get_col_subset(self, X, cols):
         """
         Get a subset of columns from the given table X.
@@ -132,10 +140,15 @@ def _get_col_subset(self, X, cols):
             X = X.df
 
         if return_vector:
-            t = X[cols[0]].values
+            t = X[cols[0]]
         else:
-            t = X[cols].values
+            t = X[cols]
 
+        # return either a DataFrame/Series or a numpy array
+        if self.input_df:
+            return t
+        else:
+            return t.values
         return t
 
     def fit(self, X, y=None):
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -56,6 +56,15 @@ def predict(self, X):
         return True
 
 
+class DateEncoder():
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        dt = X.dt
+        return pd.concat([dt.year, dt.month, dt.day], axis=1)
+
+
 class ToSparseTransformer(BaseEstimator, TransformerMixin):
     """
     Transforms numpy matrix to sparse format.
@@ -225,6 +234,87 @@ def test_pca(complex_dataframe):
     assert cols[1] == 'feat1_feat2_1'
 
 
+def test_input_df_true_first_transformer(simple_dataframe, monkeypatch):
+    """
+    If input_df is True, the first transformer is passed
+    a pd.Series instead of an np.array
+    """
+    df = simple_dataframe
+    monkeypatch.setattr(MockXTransformer, 'fit', Mock())
+    monkeypatch.setattr(MockXTransformer, 'transform',
+                        Mock(return_value=np.array([1, 2, 3])))
+    mapper = DataFrameMapper([
+        ('a', MockXTransformer())
+    ], input_df=True)
+    out = mapper.fit_transform(df)
+
+    args, _ = MockXTransformer().fit.call_args
+    assert isinstance(args[0], pd.Series)
+
+    args, _ = MockXTransformer().transform.call_args
+    assert isinstance(args[0], pd.Series)
+
+    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
+
+
+def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
+    """
+    If input_df is True, the subsequent transformers get passed pandas
+    objects instead of numpy arrays (given the previous transformers
+    output pandas objects as well)
+    """
+    df = simple_dataframe
+    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
+    monkeypatch.setattr(MockTClassifier, 'transform',
+                        Mock(return_value=pd.Series([1, 2, 3])))
+    mapper = DataFrameMapper([
+        ('a', [MockXTransformer(), MockTClassifier()])
+    ], input_df=True)
+    out = mapper.fit_transform(df)
+
+    args, _ = MockTClassifier().fit.call_args
+    assert isinstance(args[0], pd.Series)
+
+    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
+
+
+def test_input_df_true_multiple_cols(complex_dataframe):
+    """
+    When input_df is True, applying transformers to multiple columns
+    works as expected
+    """
+    df = complex_dataframe
+
+    mapper = DataFrameMapper([
+        ('target', MockXTransformer()),
+        ('feat1',  MockXTransformer()),
+    ], input_df=True)
+    out = mapper.fit_transform(df)
+
+    assert_array_equal(out[:, 0], df['target'].values)
+    assert_array_equal(out[:, 1], df['feat1'].values)
+
+
+def test_input_df_date_encoder():
+    """
+    When input_df is True we can apply a transformer that only works
+    with pandas dataframes like a DateEncoder
+    """
+    df = pd.DataFrame(
+        {'dates': pd.date_range('2015-10-30', '2015-11-02')})
+    mapper = DataFrameMapper([
+        ('dates', DateEncoder())
+    ], input_df=True)
+    out = mapper.fit_transform(df)
+    expected = np.array([
+        [2015, 10, 30],
+        [2015, 10, 31],
+        [2015, 11, 1],
+        [2015, 11, 2]
+    ])
+    assert_array_equal(out, expected)
+
+
 def test_nonexistent_columns_explicit_fail(simple_dataframe):
     """
     If a nonexistent column is selected, KeyError is raised.