Skip to content

Commit 4a63261

Browse files
authored
Merge pull request #73 from paulgb/jph00-dfout
Add a df_out option to return a dataframe
2 parents 10a43e4 + 4a0f16c commit 4a63261

File tree

3 files changed

+148
-6
lines changed

3 files changed

+148
-6
lines changed

README.rst

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,29 @@ Now that the transformation is trained, we confirm that it works on new data::
102102
>>> np.round(mapper.transform(sample), 2)
103103
array([[ 1. , 0. , 0. , 1.04]])
104104

105+
106+
Outputting a dataframe
107+
**********************
108+
109+
By default the output of the dataframe mapper is a numpy array. This is because most sklearn estimators expect a numpy array as input. If, however, we want the output of the mapper to be a dataframe, we can do so using the parameter ``df_out`` when creating the mapper::
110+
111+
>>> mapper_df = DataFrameMapper([
112+
... ('pet', sklearn.preprocessing.LabelBinarizer()),
113+
... (['children'], sklearn.preprocessing.StandardScaler())
114+
... ], df_out=True)
115+
>>> np.round(mapper_df.fit_transform(data.copy()), 2)
116+
pet_cat pet_dog pet_fish children
117+
0 1.0 0.0 0.0 0.21
118+
1 0.0 1.0 0.0 1.88
119+
2 0.0 1.0 0.0 -0.63
120+
3 0.0 0.0 1.0 -0.63
121+
4 1.0 0.0 0.0 -1.46
122+
5 0.0 1.0 0.0 -0.63
123+
6 1.0 0.0 0.0 1.04
124+
7 0.0 0.0 1.0 0.21
125+
126+
Note that this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
127+
105128
Transform Multiple Columns
106129
**************************
107130

@@ -229,6 +252,13 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
229252
Changelog
230253
---------
231254

255+
Development
256+
***********
257+
258+
* Make the mapper return dataframes when ``df_out=True`` (#70).
259+
* Update imports to avoid deprecation warnings in sklearn 0.18 (#68).
260+
261+
232262
1.2.0 (2016-10-02)
233263
******************
234264

@@ -272,9 +302,10 @@ The code for ``DataFrameMapper`` is based on code originally written by `Ben Ham
272302

273303
Other contributors:
274304

275-
* Paul Butler
276305
* Cal Paterson
277306
* Israel Saeta Pérez
278-
* Zac Stewart
307+
* Jeremy Howard
279308
* Olivier Grisel
309+
* Paul Butler
280310
* Vitaley Zaretskey
311+
* Zac Stewart

sklearn_pandas/dataframe_mapper.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
3333
sklearn transformation.
3434
"""
3535

36-
def __init__(self, features, default=False, sparse=False):
36+
def __init__(self, features, default=False, sparse=False, df_out=False):
3737
"""
3838
Params:
3939
@@ -50,13 +50,23 @@ def __init__(self, features, default=False, sparse=False):
5050
5151
sparse will return sparse matrix if set True and any of the
5252
extracted features is sparse. Defaults to False.
53+
54+
df_out return a pandas data frame, with each column named using
55+
the pandas column that created it (if there's only one
56+
input and output) or the input columns joined with '_'
57+
if there are multiple inputs, and the name concatenated with
58+
'_0', '_1' etc. if there are multiple outputs. NB: does not
59+
work if *default* or *sparse* are true
5360
"""
5461
if isinstance(features, list):
5562
features = [(columns, _build_transformer(transformers))
5663
for (columns, transformers) in features]
5764
self.features = features
5865
self.default = _build_transformer(default)
5966
self.sparse = sparse
67+
self.df_out = df_out
68+
if (df_out and (sparse or default)):
69+
raise ValueError("Can not use df_out with sparse or default")
6070

6171
@property
6272
def _selected_columns(self):
@@ -94,6 +104,7 @@ def __setstate__(self, state):
94104
# compatibility shim for pickles created before ``default`` init
95105
# argument existed
96106
self.default = state.get('default', False)
107+
self.df_out = state.get('df_out', False)
97108

98109
def _get_col_subset(self, X, cols):
99110
"""
@@ -145,13 +156,26 @@ def fit(self, X, y=None):
145156
self._get_col_subset(X, self._unselected_columns(X)), y)
146157
return self
147158

159+
160+
def get_names(self, c, t, x):
161+
if type(c)==list:
162+
c = '_'.join(c)
163+
if hasattr(t, 'classes_') and (len(t.classes_)>2):
164+
return [c + '_' + o for o in t.classes_]
165+
elif len(x.shape)>1 and x.shape[1]>1:
166+
return [c + '_' + str(o) for o in range(x.shape[1])]
167+
else:
168+
return [c]
169+
170+
148171
def transform(self, X):
149172
"""
150173
Transform the given data. Assumes that fit has already been called.
151174
152175
X the data to transform
153176
"""
154177
extracted = []
178+
index = []
155179
for columns, transformers in self.features:
156180
# columns could be a string or list of
157181
# strings; we don't care because pandas
@@ -160,10 +184,13 @@ def transform(self, X):
160184
if transformers is not None:
161185
Xt = transformers.transform(Xt)
162186
extracted.append(_handle_feature(Xt))
187+
if self.df_out:
188+
index = index + self.get_names(columns, transformers, Xt)
163189

164190
# handle features not explicitly selected
165191
if self.default is not False:
166-
Xt = self._get_col_subset(X, self._unselected_columns(X))
192+
unsel_cols = self._unselected_columns(X)
193+
Xt = self._get_col_subset(X, unsel_cols)
167194
if self.default is not None:
168195
Xt = self.default.transform(Xt)
169196
extracted.append(_handle_feature(Xt))
@@ -185,4 +212,7 @@ def transform(self, X):
185212
else:
186213
stacked = np.hstack(extracted)
187214

188-
return stacked
215+
if not self.df_out:
216+
return stacked
217+
218+
return pd.DataFrame(stacked, columns=index)

tests/test_dataframe_mapper.py

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
from sklearn.pipeline import Pipeline
1818
from sklearn.svm import SVC
1919
from sklearn.feature_extraction.text import CountVectorizer
20-
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
20+
from sklearn.preprocessing import (
21+
Imputer, StandardScaler, OneHotEncoder, LabelBinarizer)
2122
from sklearn.feature_selection import SelectKBest, chi2
2223
from sklearn.base import BaseEstimator, TransformerMixin
24+
import sklearn.decomposition
2325
import numpy as np
2426
from numpy.testing import assert_array_equal
2527
import pickle
@@ -77,6 +79,85 @@ def complex_dataframe():
7779
'feat2': [1, 2, 3, 2, 3, 4]})
7880

7981

82+
def test_simple_df(simple_dataframe):
83+
"""
84+
Get a dataframe from a simple mapped dataframe
85+
"""
86+
df = simple_dataframe
87+
mapper = DataFrameMapper([('a', None)], df_out=True)
88+
transformed = mapper.fit_transform(df)
89+
assert type(transformed) == pd.DataFrame
90+
assert len(transformed["a"]) == len(simple_dataframe["a"])
91+
92+
93+
def test_complex_df(complex_dataframe):
94+
"""
95+
Get a dataframe from a complex mapped dataframe
96+
"""
97+
df = complex_dataframe
98+
mapper = DataFrameMapper(
99+
[('target', None), ('feat1', None), ('feat2', None)],
100+
df_out=True)
101+
transformed = mapper.fit_transform(df)
102+
assert len(transformed) == len(complex_dataframe)
103+
for c in df.columns:
104+
assert len(transformed[c]) == len(df[c])
105+
106+
107+
def test_binarizer_df():
108+
"""
109+
Check level names from LabelBinarizer
110+
"""
111+
df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
112+
mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
113+
transformed = mapper.fit_transform(df)
114+
cols = transformed.columns
115+
assert len(cols) == 3
116+
assert cols[0] == 'target_a'
117+
assert cols[1] == 'target_b'
118+
assert cols[2] == 'target_c'
119+
120+
121+
def test_binarizer2_df():
122+
"""
123+
Check level names from LabelBinarizer with just one output column
124+
"""
125+
df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
126+
mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
127+
transformed = mapper.fit_transform(df)
128+
cols = transformed.columns
129+
assert len(cols) == 1
130+
assert cols[0] == 'target'
131+
132+
133+
def test_onehot_df():
134+
"""
135+
Check level ids from one-hot
136+
"""
137+
df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
138+
mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
139+
transformed = mapper.fit_transform(df)
140+
cols = transformed.columns
141+
assert len(cols) == 4
142+
assert cols[0] == 'target_0'
143+
assert cols[3] == 'target_3'
144+
145+
146+
def test_pca(complex_dataframe):
147+
"""
148+
Check multi in and out with PCA
149+
"""
150+
df = complex_dataframe
151+
mapper = DataFrameMapper(
152+
[(['feat1', 'feat2'], sklearn.decomposition.PCA(2))],
153+
df_out=True)
154+
transformed = mapper.fit_transform(df)
155+
cols = transformed.columns
156+
assert len(cols) == 2
157+
assert cols[0] == 'feat1_feat2_0'
158+
assert cols[1] == 'feat1_feat2_1'
159+
160+
80161
def test_nonexistent_columns_explicit_fail(simple_dataframe):
81162
"""
82163
If a nonexistent column is selected, KeyError is raised.

0 commit comments

Comments
 (0)