If any of the extracted features is sparse, make the hstacked result sparse as well.

dukebody · dukebody · commit ae885db17320 · 2015-11-07T20:37:32.000+01:00
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy import sparse
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn import cross_validation
 from sklearn import grid_search
@@ -55,11 +56,7 @@ def transform(self, X):
 
 
 def _handle_feature(fea):
-    if hasattr(fea, 'toarray'):
-        # sparse arrays should be converted to regular arrays
-        # for hstack.
-        fea = fea.toarray()
-
+    # convert 1-dimensional arrays to 2-dimensional column vectors
     if len(fea.shape) == 1:
         fea = np.array([fea]).T
 
@@ -156,4 +153,11 @@ def transform(self, X):
         # at this point we lose track of which features
         # were created from which input columns, so it's
         # assumed that that doesn't matter to the model.
-        return np.hstack(extracted)
+
+        # If any of the extracted features is sparse, combine to produce a
+        # sparse matrix. Otherwise, produce a dense one.
+        if any(sparse.issparse(fea) for fea in extracted):
+            stacked = sparse.hstack(extracted).tocsr()
+        else:
+            stacked = np.hstack(extracted)
+        return stacked
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -9,11 +9,12 @@
 
 from pandas import DataFrame
 import pandas as pd
+from scipy import sparse
 from sklearn.datasets import load_iris
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.preprocessing import Imputer, StandardScaler
+from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer
 import numpy as np
 
 from sklearn_pandas import (
@@ -140,3 +141,17 @@ def test_list_transformers():
     # all features have mean 0 and std deviation 1 (standardized)
     assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
     assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
+
+
+def test_sparse_features(cars_dataframe):
+    """
+    If any of the extracted features is sparse, the hstacked
+    result is also sparse (returned in CSR format).
+    """
+    mapper = DataFrameMapper([
+        ("description", CountVectorizer()),  # sparse feature
+        ("model", LabelBinarizer()),  # dense feature
+    ])
+    dmatrix = mapper.fit_transform(cars_dataframe)
+
+    assert isinstance(dmatrix, sparse.csr_matrix)