BUG: Sparse creation with object dtype may raise TypeError

sinhrks · sinhrks · commit 443b47e83be2 · 2016-05-17T21:39:06.000+09:00
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -116,6 +116,7 @@ Bug Fixes
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
 - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
+- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
 - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
 
 
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -152,9 +152,17 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
 
         # Create array, do *not* copy data by default
         if copy:
-            subarr = np.array(values, dtype=dtype, copy=True)
+            try:
+                # TODO: drop this fallback (retry without ``dtype``) once
+                # other dtypes are actually supported
+                subarr = np.array(values, dtype=dtype, copy=True)
+            except ValueError:
+                subarr = np.array(values, copy=True)
         else:
-            subarr = np.asarray(values, dtype=dtype)
+            try:
+                subarr = np.asarray(values, dtype=dtype)
+            except ValueError:
+                subarr = np.asarray(values)
 
         # if we have a bool type, make sure that we have a bool fill_value
         if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
@@ -437,12 +445,12 @@ def count(self):
 
     @property
     def _null_fill_value(self):
-        return np.isnan(self.fill_value)
+        return com.isnull(self.fill_value)
 
     @property
     def _valid_sp_values(self):
         sp_vals = self.sp_values
-        mask = np.isfinite(sp_vals)
+        mask = com.notnull(sp_vals)
         return sp_vals[mask]
 
     @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs)
@@ -616,8 +624,8 @@ def make_sparse(arr, kind='block', fill_value=nan):
     if arr.ndim > 1:
         raise TypeError("expected dimension <= 1 data")
 
-    if np.isnan(fill_value):
-        mask = ~np.isnan(arr)
+    if com.isnull(fill_value):
+        mask = com.notnull(arr)
     else:
         mask = arr != fill_value
 
diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py
@@ -46,6 +46,17 @@ def test_constructor_dtype(self):
         self.assertEqual(arr.dtype, np.int64)
         self.assertEqual(arr.fill_value, 0)
 
+    def test_constructor_object_dtype(self):
+        # GH 11856
+        arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object)
+        self.assertEqual(arr.dtype, np.object)
+        self.assertTrue(np.isnan(arr.fill_value))
+
+        arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object,
+                          fill_value='A')
+        self.assertEqual(arr.dtype, np.object)
+        self.assertEqual(arr.fill_value, 'A')
+
     def test_constructor_spindex_dtype(self):
         arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
         tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan]))
diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/sparse/tests/test_groupby.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestSparseGroupBy(tm.TestCase):
+
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                                         'foo', 'bar', 'foo', 'foo'],
+                                   'B': ['one', 'one', 'two', 'three',
+                                         'two', 'two', 'one', 'three'],
+                                   'C': np.random.randn(8),
+                                   'D': np.random.randn(8),
+                                   'E': [np.nan, np.nan, 1, 2,
+                                         np.nan, 1, np.nan, np.nan]})
+        self.sparse = self.dense.to_sparse()
+
+    def test_first_last_nth(self):
+        # tests for first / last / nth
+        sparse_grouped = self.sparse.groupby('A')
+        dense_grouped = self.dense.groupby('A')
+
+        tm.assert_frame_equal(sparse_grouped.first(),
+                              dense_grouped.first())
+        tm.assert_frame_equal(sparse_grouped.last(),
+                              dense_grouped.last())
+        tm.assert_frame_equal(sparse_grouped.nth(1),
+                              dense_grouped.nth(1))
+
+    def test_aggfuncs(self):
+        sparse_grouped = self.sparse.groupby('A')
+        dense_grouped = self.dense.groupby('A')
+
+        tm.assert_frame_equal(sparse_grouped.mean(),
+                              dense_grouped.mean())
+
+        # TODO: sparse sum includes the str column, so it differs from dense
+        # tm.assert_frame_equal(sparse_grouped.sum(),
+        #                       dense_grouped.sum())
+
+        tm.assert_frame_equal(sparse_grouped.count(),
+                              dense_grouped.count())
diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/sparse/tests/test_pivot.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+
+class TestPivotTable(tm.TestCase):
+
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                                         'foo', 'bar', 'foo', 'foo'],
+                                   'B': ['one', 'one', 'two', 'three',
+                                         'two', 'two', 'one', 'three'],
+                                   'C': np.random.randn(8),
+                                   'D': np.random.randn(8),
+                                   'E': [np.nan, np.nan, 1, 2,
+                                         np.nan, 1, np.nan, np.nan]})
+        self.sparse = self.dense.to_sparse()
+
+    def test_pivot_table(self):
+        res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+                                    values='C')
+        res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+                                   values='C')
+        tm.assert_frame_equal(res_sparse, res_dense)
+
+        res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+                                    values='E')
+        res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+                                   values='E')
+        tm.assert_frame_equal(res_sparse, res_dense)
+
+        res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+                                    values='E', aggfunc='mean')
+        res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+                                   values='E', aggfunc='mean')
+        tm.assert_frame_equal(res_sparse, res_dense)
+
+        # TODO: sum doesn't handle NaN properly; re-enable once fixed
+        # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+        #                             values='E', aggfunc='sum')
+        # res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+        #                            values='E', aggfunc='sum')
+        # tm.assert_frame_equal(res_sparse, res_dense)
+
+    def test_pivot_table_multi(self):
+        res_sparse = pd.pivot_table(self.sparse, index='A', columns='B',
+                                    values=['D', 'E'])
+        res_dense = pd.pivot_table(self.dense, index='A', columns='B',
+                                   values=['D', 'E'])
+        tm.assert_frame_equal(res_sparse, res_dense)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -4508,7 +4508,7 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None
 
-    def test_aaa_groupby_with_small_elem(self):
+    def test_groupby_with_small_elem(self):
         # GH 8542
         # length=2
         df = pd.DataFrame({'event': ['start', 'start'],
@@ -5972,7 +5972,7 @@ def test__cython_agg_general(self):
                 exc.args += ('operation: %s' % op, )
                 raise
 
-    def test_aa_cython_group_transform_algos(self):
+    def test_cython_group_transform_algos(self):
         # GH 4095
         dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
                   np.uint64, np.float32, np.float64]