From 443b47e83be2191477f6a8530e06001f5da69ce0 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 17 May 2016 21:08:37 +0900 Subject: [PATCH] BUG: Sparse creation with object dtype may raise TypeError --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/sparse/array.py | 20 +++++++---- pandas/sparse/tests/test_array.py | 11 ++++++ pandas/sparse/tests/test_groupby.py | 46 +++++++++++++++++++++++++ pandas/sparse/tests/test_pivot.py | 52 +++++++++++++++++++++++++++++ pandas/tests/test_groupby.py | 4 +-- 6 files changed, 126 insertions(+), 8 deletions(-) create mode 100644 pandas/sparse/tests/test_groupby.py create mode 100644 pandas/sparse/tests/test_pivot.py diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 459bdbf10a4f1..84751fbe6bfe7 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -116,6 +116,7 @@ Bug Fixes - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index e114bee87ca27..0312fb023f7fd 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -152,9 +152,17 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', # Create array, do *not* copy data by default if copy: - subarr = np.array(values, dtype=dtype, copy=True) + try: + # ToDo: Can remove this error handling when we actually + # support other dtypes + subarr = np.array(values, dtype=dtype, copy=True) + except ValueError: + subarr = np.array(values, copy=True) else: - subarr = np.asarray(values, dtype=dtype) + try: + subarr = np.asarray(values, dtype=dtype) + except ValueError: + subarr = np.asarray(values) # if we have a bool type, make sure that we have a bool fill_value if ((dtype is not None and issubclass(dtype.type, np.bool_)) or @@ -437,12 +445,12 @@ def count(self): @property def _null_fill_value(self): - return np.isnan(self.fill_value) + return com.isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = np.isfinite(sp_vals) + mask = com.notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -616,8 +624,8 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") - if np.isnan(fill_value): - mask = ~np.isnan(arr) + if com.isnull(fill_value): + mask = com.notnull(arr) else: mask = arr != fill_value diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 26d018c56a8a8..dd2126d0f52d2 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -46,6 +46,17 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) + def test_constructor_object_dtype(self): + # GH 11856 + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) + self.assertEqual(arr.dtype, np.object) + self.assertTrue(np.isnan(arr.fill_value)) + + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, + fill_value='A') + self.assertEqual(arr.dtype, np.object) + self.assertEqual(arr.fill_value, 'A') + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/sparse/tests/test_groupby.py new file mode 100644 index 0000000000000..0cb33f4ea0a56 --- /dev/null +++ b/pandas/sparse/tests/test_groupby.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestSparseGroupBy(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_first_last_nth(self): + # tests for first / last / nth + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.first(), + dense_grouped.first()) + tm.assert_frame_equal(sparse_grouped.last(), + dense_grouped.last()) + tm.assert_frame_equal(sparse_grouped.nth(1), + dense_grouped.nth(1)) + + def test_aggfuncs(self): + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.mean(), + dense_grouped.mean()) + + # ToDo: sparse sum includes str column + # tm.assert_frame_equal(sparse_grouped.sum(), + # dense_grouped.sum()) + + tm.assert_frame_equal(sparse_grouped.count(), + dense_grouped.count()) diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/sparse/tests/test_pivot.py new file mode 100644 index 0000000000000..482a99a96194f --- /dev/null +++ b/pandas/sparse/tests/test_pivot.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestPivotTable(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_pivot_table(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='C') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='C') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E', aggfunc='mean') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E', aggfunc='mean') + tm.assert_frame_equal(res_sparse, res_dense) + + # ToDo: sum doesn't handle nan properly + # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + # values='E', aggfunc='sum') + # res_dense = pd.pivot_table(self.dense, index='A', columns='B', + # values='E', aggfunc='sum') + # tm.assert_frame_equal(res_sparse, res_dense) + + def test_pivot_table_multi(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values=['D', 'E']) + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values=['D', 'E']) + tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 583b1c7aea270..de56c92a9773c 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4508,7 +4508,7 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - def test_aaa_groupby_with_small_elem(self): + def test_groupby_with_small_elem(self): # GH 8542 # length=2 df = pd.DataFrame({'event': ['start', 'start'], @@ -5972,7 +5972,7 @@ def test__cython_agg_general(self): exc.args += ('operation: %s' % op, ) raise - def test_aa_cython_group_transform_algos(self): + def test_cython_group_transform_algos(self): # GH 4095 dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, np.uint64, np.float32, np.float64]