BUG: multi-type SparseDataFrame fixes and improvements

sstanovnik · jreback · commit 0e7ae89115e6 · 2016-08-10T06:36:47.000-04:00
Author: Sašo Stanovnik <sstanovnik@gmail.com> Closes #13917 from sstanovnik/fix-multitype-series-slice and squashes the following commits: 8c7d1ea [Sašo Stanovnik] Colon to comma. 057d56b [Sašo Stanovnik] Wording and code organization fixes. 926ca1e [Sašo Stanovnik] Fix a derp. 442b8c1 [Sašo Stanovnik] Whatsnew, issue tag, test reordering. 8d675ad [Sašo Stanovnik] Add tests for common dtypes, raises check for pandas ones. eebcb23 [Sašo Stanovnik] Moved multitype tests to sparse/tests/test_multitype.py ac790d7 [Sašo Stanovnik] Modify .values docs to process issue #10364. 2104948 [Sašo Stanovnik] Factor the common type discovery to an internal function. 6782bc7 [Sašo Stanovnik] Revert default argument change. 93d2de6 [Sašo Stanovnik] Modified the whatsnew message. 33973a5 [Sašo Stanovnik] Additional multitype tests. 114217e [Sašo Stanovnik] Infer dtype instead of forcing float in SparseArray. c7fb0f2 [Sašo Stanovnik] Use numpy to determine common dtypes. fb6237c [Sašo Stanovnik] Add a whatsnew note. 2e833fa [Sašo Stanovnik] BUG: multi-type sparse slicing fixes and improvements
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -437,6 +437,7 @@ API changes
 - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
 - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
 - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
+- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
 
 
 
@@ -764,6 +765,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
 - Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value``  (:issue:`13866`)
 - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`)
 - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
+- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`)
 
 .. _whatsnew_0190.deprecations:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2887,7 +2887,8 @@ def as_matrix(self, columns=None):
 
         e.g. If the dtypes are float16 and float32, dtype will be upcast to
         float32.  If dtypes are int32 and uint8, dtype will be upcase to
-        int32.
+        int32. By numpy.find_common_type convention, mixing int64 and uint64
+        will result in a flot64 dtype.
 
         This method is provided for backwards compatibility. Generally,
         it is recommended to use '.values'.
@@ -2913,8 +2914,9 @@ def values(self):
         with care if you are not dealing with the blocks.
 
         e.g. If the dtypes are float16 and float32, dtype will be upcast to
-        float32.  If dtypes are int32 and uint8, dtype will be upcase to
-        int32.
+        float32.  If dtypes are int32 and uint8, dtype will be upcast to
+        int32. By numpy.find_common_type convention, mixing int64 and uint64
+        will result in a flot64 dtype.
         """
         return self.as_matrix()
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -35,7 +35,8 @@
                                _infer_dtype_from_scalar,
                                _soft_convert_objects,
                                _possibly_convert_objects,
-                               _astype_nansafe)
+                               _astype_nansafe,
+                               _find_common_type)
 from pandas.types.missing import (isnull, array_equivalent,
                                   _is_na_compat,
                                   is_null_datelike_scalar)
@@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks):
     for x in blocks:
         counts[type(x)].append(x)
 
-    def _lcd_dtype(l):
-        """ find the lowest dtype that can accomodate the given types """
-        m = l[0].dtype
-        for x in l[1:]:
-            if x.dtype.itemsize > m.itemsize:
-                m = x.dtype
-        return m
-
     have_int = len(counts[IntBlock]) > 0
     have_bool = len(counts[BoolBlock]) > 0
     have_object = len(counts[ObjectBlock]) > 0
@@ -4455,7 +4448,6 @@ def _lcd_dtype(l):
     # TODO: have_sparse is not used
     have_sparse = len(counts[SparseBlock]) > 0  # noqa
     have_numeric = have_float or have_complex or have_int
-
     has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat
 
     if (have_object or
@@ -4467,10 +4459,9 @@ def _lcd_dtype(l):
     elif have_bool:
         return np.dtype(bool)
     elif have_int and not have_float and not have_complex:
-
         # if we are mixing unsigned and signed, then return
         # the next biggest int type (if we can)
-        lcd = _lcd_dtype(counts[IntBlock])
+        lcd = _find_common_type([b.dtype for b in counts[IntBlock]])
         kinds = set([i.dtype.kind for i in counts[IntBlock]])
         if len(kinds) == 1:
             return lcd
@@ -4486,7 +4477,8 @@ def _lcd_dtype(l):
     elif have_complex:
         return np.dtype('c16')
     else:
-        return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock])
+        introspection_blks = counts[FloatBlock] + counts[SparseBlock]
+        return _find_common_type([b.dtype for b in introspection_blks])
 
 
 def _consolidate(blocks):
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
@@ -30,7 +30,7 @@
                                  is_bool_dtype, is_datetimetz,
                                  is_list_like,
                                  _ensure_object)
-from pandas.types.cast import _maybe_upcast_putmask
+from pandas.types.cast import _maybe_upcast_putmask, _find_common_type
 from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex
 
 # -----------------------------------------------------------------------------
@@ -616,7 +616,7 @@ def na_op(x, y):
                                           raise_on_error=True, **eval_kwargs)
         except TypeError:
             if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
-                dtype = np.find_common_type([x.dtype, y.dtype], [])
+                dtype = _find_common_type([x.dtype, y.dtype])
                 result = np.empty(x.size, dtype=dtype)
                 mask = notnull(x) & notnull(y)
                 result[mask] = op(x[mask], _values_from_object(y[mask]))
diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py
@@ -829,3 +829,81 @@ def test_reindex_fill_value(self):
         res = sparse.reindex(['A', 'C', 'B'])
         exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0)
         tm.assert_sp_frame_equal(res, exp)
+
+
+class TestMultitype(tm.TestCase):
+    def setUp(self):
+        self.cols = ['string', 'int', 'float', 'object']
+
+        self.string_series = pd.SparseSeries(['a', 'b', 'c'])
+        self.int_series = pd.SparseSeries([1, 2, 3])
+        self.float_series = pd.SparseSeries([1.1, 1.2, 1.3])
+        self.object_series = pd.SparseSeries([[], {}, set()])
+        self.sdf = pd.SparseDataFrame({
+            'string': self.string_series,
+            'int': self.int_series,
+            'float': self.float_series,
+            'object': self.object_series,
+        })
+        self.sdf = self.sdf[self.cols]
+        self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols)
+
+    def test_frame_basic_dtypes(self):
+        for _, row in self.sdf.iterrows():
+            self.assertEqual(row.dtype, object)
+        tm.assert_sp_series_equal(self.sdf['string'], self.string_series,
+                                  check_names=False)
+        tm.assert_sp_series_equal(self.sdf['int'], self.int_series,
+                                  check_names=False)
+        tm.assert_sp_series_equal(self.sdf['float'], self.float_series,
+                                  check_names=False)
+        tm.assert_sp_series_equal(self.sdf['object'], self.object_series,
+                                  check_names=False)
+
+    def test_frame_indexing_single(self):
+        tm.assert_sp_series_equal(self.sdf.iloc[0],
+                                  pd.SparseSeries(['a', 1, 1.1, []],
+                                                  index=self.cols),
+                                  check_names=False)
+        tm.assert_sp_series_equal(self.sdf.iloc[1],
+                                  pd.SparseSeries(['b', 2, 1.2, {}],
+                                                  index=self.cols),
+                                  check_names=False)
+        tm.assert_sp_series_equal(self.sdf.iloc[2],
+                                  pd.SparseSeries(['c', 3, 1.3, set()],
+                                                  index=self.cols),
+                                  check_names=False)
+
+    def test_frame_indexing_multiple(self):
+        tm.assert_sp_frame_equal(self.sdf, self.sdf[:])
+        tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:])
+        tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]],
+                                 pd.SparseDataFrame({
+                                     'string': self.string_series.iloc[[1, 2]],
+                                     'int': self.int_series.iloc[[1, 2]],
+                                     'float': self.float_series.iloc[[1, 2]],
+                                     'object': self.object_series.iloc[[1, 2]]
+                                 }, index=[1, 2])[self.cols])
+        tm.assert_sp_frame_equal(self.sdf[['int', 'string']],
+                                 pd.SparseDataFrame({
+                                     'int': self.int_series,
+                                     'string': self.string_series,
+                                 }))
+
+    def test_series_indexing_single(self):
+        for i, idx in enumerate(self.cols):
+            self.assertEqual(self.ss.iloc[i], self.ss[idx])
+            self.assertEqual(type(self.ss.iloc[i]),
+                             type(self.ss[idx]))
+        self.assertEqual(self.ss['string'], 'a')
+        self.assertEqual(self.ss['int'], 1)
+        self.assertEqual(self.ss['float'], 1.1)
+        self.assertEqual(self.ss['object'], [])
+
+    def test_series_indexing_multiple(self):
+        tm.assert_sp_series_equal(self.ss.loc[['string', 'int']],
+                                  pd.SparseSeries(['a', 1],
+                                                  index=['string', 'int']))
+        tm.assert_sp_series_equal(self.ss.loc[['string', 'object']],
+                                  pd.SparseSeries(['a', []],
+                                                  index=['string', 'object']))
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -104,15 +104,21 @@ def test_as_matrix_lcd(self):
         values = self.mixed_float.as_matrix(['C'])
         self.assertEqual(values.dtype, np.float16)
 
+        # GH 10364
+        # B uint64 forces float because there are other signed int types
         values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D'])
-        self.assertEqual(values.dtype, np.int64)
+        self.assertEqual(values.dtype, np.float64)
 
         values = self.mixed_int.as_matrix(['A', 'D'])
         self.assertEqual(values.dtype, np.int64)
 
-        # guess all ints are cast to uints....
+        # B uint64 forces float because there are other signed int types
         values = self.mixed_int.as_matrix(['A', 'B', 'C'])
-        self.assertEqual(values.dtype, np.int64)
+        self.assertEqual(values.dtype, np.float64)
+
+        # as B and C are both unsigned, no forcing to float is needed
+        values = self.mixed_int.as_matrix(['B', 'C'])
+        self.assertEqual(values.dtype, np.uint64)
 
         values = self.mixed_int.as_matrix(['A', 'C'])
         self.assertEqual(values.dtype, np.int32)
diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py
@@ -15,7 +15,10 @@
                                _possibly_convert_objects,
                                _infer_dtype_from_scalar,
                                _maybe_convert_string_to_object,
-                               _maybe_convert_scalar)
+                               _maybe_convert_scalar,
+                               _find_common_type)
+from pandas.types.dtypes import (CategoricalDtype,
+                                 DatetimeTZDtype)
 from pandas.util import testing as tm
 
 _multiprocess_can_split_ = True
@@ -188,6 +191,46 @@ def test_possibly_convert_objects_copy(self):
         self.assertTrue(values is not out)
 
 
+class TestCommonTypes(tm.TestCase):
+    def test_numpy_dtypes(self):
+        # (source_types, destination_type)
+        testcases = (
+            # identity
+            ((np.int64,), np.int64),
+            ((np.uint64,), np.uint64),
+            ((np.float32,), np.float32),
+            ((np.object,), np.object),
+
+            # into ints
+            ((np.int16, np.int64), np.int64),
+            ((np.int32, np.uint32), np.int64),
+            ((np.uint16, np.uint64), np.uint64),
+
+            # into floats
+            ((np.float16, np.float32), np.float32),
+            ((np.float16, np.int16), np.float32),
+            ((np.float32, np.int16), np.float32),
+            ((np.uint64, np.int64), np.float64),
+            ((np.int16, np.float64), np.float64),
+            ((np.float16, np.int64), np.float64),
+
+            # into others
+            ((np.complex128, np.int32), np.complex128),
+            ((np.object, np.float32), np.object),
+            ((np.object, np.int16), np.object),
+        )
+        for src, common in testcases:
+            self.assertEqual(_find_common_type(src), common)
+
+    def test_pandas_dtypes(self):
+        # TODO: not implemented yet
+        with self.assertRaises(TypeError):
+            self.assertEqual(_find_common_type([CategoricalDtype()]),
+                             CategoricalDtype)
+        with self.assertRaises(TypeError):
+            self.assertEqual(_find_common_type([DatetimeTZDtype()]),
+                             DatetimeTZDtype)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/types/cast.py b/pandas/types/cast.py
@@ -19,6 +19,7 @@
                      _ensure_int32, _ensure_int64,
                      _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
                      _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES)
+from .dtypes import ExtensionDtype
 from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries
 from .missing import isnull, notnull
 from .inference import is_list_like
@@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
             value = _possibly_infer_to_datetimelike(value)
 
     return value
+
+
+def _find_common_type(types):
+    """Find a common data type among the given dtypes."""
+    # TODO: enable using pandas-specific types
+    if any(isinstance(t, ExtensionDtype) for t in types):
+        raise TypeError("Common type discovery is currently only "
+                        "supported for pure numpy dtypes.")
+    return np.find_common_type(types, [])