diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a041e175d5f1a..f8ce963253887 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -437,6 +437,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) @@ -764,6 +765,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` - Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) - Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) - Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) +- Bug in single row slicing on multi-type ``SparseDataFrame``s, types were previously forced to float (:issue:`13917`) .. _whatsnew_0190.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 17cc76e703631..d0295afe990c8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2887,7 +2887,8 @@ def as_matrix(self, columns=None): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + int32. By numpy.find_common_type convention, mixing int64 and uint64 + will result in a float64 dtype. This method is provided for backwards compatibility. 
Generally, it is recommended to use '.values'. @@ -2913,8 +2914,9 @@ def values(self): with care if you are not dealing with the blocks. e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By numpy.find_common_type convention, mixing int64 and uint64 + will result in a float64 dtype. """ return self.as_matrix() diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 18b67c41b4554..e9b45e444d8d8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -35,7 +35,8 @@ _infer_dtype_from_scalar, _soft_convert_objects, _possibly_convert_objects, - _astype_nansafe) + _astype_nansafe, + _find_common_type) from pandas.types.missing import (isnull, array_equivalent, _is_na_compat, is_null_datelike_scalar) @@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks): for x in blocks: counts[type(x)].append(x) - def _lcd_dtype(l): - """ find the lowest dtype that can accomodate the given types """ - m = l[0].dtype - for x in l[1:]: - if x.dtype.itemsize > m.itemsize: - m = x.dtype - return m - have_int = len(counts[IntBlock]) > 0 have_bool = len(counts[BoolBlock]) > 0 have_object = len(counts[ObjectBlock]) > 0 @@ -4455,7 +4448,6 @@ def _lcd_dtype(l): # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat if (have_object or @@ -4467,10 +4459,9 @@ def _lcd_dtype(l): elif have_bool: return np.dtype(bool) elif have_int and not have_float and not have_complex: - # if we are mixing unsigned and signed, then return # the next biggest int type (if we can) - lcd = _lcd_dtype(counts[IntBlock]) + lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) kinds = set([i.dtype.kind for i in counts[IntBlock]]) if len(kinds) == 1: return lcd @@ -4486,7 
+4477,8 @@ def _lcd_dtype(l): elif have_complex: return np.dtype('c16') else: - return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) + introspection_blks = counts[FloatBlock] + counts[SparseBlock] + return _find_common_type([b.dtype for b in introspection_blks]) def _consolidate(blocks): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 44e3be32c23df..66d9391d2facf 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -30,7 +30,7 @@ is_bool_dtype, is_datetimetz, is_list_like, _ensure_object) -from pandas.types.cast import _maybe_upcast_putmask +from pandas.types.cast import _maybe_upcast_putmask, _find_common_type from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- @@ -616,7 +616,7 @@ def na_op(x, y): raise_on_error=True, **eval_kwargs) except TypeError: if isinstance(y, (np.ndarray, ABCSeries, pd.Index)): - dtype = np.find_common_type([x.dtype, y.dtype], []) + dtype = _find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) result[mask] = op(x[mask], _values_from_object(y[mask])) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 1f88d22bd8f93..74c3785b06d77 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -829,3 +829,81 @@ def test_reindex_fill_value(self): res = sparse.reindex(['A', 'C', 'B']) exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) + + +class TestMultitype(tm.TestCase): + def setUp(self): + self.cols = ['string', 'int', 'float', 'object'] + + self.string_series = pd.SparseSeries(['a', 'b', 'c']) + self.int_series = pd.SparseSeries([1, 2, 3]) + self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) + self.object_series = pd.SparseSeries([[], {}, set()]) + self.sdf = pd.SparseDataFrame({ + 'string': self.string_series, + 'int': self.int_series, + 'float': 
self.float_series, + 'object': self.object_series, + }) + self.sdf = self.sdf[self.cols] + self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols) + + def test_frame_basic_dtypes(self): + for _, row in self.sdf.iterrows(): + self.assertEqual(row.dtype, object) + tm.assert_sp_series_equal(self.sdf['string'], self.string_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['int'], self.int_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['float'], self.float_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['object'], self.object_series, + check_names=False) + + def test_frame_indexing_single(self): + tm.assert_sp_series_equal(self.sdf.iloc[0], + pd.SparseSeries(['a', 1, 1.1, []], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[1], + pd.SparseSeries(['b', 2, 1.2, {}], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[2], + pd.SparseSeries(['c', 3, 1.3, set()], + index=self.cols), + check_names=False) + + def test_frame_indexing_multiple(self): + tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) + tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) + tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], + pd.SparseDataFrame({ + 'string': self.string_series.iloc[[1, 2]], + 'int': self.int_series.iloc[[1, 2]], + 'float': self.float_series.iloc[[1, 2]], + 'object': self.object_series.iloc[[1, 2]] + }, index=[1, 2])[self.cols]) + tm.assert_sp_frame_equal(self.sdf[['int', 'string']], + pd.SparseDataFrame({ + 'int': self.int_series, + 'string': self.string_series, + })) + + def test_series_indexing_single(self): + for i, idx in enumerate(self.cols): + self.assertEqual(self.ss.iloc[i], self.ss[idx]) + self.assertEqual(type(self.ss.iloc[i]), + type(self.ss[idx])) + self.assertEqual(self.ss['string'], 'a') + self.assertEqual(self.ss['int'], 1) + self.assertEqual(self.ss['float'], 1.1) + self.assertEqual(self.ss['object'], []) + + def 
test_series_indexing_multiple(self): + tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], + pd.SparseSeries(['a', 1], + index=['string', 'int'])) + tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], + pd.SparseSeries(['a', []], + index=['string', 'object'])) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 38163d89355e9..e51cc0f5a6ec7 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -104,15 +104,21 @@ def test_as_matrix_lcd(self): values = self.mixed_float.as_matrix(['C']) self.assertEqual(values.dtype, np.float16) + # GH 10364 + # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) values = self.mixed_int.as_matrix(['A', 'D']) self.assertEqual(values.dtype, np.int64) - # guess all ints are cast to uints.... 
+ # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) + + # as B and C are both unsigned, no forcing to float is needed + values = self.mixed_int.as_matrix(['B', 'C']) + self.assertEqual(values.dtype, np.uint64) values = self.mixed_int.as_matrix(['A', 'C']) self.assertEqual(values.dtype, np.int32) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index dd3f07ea8157f..3394974d833fb 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -15,7 +15,10 @@ _possibly_convert_objects, _infer_dtype_from_scalar, _maybe_convert_string_to_object, - _maybe_convert_scalar) + _maybe_convert_scalar, + _find_common_type) +from pandas.types.dtypes import (CategoricalDtype, + DatetimeTZDtype) from pandas.util import testing as tm _multiprocess_can_split_ = True @@ -188,6 +191,46 @@ def test_possibly_convert_objects_copy(self): self.assertTrue(values is not out) +class TestCommonTypes(tm.TestCase): + def test_numpy_dtypes(self): + # (source_types, destination_type) + testcases = ( + # identity + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + + # into ints + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + + # into floats + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + + # into others + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + ) + for src, common in testcases: + self.assertEqual(_find_common_type(src), common) + + def test_pandas_dtypes(self): + # TODO: not implemented yet + 
with self.assertRaises(TypeError): + self.assertEqual(_find_common_type([CategoricalDtype()]), + CategoricalDtype) + with self.assertRaises(TypeError): + self.assertEqual(_find_common_type([DatetimeTZDtype()]), + DatetimeTZDtype) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index e37b418664ba3..93be926fe1eeb 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -19,6 +19,7 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .dtypes import ExtensionDtype from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries from .missing import isnull, notnull from .inference import is_list_like @@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): value = _possibly_infer_to_datetimelike(value) return value + + +def _find_common_type(types): + """Find a common data type among the given dtypes.""" + # TODO: enable using pandas-specific types + if any(isinstance(t, ExtensionDtype) for t in types): + raise TypeError("Common type discovery is currently only " + "supported for pure numpy dtypes.") + return np.find_common_type(types, [])