Skip to content

Commit 0e7ae89

Browse files
sstanovnik authored and jreback committed
BUG: multi-type SparseDataFrame fixes and improvements
Author: Sašo Stanovnik <[email protected]> Closes #13917 from sstanovnik/fix-multitype-series-slice and squashes the following commits: 8c7d1ea [Sašo Stanovnik] Colon to comma. 057d56b [Sašo Stanovnik] Wording and code organization fixes. 926ca1e [Sašo Stanovnik] Fix a derp. 442b8c1 [Sašo Stanovnik] Whatsnew, issue tag, test reordering. 8d675ad [Sašo Stanovnik] Add tests for common dtypes, raises check for pandas ones. eebcb23 [Sašo Stanovnik] Moved multitype tests to sparse/tests/test_multitype.py ac790d7 [Sašo Stanovnik] Modify .values docs to process issue #10364. 2104948 [Sašo Stanovnik] Factor the common type discovery to an internal function. 6782bc7 [Sašo Stanovnik] Revert default argument change. 93d2de6 [Sašo Stanovnik] Modified the whatsnew message. 33973a5 [Sašo Stanovnik] Additional multitype tests. 114217e [Sašo Stanovnik] Infer dtype instead of forcing float in SparseArray. c7fb0f2 [Sašo Stanovnik] Use numpy to determine common dtypes. fb6237c [Sašo Stanovnik] Add a whatsnew note. 2e833fa [Sašo Stanovnik] BUG: multi-type sparse slicing fixes and improvements
1 parent cce7993 commit 0e7ae89

File tree

8 files changed

+155
-22
lines changed

8 files changed

+155
-22
lines changed

doc/source/whatsnew/v0.19.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ API changes
437437
- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
438438
- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
439439
- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
440+
- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
440441

441442

442443

@@ -764,6 +765,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
764765
- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`)
765766
- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`)
766767
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
768+
- Bug in single row slicing on multi-type ``SparseDataFrame`` objects, where types were previously forced to float (:issue:`13917`)
767769

768770
.. _whatsnew_0190.deprecations:
769771

pandas/core/generic.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -2887,7 +2887,8 @@ def as_matrix(self, columns=None):
28872887
28882888
e.g. If the dtypes are float16 and float32, dtype will be upcast to
28892889
float32. If dtypes are int32 and uint8, dtype will be upcast to
2890-
int32.
2890+
int32. By numpy.find_common_type convention, mixing int64 and uint64
2891+
will result in a float64 dtype.
28912892
28922893
This method is provided for backwards compatibility. Generally,
28932894
it is recommended to use '.values'.
@@ -2913,8 +2914,9 @@ def values(self):
29132914
with care if you are not dealing with the blocks.
29142915
29152916
e.g. If the dtypes are float16 and float32, dtype will be upcast to
2916-
float32. If dtypes are int32 and uint8, dtype will be upcase to
2917-
int32.
2917+
float32. If dtypes are int32 and uint8, dtype will be upcast to
2918+
int32. By numpy.find_common_type convention, mixing int64 and uint64
2919+
will result in a float64 dtype.
29182920
"""
29192921
return self.as_matrix()
29202922

pandas/core/internals.py

+5-13
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
_infer_dtype_from_scalar,
3636
_soft_convert_objects,
3737
_possibly_convert_objects,
38-
_astype_nansafe)
38+
_astype_nansafe,
39+
_find_common_type)
3940
from pandas.types.missing import (isnull, array_equivalent,
4041
_is_na_compat,
4142
is_null_datelike_scalar)
@@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks):
44354436
for x in blocks:
44364437
counts[type(x)].append(x)
44374438

4438-
def _lcd_dtype(l):
4439-
""" find the lowest dtype that can accomodate the given types """
4440-
m = l[0].dtype
4441-
for x in l[1:]:
4442-
if x.dtype.itemsize > m.itemsize:
4443-
m = x.dtype
4444-
return m
4445-
44464439
have_int = len(counts[IntBlock]) > 0
44474440
have_bool = len(counts[BoolBlock]) > 0
44484441
have_object = len(counts[ObjectBlock]) > 0
@@ -4455,7 +4448,6 @@ def _lcd_dtype(l):
44554448
# TODO: have_sparse is not used
44564449
have_sparse = len(counts[SparseBlock]) > 0 # noqa
44574450
have_numeric = have_float or have_complex or have_int
4458-
44594451
has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat
44604452

44614453
if (have_object or
@@ -4467,10 +4459,9 @@ def _lcd_dtype(l):
44674459
elif have_bool:
44684460
return np.dtype(bool)
44694461
elif have_int and not have_float and not have_complex:
4470-
44714462
# if we are mixing unsigned and signed, then return
44724463
# the next biggest int type (if we can)
4473-
lcd = _lcd_dtype(counts[IntBlock])
4464+
lcd = _find_common_type([b.dtype for b in counts[IntBlock]])
44744465
kinds = set([i.dtype.kind for i in counts[IntBlock]])
44754466
if len(kinds) == 1:
44764467
return lcd
@@ -4486,7 +4477,8 @@ def _lcd_dtype(l):
44864477
elif have_complex:
44874478
return np.dtype('c16')
44884479
else:
4489-
return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock])
4480+
introspection_blks = counts[FloatBlock] + counts[SparseBlock]
4481+
return _find_common_type([b.dtype for b in introspection_blks])
44904482

44914483

44924484
def _consolidate(blocks):

pandas/core/ops.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
is_bool_dtype, is_datetimetz,
3131
is_list_like,
3232
_ensure_object)
33-
from pandas.types.cast import _maybe_upcast_putmask
33+
from pandas.types.cast import _maybe_upcast_putmask, _find_common_type
3434
from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex
3535

3636
# -----------------------------------------------------------------------------
@@ -616,7 +616,7 @@ def na_op(x, y):
616616
raise_on_error=True, **eval_kwargs)
617617
except TypeError:
618618
if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
619-
dtype = np.find_common_type([x.dtype, y.dtype], [])
619+
dtype = _find_common_type([x.dtype, y.dtype])
620620
result = np.empty(x.size, dtype=dtype)
621621
mask = notnull(x) & notnull(y)
622622
result[mask] = op(x[mask], _values_from_object(y[mask]))

pandas/sparse/tests/test_indexing.py

+78
Original file line numberDiff line numberDiff line change
@@ -829,3 +829,81 @@ def test_reindex_fill_value(self):
829829
res = sparse.reindex(['A', 'C', 'B'])
830830
exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0)
831831
tm.assert_sp_frame_equal(res, exp)
832+
833+
834+
class TestMultitype(tm.TestCase):
835+
def setUp(self):
836+
self.cols = ['string', 'int', 'float', 'object']
837+
838+
self.string_series = pd.SparseSeries(['a', 'b', 'c'])
839+
self.int_series = pd.SparseSeries([1, 2, 3])
840+
self.float_series = pd.SparseSeries([1.1, 1.2, 1.3])
841+
self.object_series = pd.SparseSeries([[], {}, set()])
842+
self.sdf = pd.SparseDataFrame({
843+
'string': self.string_series,
844+
'int': self.int_series,
845+
'float': self.float_series,
846+
'object': self.object_series,
847+
})
848+
self.sdf = self.sdf[self.cols]
849+
self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols)
850+
851+
def test_frame_basic_dtypes(self):
852+
for _, row in self.sdf.iterrows():
853+
self.assertEqual(row.dtype, object)
854+
tm.assert_sp_series_equal(self.sdf['string'], self.string_series,
855+
check_names=False)
856+
tm.assert_sp_series_equal(self.sdf['int'], self.int_series,
857+
check_names=False)
858+
tm.assert_sp_series_equal(self.sdf['float'], self.float_series,
859+
check_names=False)
860+
tm.assert_sp_series_equal(self.sdf['object'], self.object_series,
861+
check_names=False)
862+
863+
def test_frame_indexing_single(self):
864+
tm.assert_sp_series_equal(self.sdf.iloc[0],
865+
pd.SparseSeries(['a', 1, 1.1, []],
866+
index=self.cols),
867+
check_names=False)
868+
tm.assert_sp_series_equal(self.sdf.iloc[1],
869+
pd.SparseSeries(['b', 2, 1.2, {}],
870+
index=self.cols),
871+
check_names=False)
872+
tm.assert_sp_series_equal(self.sdf.iloc[2],
873+
pd.SparseSeries(['c', 3, 1.3, set()],
874+
index=self.cols),
875+
check_names=False)
876+
877+
def test_frame_indexing_multiple(self):
878+
tm.assert_sp_frame_equal(self.sdf, self.sdf[:])
879+
tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:])
880+
tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]],
881+
pd.SparseDataFrame({
882+
'string': self.string_series.iloc[[1, 2]],
883+
'int': self.int_series.iloc[[1, 2]],
884+
'float': self.float_series.iloc[[1, 2]],
885+
'object': self.object_series.iloc[[1, 2]]
886+
}, index=[1, 2])[self.cols])
887+
tm.assert_sp_frame_equal(self.sdf[['int', 'string']],
888+
pd.SparseDataFrame({
889+
'int': self.int_series,
890+
'string': self.string_series,
891+
}))
892+
893+
def test_series_indexing_single(self):
894+
for i, idx in enumerate(self.cols):
895+
self.assertEqual(self.ss.iloc[i], self.ss[idx])
896+
self.assertEqual(type(self.ss.iloc[i]),
897+
type(self.ss[idx]))
898+
self.assertEqual(self.ss['string'], 'a')
899+
self.assertEqual(self.ss['int'], 1)
900+
self.assertEqual(self.ss['float'], 1.1)
901+
self.assertEqual(self.ss['object'], [])
902+
903+
def test_series_indexing_multiple(self):
904+
tm.assert_sp_series_equal(self.ss.loc[['string', 'int']],
905+
pd.SparseSeries(['a', 1],
906+
index=['string', 'int']))
907+
tm.assert_sp_series_equal(self.ss.loc[['string', 'object']],
908+
pd.SparseSeries(['a', []],
909+
index=['string', 'object']))

pandas/tests/frame/test_block_internals.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,21 @@ def test_as_matrix_lcd(self):
104104
values = self.mixed_float.as_matrix(['C'])
105105
self.assertEqual(values.dtype, np.float16)
106106

107+
# GH 10364
108+
# B uint64 forces float because there are other signed int types
107109
values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D'])
108-
self.assertEqual(values.dtype, np.int64)
110+
self.assertEqual(values.dtype, np.float64)
109111

110112
values = self.mixed_int.as_matrix(['A', 'D'])
111113
self.assertEqual(values.dtype, np.int64)
112114

113-
# guess all ints are cast to uints....
115+
# B uint64 forces float because there are other signed int types
114116
values = self.mixed_int.as_matrix(['A', 'B', 'C'])
115-
self.assertEqual(values.dtype, np.int64)
117+
self.assertEqual(values.dtype, np.float64)
118+
119+
# as B and C are both unsigned, no forcing to float is needed
120+
values = self.mixed_int.as_matrix(['B', 'C'])
121+
self.assertEqual(values.dtype, np.uint64)
116122

117123
values = self.mixed_int.as_matrix(['A', 'C'])
118124
self.assertEqual(values.dtype, np.int32)

pandas/tests/types/test_cast.py

+44-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
_possibly_convert_objects,
1616
_infer_dtype_from_scalar,
1717
_maybe_convert_string_to_object,
18-
_maybe_convert_scalar)
18+
_maybe_convert_scalar,
19+
_find_common_type)
20+
from pandas.types.dtypes import (CategoricalDtype,
21+
DatetimeTZDtype)
1922
from pandas.util import testing as tm
2023

2124
_multiprocess_can_split_ = True
@@ -188,6 +191,46 @@ def test_possibly_convert_objects_copy(self):
188191
self.assertTrue(values is not out)
189192

190193

194+
class TestCommonTypes(tm.TestCase):
195+
def test_numpy_dtypes(self):
196+
# (source_types, destination_type)
197+
testcases = (
198+
# identity
199+
((np.int64,), np.int64),
200+
((np.uint64,), np.uint64),
201+
((np.float32,), np.float32),
202+
((np.object,), np.object),
203+
204+
# into ints
205+
((np.int16, np.int64), np.int64),
206+
((np.int32, np.uint32), np.int64),
207+
((np.uint16, np.uint64), np.uint64),
208+
209+
# into floats
210+
((np.float16, np.float32), np.float32),
211+
((np.float16, np.int16), np.float32),
212+
((np.float32, np.int16), np.float32),
213+
((np.uint64, np.int64), np.float64),
214+
((np.int16, np.float64), np.float64),
215+
((np.float16, np.int64), np.float64),
216+
217+
# into others
218+
((np.complex128, np.int32), np.complex128),
219+
((np.object, np.float32), np.object),
220+
((np.object, np.int16), np.object),
221+
)
222+
for src, common in testcases:
223+
self.assertEqual(_find_common_type(src), common)
224+
225+
def test_pandas_dtypes(self):
226+
# TODO: not implemented yet
227+
with self.assertRaises(TypeError):
228+
self.assertEqual(_find_common_type([CategoricalDtype()]),
229+
CategoricalDtype)
230+
with self.assertRaises(TypeError):
231+
self.assertEqual(_find_common_type([DatetimeTZDtype()]),
232+
DatetimeTZDtype)
233+
191234
if __name__ == '__main__':
192235
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
193236
exit=False)

pandas/types/cast.py

+10
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
_ensure_int32, _ensure_int64,
2020
_NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
2121
_DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES)
22+
from .dtypes import ExtensionDtype
2223
from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries
2324
from .missing import isnull, notnull
2425
from .inference import is_list_like
@@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
861862
value = _possibly_infer_to_datetimelike(value)
862863

863864
return value
865+
866+
867+
def _find_common_type(types):
868+
"""Find a common data type among the given dtypes."""
869+
# TODO: enable using pandas-specific types
870+
if any(isinstance(t, ExtensionDtype) for t in types):
871+
raise TypeError("Common type discovery is currently only "
872+
"supported for pure numpy dtypes.")
873+
return np.find_common_type(types, [])

0 commit comments

Comments
 (0)