Skip to content

Commit 9df8065

Browse files
Preserve Extension type on cross section (#22785)
1 parent f64f994 commit 9df8065

File tree

9 files changed

+91
-32
lines changed

9 files changed

+91
-32
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ ExtensionType Changes
500500
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
501501
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
502502
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
503+
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
503504
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
504505

505506
.. _whatsnew_0240.api.incompatibilities:

pandas/core/base.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,7 @@ def transpose(self, *args, **kwargs):
664664
"definition self")
665665

666666
@property
667-
def _is_homogeneous(self):
667+
def _is_homogeneous_type(self):
668668
"""Whether the object has a single dtype.
669669
670670
By definition, Series and Index are always considered homogeneous.
@@ -673,8 +673,8 @@ def _is_homogeneous(self):
673673
674674
See Also
675675
--------
676-
DataFrame._is_homogeneous
677-
MultiIndex._is_homogeneous
676+
DataFrame._is_homogeneous_type
677+
MultiIndex._is_homogeneous_type
678678
"""
679679
return True
680680

pandas/core/frame.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ def shape(self):
614614
return len(self.index), len(self.columns)
615615

616616
@property
617-
def _is_homogeneous(self):
617+
def _is_homogeneous_type(self):
618618
"""
619619
Whether all the columns in a DataFrame have the same type.
620620
@@ -624,16 +624,17 @@ def _is_homogeneous(self):
624624
625625
Examples
626626
--------
627-
>>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous
627+
>>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
628628
True
629-
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous
629+
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
630630
False
631631
632632
Items with the same type but different sizes are considered
633633
different types.
634634
635-
>>> DataFrame({"A": np.array([1, 2], dtype=np.int32),
636-
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous
635+
>>> DataFrame({
636+
... "A": np.array([1, 2], dtype=np.int32),
637+
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
637638
False
638639
"""
639640
if self._data.any_extension_types:

pandas/core/indexes/multi.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -289,21 +289,23 @@ def levels(self):
289289
return self._levels
290290

291291
@property
292-
def _is_homogeneous(self):
292+
def _is_homogeneous_type(self):
293293
"""Whether the levels of a MultiIndex all have the same dtype.
294294
295295
This looks at the dtypes of the levels.
296296
297297
See Also
298298
--------
299-
Index._is_homogeneous
300-
DataFrame._is_homogeneous
299+
Index._is_homogeneous_type
300+
DataFrame._is_homogeneous_type
301301
302302
Examples
303303
--------
304-
>>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous
304+
>>> MultiIndex.from_tuples([
305+
... ('a', 'b'), ('a', 'c')])._is_homogeneous_type
305306
True
306-
>>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous
307+
>>> MultiIndex.from_tuples([
308+
... ('a', 1), ('a', 2)])._is_homogeneous_type
307309
False
308310
"""
309311
return len({x.dtype for x in self.levels}) <= 1

pandas/core/internals/managers.py

+30-11
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212
from pandas.util._validators import validate_bool_kwarg
1313
from pandas.compat import range, map, zip
1414

15-
from pandas.core.dtypes.dtypes import (
16-
ExtensionDtype,
17-
PandasExtensionDtype)
1815
from pandas.core.dtypes.common import (
1916
_NS_DTYPE,
2017
is_datetimelike_v_numeric,
@@ -791,6 +788,11 @@ def _interleave(self):
791788
"""
792789
dtype = _interleaved_dtype(self.blocks)
793790

791+
if is_extension_array_dtype(dtype):
792+
# TODO: https://github.com/pandas-dev/pandas/issues/22791
793+
# Give EAs some input on what happens here. Sparse needs this.
794+
dtype = 'object'
795+
794796
result = np.empty(self.shape, dtype=dtype)
795797

796798
if result.shape[0] == 0:
@@ -906,14 +908,25 @@ def fast_xs(self, loc):
906908

907909
# unique
908910
dtype = _interleaved_dtype(self.blocks)
911+
909912
n = len(items)
910-
result = np.empty(n, dtype=dtype)
913+
if is_extension_array_dtype(dtype):
914+
# we'll eventually construct an ExtensionArray.
915+
result = np.empty(n, dtype=object)
916+
else:
917+
result = np.empty(n, dtype=dtype)
918+
911919
for blk in self.blocks:
912920
# Such assignment may incorrectly coerce NaT to None
913921
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
914922
for i, rl in enumerate(blk.mgr_locs):
915923
result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
916924

925+
if is_extension_array_dtype(dtype):
926+
result = dtype.construct_array_type()._from_sequence(
927+
result, dtype=dtype
928+
)
929+
917930
return result
918931

919932
def consolidate(self):
@@ -1855,16 +1868,22 @@ def _shape_compat(x):
18551868

18561869

18571870
def _interleaved_dtype(blocks):
1858-
if not len(blocks):
1859-
return None
1871+
# type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]
1872+
"""Find the common dtype for `blocks`.
18601873
1861-
dtype = find_common_type([b.dtype for b in blocks])
1874+
Parameters
1875+
----------
1876+
blocks : List[Block]
18621877
1863-
# only numpy compat
1864-
if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
1865-
dtype = np.object
1878+
Returns
1879+
-------
1880+
dtype : Optional[Union[np.dtype, ExtensionDtype]]
1881+
None is returned when `blocks` is empty.
1882+
"""
1883+
if not len(blocks):
1884+
return None
18661885

1867-
return dtype
1886+
return find_common_type([b.dtype for b in blocks])
18681887

18691888

18701889
def _consolidate(blocks):

pandas/tests/frame/test_dtypes.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -836,8 +836,16 @@ def test_constructor_list_str_na(self, string_dtype):
836836
"B": pd.Categorical(['b', 'c'])}), False),
837837
838838
])
839-
def test_is_homogeneous(self, data, expected):
840-
assert data._is_homogeneous is expected
839+
def test_is_homogeneous_type(self, data, expected):
840+
assert data._is_homogeneous_type is expected
841+
842+
def test_asarray_homogenous(self):
843+
df = pd.DataFrame({"A": pd.Categorical([1, 2]),
844+
"B": pd.Categorical([1, 2])})
845+
result = np.asarray(df)
846+
# may change from object in the future
847+
expected = np.array([[1, 1], [2, 2]], dtype='object')
848+
tm.assert_numpy_array_equal(result, expected)
841849

842850

843851
class TestDataFrameDatetimeWithTZ(TestData):

pandas/tests/indexing/test_indexing.py

+28
Original file line numberDiff line numberDiff line change
@@ -1079,3 +1079,31 @@ def test_validate_indices_high():
10791079
def test_validate_indices_empty():
10801080
with tm.assert_raises_regex(IndexError, "indices are out"):
10811081
validate_indices(np.array([0, 1]), 0)
1082+
1083+
1084+
def test_extension_array_cross_section():
1085+
# A cross-section of a homogeneous EA should be an EA
1086+
df = pd.DataFrame({
1087+
"A": pd.core.arrays.integer_array([1, 2]),
1088+
"B": pd.core.arrays.integer_array([3, 4])
1089+
}, index=['a', 'b'])
1090+
expected = pd.Series(pd.core.arrays.integer_array([1, 3]),
1091+
index=['A', 'B'], name='a')
1092+
result = df.loc['a']
1093+
tm.assert_series_equal(result, expected)
1094+
1095+
result = df.iloc[0]
1096+
tm.assert_series_equal(result, expected)
1097+
1098+
1099+
def test_extension_array_cross_section_converts():
1100+
df = pd.DataFrame({
1101+
"A": pd.core.arrays.integer_array([1, 2]),
1102+
"B": np.array([1, 2]),
1103+
}, index=['a', 'b'])
1104+
result = df.loc['a']
1105+
expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a')
1106+
tm.assert_series_equal(result, expected)
1107+
1108+
result = df.iloc[0]
1109+
tm.assert_series_equal(result, expected)

pandas/tests/indexing/test_multiindex.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -738,8 +738,8 @@ def test_multiindex_contains_dropped(self):
738738
(MultiIndex.from_product([(1, 2), (3, 4)]), True),
739739
(MultiIndex.from_product([('a', 'b'), (1, 2)]), False),
740740
])
741-
def test_multiindex_is_homogeneous(self, data, expected):
742-
assert data._is_homogeneous is expected
741+
def test_multiindex_is_homogeneous_type(self, data, expected):
742+
assert data._is_homogeneous_type is expected
743743

744744

745745
class TestMultiIndexSlicers(object):

pandas/tests/series/test_dtypes.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def test_infer_objects_series(self):
509509
assert actual.dtype == 'object'
510510
tm.assert_series_equal(actual, expected)
511511

512-
def test_is_homogeneous(self):
513-
assert Series()._is_homogeneous
514-
assert Series([1, 2])._is_homogeneous
515-
assert Series(pd.Categorical([1, 2]))._is_homogeneous
512+
def test_is_homogeneous_type(self):
513+
assert Series()._is_homogeneous_type
514+
assert Series([1, 2])._is_homogeneous_type
515+
assert Series(pd.Categorical([1, 2]))._is_homogeneous_type

0 commit comments

Comments
 (0)