Preserve Extension type on cross section (#22785)

TomAugspurger · web-flow · commit 9df8065ab3c1 · 2018-09-26T09:27:52.000-05:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -500,6 +500,7 @@ ExtensionType Changes
 - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
 - :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
 - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`).
+- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
 
 .. _whatsnew_0240.api.incompatibilities:
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -664,7 +664,7 @@ def transpose(self, *args, **kwargs):
                                 "definition self")
 
     @property
-    def _is_homogeneous(self):
+    def _is_homogeneous_type(self):
         """Whether the object has a single dtype.
 
         By definition, Series and Index are always considered homogeneous.
@@ -673,8 +673,8 @@ def _is_homogeneous(self):
 
         See Also
         --------
-        DataFrame._is_homogeneous
-        MultiIndex._is_homogeneous
+        DataFrame._is_homogeneous_type
+        MultiIndex._is_homogeneous_type
         """
         return True
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -614,7 +614,7 @@ def shape(self):
         return len(self.index), len(self.columns)
 
     @property
-    def _is_homogeneous(self):
+    def _is_homogeneous_type(self):
         """
         Whether all the columns in a DataFrame have the same type.
 
@@ -624,16 +624,17 @@ def _is_homogeneous(self):
 
         Examples
         --------
-        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous
+        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
         True
-        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous
+        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
         False
 
         Items with the same type but different sizes are considered
         different types.
 
-        >>> DataFrame({"A": np.array([1, 2], dtype=np.int32),
-        ...            "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous
+        >>> DataFrame({
+        ...    "A": np.array([1, 2], dtype=np.int32),
+        ...    "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
         False
         """
         if self._data.any_extension_types:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -289,21 +289,23 @@ def levels(self):
         return self._levels
 
     @property
-    def _is_homogeneous(self):
+    def _is_homogeneous_type(self):
         """Whether the levels of a MultiIndex all have the same dtype.
 
         This looks at the dtypes of the levels.
 
         See Also
         --------
-        Index._is_homogeneous
-        DataFrame._is_homogeneous
+        Index._is_homogeneous_type
+        DataFrame._is_homogeneous_type
 
         Examples
         --------
-        >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous
+        >>> MultiIndex.from_tuples([
+        ...     ('a', 'b'), ('a', 'c')])._is_homogeneous_type
         True
-        >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous
+        >>> MultiIndex.from_tuples([
+        ...     ('a', 1), ('a', 2)])._is_homogeneous_type
         False
         """
         return len({x.dtype for x in self.levels}) <= 1
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -12,9 +12,6 @@
 from pandas.util._validators import validate_bool_kwarg
 from pandas.compat import range, map, zip
 
-from pandas.core.dtypes.dtypes import (
-    ExtensionDtype,
-    PandasExtensionDtype)
 from pandas.core.dtypes.common import (
     _NS_DTYPE,
     is_datetimelike_v_numeric,
@@ -791,6 +788,11 @@ def _interleave(self):
         """
         dtype = _interleaved_dtype(self.blocks)
 
+        if is_extension_array_dtype(dtype):
+            # TODO: https://github.com/pandas-dev/pandas/issues/22791
+            # Give EAs some input on what happens here. Sparse needs this.
+            dtype = 'object'
+
         result = np.empty(self.shape, dtype=dtype)
 
         if result.shape[0] == 0:
@@ -906,14 +908,25 @@ def fast_xs(self, loc):
 
         # unique
         dtype = _interleaved_dtype(self.blocks)
+
         n = len(items)
-        result = np.empty(n, dtype=dtype)
+        if is_extension_array_dtype(dtype):
+            # fill an object buffer; converted to an ExtensionArray below.
+            result = np.empty(n, dtype=object)
+        else:
+            result = np.empty(n, dtype=dtype)
+
         for blk in self.blocks:
             # Such assignment may incorrectly coerce NaT to None
             # result[blk.mgr_locs] = blk._slice((slice(None), loc))
             for i, rl in enumerate(blk.mgr_locs):
                 result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
 
+        if is_extension_array_dtype(dtype):
+            result = dtype.construct_array_type()._from_sequence(
+                result, dtype=dtype
+            )
+
         return result
 
     def consolidate(self):
@@ -1855,16 +1868,22 @@ def _shape_compat(x):
 
 
 def _interleaved_dtype(blocks):
-    if not len(blocks):
-        return None
+    # type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]
+    """Find the common dtype for `blocks`.
 
-    dtype = find_common_type([b.dtype for b in blocks])
+    Parameters
+    ----------
+    blocks : List[Block]
 
-    # only numpy compat
-    if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
-        dtype = np.object
+    Returns
+    -------
+    dtype : Optional[Union[np.dtype, ExtensionDtype]]
+        None is returned when `blocks` is empty.
+    """
+    if not len(blocks):
+        return None
 
-    return dtype
+    return find_common_type([b.dtype for b in blocks])
 
 
 def _consolidate(blocks):
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
@@ -836,8 +836,16 @@ def test_constructor_list_str_na(self, string_dtype):
                     "B": pd.Categorical(['b', 'c'])}), False),
 
     ])
-    def test_is_homogeneous(self, data, expected):
-        assert data._is_homogeneous is expected
+    def test_is_homogeneous_type(self, data, expected):
+        assert data._is_homogeneous_type is expected
+
+    def test_asarray_homogenous(self):
+        df = pd.DataFrame({"A": pd.Categorical([1, 2]),
+                           "B": pd.Categorical([1, 2])})
+        result = np.asarray(df)
+        # may change from object in the future
+        expected = np.array([[1, 1], [2, 2]], dtype='object')
+        tm.assert_numpy_array_equal(result, expected)
 
 
 class TestDataFrameDatetimeWithTZ(TestData):
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -1079,3 +1079,31 @@ def test_validate_indices_high():
 def test_validate_indices_empty():
     with tm.assert_raises_regex(IndexError, "indices are out"):
         validate_indices(np.array([0, 1]), 0)
+
+
+def test_extension_array_cross_section():
+    # A cross-section of a frame with a single EA dtype should be an EA
+    df = pd.DataFrame({
+        "A": pd.core.arrays.integer_array([1, 2]),
+        "B": pd.core.arrays.integer_array([3, 4])
+    }, index=['a', 'b'])
+    expected = pd.Series(pd.core.arrays.integer_array([1, 3]),
+                         index=['A', 'B'], name='a')
+    result = df.loc['a']
+    tm.assert_series_equal(result, expected)
+
+    result = df.iloc[0]
+    tm.assert_series_equal(result, expected)
+
+
+def test_extension_array_cross_section_converts():
+    df = pd.DataFrame({
+        "A": pd.core.arrays.integer_array([1, 2]),
+        "B": np.array([1, 2]),
+    }, index=['a', 'b'])
+    result = df.loc['a']
+    expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a')
+    tm.assert_series_equal(result, expected)
+
+    result = df.iloc[0]
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py
@@ -738,8 +738,8 @@ def test_multiindex_contains_dropped(self):
         (MultiIndex.from_product([(1, 2), (3, 4)]), True),
         (MultiIndex.from_product([('a', 'b'), (1, 2)]), False),
     ])
-    def test_multiindex_is_homogeneous(self, data, expected):
-        assert data._is_homogeneous is expected
+    def test_multiindex_is_homogeneous_type(self, data, expected):
+        assert data._is_homogeneous_type is expected
 
 
 class TestMultiIndexSlicers(object):
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -509,7 +509,7 @@ def test_infer_objects_series(self):
         assert actual.dtype == 'object'
         tm.assert_series_equal(actual, expected)
 
-    def test_is_homogeneous(self):
-        assert Series()._is_homogeneous
-        assert Series([1, 2])._is_homogeneous
-        assert Series(pd.Categorical([1, 2]))._is_homogeneous
+    def test_is_homogeneous_type(self):
+        assert Series()._is_homogeneous_type
+        assert Series([1, 2])._is_homogeneous_type
+        assert Series(pd.Categorical([1, 2]))._is_homogeneous_type