From 90c76cb55f1c8aef8ccae7b4cc55d384a4cf1512 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 06:50:59 -0500 Subject: [PATCH 1/4] ENH: is_homogenous --- pandas/core/base.py | 15 +++++++++++++ pandas/core/frame.py | 28 ++++++++++++++++++++++++ pandas/core/indexes/multi.py | 20 +++++++++++++++++ pandas/tests/frame/test_dtypes.py | 24 ++++++++++++++++++++ pandas/tests/indexing/test_multiindex.py | 8 +++++++ pandas/tests/series/test_dtypes.py | 5 +++++ 6 files changed, 100 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index d831dc69338bd..8cd4fbbe7463c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -663,6 +663,21 @@ def transpose(self, *args, **kwargs): T = property(transpose, doc="return the transpose, which is by " "definition self") + @property + def is_homogeneous(self): + """Whether the object is homogeneous. + + By definition, Series and Index are always considered homogeneous. + A MultiIndex may or may not be homogeneous, depending on the + dtypes of the levels. + + See Also + -------- + DataFrame.is_homogenous + MultiIndex.is_homogenous + """ + return True + @property def shape(self): """ return a tuple of the shape of the underlying data """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb221ced9e6bd..928741ae2546e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -613,6 +613,34 @@ def shape(self): """ return len(self.index), len(self.columns) + @property + def is_homogeneous(self): + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]}).is_homogeneous + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]}).is_homogeneous + False + + Items with the type but different sizes are considered different + types. + + >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)}).is_homogeneous + False + """ + if self._data.any_extension_types: + return len({block.dtype for block in self._data.blocks}) == 1 + else: + return not self._data.is_mixed_type + def _repr_fits_vertical_(self): """ Check length against max_rows. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a7932f667f6de..5e2ed57c420d6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -288,6 +288,26 @@ def _verify_integrity(self, labels=None, levels=None): def levels(self): return self._levels + @property + def is_homogeneous(self): + """Whether the levels of a MultiIndex are homogenous. + + This looks at the dtypes of the levels. + + See Also + -------- + Index.is_homogenous + DataFrame.is_homogenous + + Examples + -------- + >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')]).is_homogeneous + True + >>> MultiIndex.from_tuples([('a', 1), ('a', 2)]).is_homogeneous + False + """ + return len(set(x.dtype for x in self.levels)) <= 1 + def _set_levels(self, levels, level=None, copy=False, validate=True, verify_integrity=False): # This is NOT part of the levels property because it should be diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 3b3ab3d03dce9..eb04254fcc397 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -815,6 +815,30 @@ def test_constructor_list_str_na(self, string_dtype): expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) assert_frame_equal(result, expected) + @pytest.mark.parametrize("data, expected", [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + (DataFrame({"A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object)}), True), + # multi-extension + (DataFrame({"A": pd.Categorical(['a', 'b']), + "B": pd.Categorical(['a', 'b'])}), True), + # differ types + (DataFrame({"A": [1, 2], "B": [1., 2.]}), False), + # differ sizes + (DataFrame({"A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64)}), False), + # multi-extension differ + (DataFrame({"A": pd.Categorical(['a', 'b']), + "B": pd.Categorical(['b', 'c'])}), False), + + ]) + def test_is_homogeneous(self, data, expected): + assert data.is_homogeneous is expected + class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 9e66dfad3ddc7..53d10a96bfbdf 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -733,6 +733,14 @@ def test_multiindex_contains_dropped(self): assert 'a' in idx.levels[0] assert 'a' not in idx + @pytest.mark.parametrize("data, expected", [ + (MultiIndex.from_product([(), ()]), True), + (MultiIndex.from_product([(1, 2), (3, 4)]), True), + (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), + ]) + def test_multiindex_is_homogeneous(self, data, expected): + assert data.is_homogeneous is expected + class TestMultiIndexSlicers(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 7aecaf340a3e0..0990579633857 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -508,3 +508,8 @@ def test_infer_objects_series(self): assert actual.dtype == 'object' tm.assert_series_equal(actual, expected) + + def test_is_homogeneous(self): + assert Series().is_homogeneous + assert Series([1, 2]).is_homogeneous + assert Series(pd.Categorical([1, 2])).is_homogeneous From a5fef742c5de655c4998da5eb73d9509e8a1c569 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 07:54:08 -0500 Subject: [PATCH 2/4] private --- pandas/core/base.py | 8 ++++---- pandas/core/frame.py | 8 ++++---- pandas/core/indexes/multi.py | 12 ++++++------ pandas/tests/frame/test_dtypes.py | 2 +- pandas/tests/indexing/test_multiindex.py | 2 +- pandas/tests/series/test_dtypes.py | 6 +++--- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8cd4fbbe7463c..26fea89b45ae1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -664,8 +664,8 @@ def transpose(self, *args, **kwargs): "definition self") @property - def is_homogeneous(self): - """Whether the object is homogeneous. + def _is_homogeneous(self): + """Whether the object has a single dtype. By definition, Series and Index are always considered homogeneous. A MultiIndex may or may not be homogeneous, depending on the @@ -673,8 +673,8 @@ def is_homogeneous(self): See Also -------- - DataFrame.is_homogenous - MultiIndex.is_homogenous + DataFrame._is_homogeneous + MultiIndex._is_homogeneous """ return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 928741ae2546e..8e7b3270bda2f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -614,7 +614,7 @@ def shape(self): return len(self.index), len(self.columns) @property - def is_homogeneous(self): + def _is_homogeneous(self): """ Whether all the columns in a DataFrame have the same type. @@ -624,16 +624,16 @@ def is_homogeneous(self): Examples -------- - >>> DataFrame({"A": [1, 2], "B": [3, 4]}).is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous True - >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]}).is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous False Items with the type but different sizes are considered different types. >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)}).is_homogeneous + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous False """ if self._data.any_extension_types: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5e2ed57c420d6..c0d5bf5c7a08e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -289,21 +289,21 @@ def levels(self): return self._levels @property - def is_homogeneous(self): - """Whether the levels of a MultiIndex are homogenous. + def _is_homogeneous(self): + """Whether the levels of a MultiIndex all have the same dtype. This looks at the dtypes of the levels. See Also -------- - Index.is_homogenous - DataFrame.is_homogenous + Index._is_homogeneous + DataFrame._is_homogeneous Examples -------- - >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')]).is_homogeneous + >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous True - >>> MultiIndex.from_tuples([('a', 1), ('a', 2)]).is_homogeneous + >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous False """ return len(set(x.dtype for x in self.levels)) <= 1 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index eb04254fcc397..ca4bd64659e06 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -837,7 +837,7 @@ def test_constructor_list_str_na(self, string_dtype): ]) def test_is_homogeneous(self, data, expected): - assert data.is_homogeneous is expected + assert data._is_homogeneous is expected class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 53d10a96bfbdf..aefa8badf72e7 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -739,7 +739,7 @@ def test_multiindex_contains_dropped(self): (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), ]) def test_multiindex_is_homogeneous(self, data, expected): - assert data.is_homogeneous is expected + assert data._is_homogeneous is expected class TestMultiIndexSlicers(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 0990579633857..83a458eedbd93 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -510,6 +510,6 @@ def test_infer_objects_series(self): tm.assert_series_equal(actual, expected) def test_is_homogeneous(self): - assert Series().is_homogeneous - assert Series([1, 2]).is_homogeneous - assert Series(pd.Categorical([1, 2])).is_homogeneous + assert Series()._is_homogeneous + assert Series([1, 2])._is_homogeneous + assert Series(pd.Categorical([1, 2]))._is_homogeneous From 528bbd1e2456756b6ccab5a2bfc1afc77d33d192 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 09:39:35 -0500 Subject: [PATCH 3/4] set comprehension --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c0d5bf5c7a08e..ad38f037b6578 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -306,7 +306,7 @@ def _is_homogeneous(self): >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous False """ - return len(set(x.dtype for x in self.levels)) <= 1 + return len({x.dtype for x in self.levels}) <= 1 def _set_levels(self, levels, level=None, copy=False, validate=True, verify_integrity=False): From 332dbcab41019d6557e3dc4070f01af47bbcd2e9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 11:22:14 -0500 Subject: [PATCH 4/4] fixed typo --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e7b3270bda2f..959b0a4fd1890 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -629,8 +629,8 @@ def _is_homogeneous(self): >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous False - Items with the type but different sizes are considered different - types. + Items with the same type but different sizes are considered + different types. >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous