From 94099737ddf8b24329755223115668152d68f62f Mon Sep 17 00:00:00 2001 From: Pedro Freitas Date: Mon, 25 Mar 2024 17:36:38 +0000 Subject: [PATCH 1/3] FIX #57645: Cannot use numpy FLS as indices since pandas 2.2.1 While using the function set_index with parameter inplace=True, the function would try and create a new index where its dtype would be an FLS S{value} dtype, which was not recognized by the function _dtype_to_subclass and raised a NotImplementedError. That said, by adding a verification that recognizes FLS dtype, the index is created successfully and the function executes properly. --- doc/source/whatsnew/v3.0.0.rst | 9 +++++++++ pandas/core/indexes/base.py | 4 +++- pandas/tests/frame/methods/test_set_index.py | 14 ++++++++++++++ pandas/tests/io/test_parquet.py | 4 +--- 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7da6d43c732a9..d7aafcd192374 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -378,6 +378,15 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) +- Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) +- Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) +- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) +- Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. 
(:issue:`56976`) +- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Fixed bug in :class:`Index` Index constructor did not allow FLS as indicies. (:issue:`57645`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ebbd85be44009..d32a6393c8edf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -625,7 +625,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): # NB: assuming away MultiIndex return Index - elif issubclass(dtype.type, str) or is_numeric_dtype(dtype): + elif ( + dtype.kind == "S" or issubclass(dtype.type, str) or is_numeric_dtype(dtype) + ): return Index raise NotImplementedError(dtype) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 198cab0e91eab..be8a2dfa20704 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -617,6 +617,20 @@ def test_set_index_raise_on_len( with pytest.raises(ValueError, match=msg): df.set_index(["A", df.A, box(values)], drop=drop, append=append) + def test_set_index_with_FLS_Dtype(self): + string_length = 6 + in_dtype, df_name = f"S{string_length}", "fruit" + data = ["apple", "banana", "orange", "grape"] + + # Create array with FLS(|S{value}) dtype + arr = np.array(data, dtype=in_dtype) + df = DataFrame(Series(arr), columns=[df_name]) + + # This will create a new Index with FLS dtype + expected = Index(data=Series(arr), name=df_name) + df.set_index(df_name, inplace=True) + tm.assert_index_equal(df.index, expected) + class TestSetIndexCustomLabelType: def test_set_index_custom_label_type(self): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2860b3a6483af..398561f22d2cb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1027,9 +1027,7 
@@ def test_columns_dtypes_not_invalid(self, pa): # bytes df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) + check_round_trip(df, pa) # python object df.columns = [ From ff0b740f0139da67c51595ac4cbe6c296449caf8 Mon Sep 17 00:00:00 2001 From: Pedro Freitas Date: Mon, 25 Mar 2024 22:00:27 +0000 Subject: [PATCH 2/3] sort whatsnew entries alphabetically --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d7aafcd192374..4b41f9a6b9d69 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -378,6 +378,7 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fixed bug in :class:`Index` Index constructor did not allow FLS as indices. (:issue:`57645`) - Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) @@ -386,7 +387,6 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) -- Fixed bug in :class:`Index` Index constructor did not allow FLS as indicies. 
(:issue:`57645`) Categorical ^^^^^^^^^^^ From aaa456367e41c007aa152600521ec9b28ceca8c1 Mon Sep 17 00:00:00 2001 From: Pedro Freitas Date: Fri, 17 May 2024 09:58:12 +0100 Subject: [PATCH 3/3] While using the function set_index with the parameter inplace=True, the function attempted to create a new index with a dtype of FLS S{value}. This dtype was not recognized by the function _dtype_to_subclass, which raised a NotImplementedError. To address this, I added a verification to the function asarray_tuplesafe that converts data to an array with object type, allowing the index to be created successfully. Additionally, I created a new test and simplified a previously created test. I also reverted the test file test_parquet.py to restore the intended FLS behavior. --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/common.py | 2 +- pandas/core/indexes/base.py | 4 +--- pandas/tests/frame/methods/test_set_index.py | 15 ++++++--------- pandas/tests/indexes/test_index_new.py | 12 ++++++++++++ pandas/tests/io/test_parquet.py | 4 +++- 6 files changed, 24 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4b41f9a6b9d69..f76e0d1458731 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -378,7 +378,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Fixed bug in :class:`Index` Index constructor did not allow FLS as indices. (:issue:`57645`) +- Fixed bug in :class:`Index` Index constructor was not converting FLS to object. (:issue:`57645`) - Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. 
(:issue:`16098`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 77e986a26fbe9..bf270efab8c5b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -236,7 +236,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values - elif isinstance(values, ABCSeries): + elif isinstance(values, ABCSeries) and values.dtype.kind != "S": return values._values if isinstance(values, list) and dtype in [np.object_, object]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d32a6393c8edf..ebbd85be44009 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -625,9 +625,7 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): # NB: assuming away MultiIndex return Index - elif ( - dtype.kind == "S" or issubclass(dtype.type, str) or is_numeric_dtype(dtype) - ): + elif issubclass(dtype.type, str) or is_numeric_dtype(dtype): return Index raise NotImplementedError(dtype) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index be8a2dfa20704..5dca06afe72d6 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -618,17 +618,14 @@ def test_set_index_raise_on_len( df.set_index(["A", df.A, box(values)], drop=drop, append=append) def test_set_index_with_FLS_Dtype(self): - string_length = 6 - in_dtype, df_name = f"S{string_length}", "fruit" - data = ["apple", "banana", "orange", "grape"] + arr = np.array(["apple", "banana", "orange", "grape"], dtype="S6") - # Create array with FLS(|S{value}) dtype - arr = np.array(data, dtype=in_dtype) - df = DataFrame(Series(arr), columns=[df_name]) + # Attempt to create a DataFrame with an array with FLS Dtype + df = DataFrame(Series(arr), columns=["fruits"]) - # This will create a new Index with FLS dtype - expected = Index(data=Series(arr), name=df_name) - 
df.set_index(df_name, inplace=True) + # Create Index that converts FLS Dtype to object + expected = Index(data=Series(arr), name="fruits") + df.set_index("fruits", inplace=True) tm.assert_index_equal(df.index, expected) diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..073f99d938bcd 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -185,6 +185,18 @@ def test_constructor_datetimes_mixed_tzs(self): expected = Index([dt1, dt2], dtype=object) tm.assert_index_equal(result, expected) + def test_FLS_to_object_conversion(self): + # Create NumPy array of fixed-length strings + arr = np.array(["apple", "banana", "orange", "grape"], dtype="S6") + # Create expected array for index + expected_arr = np.array( + [b"apple", b"banana", b"orange", b"grape"], dtype=object + ) + # Create Index that converts FLS Dtype to object + index = Index(data=Series(arr), name="fruits") + expected = Index(data=Series(expected_arr), name="fruits") + tm.assert_index_equal(index, expected) + class TestDtypeEnforced: # check we don't silently ignore the dtype keyword diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 398561f22d2cb..2860b3a6483af 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1027,7 +1027,9 @@ def test_columns_dtypes_not_invalid(self, pa): # bytes df.columns = [b"foo", b"bar"] - check_round_trip(df, pa) + with pytest.raises(NotImplementedError, match="|S3"): + # Bytes fails on read_parquet + check_round_trip(df, pa) # python object df.columns = [