From ec39d9eba8d6933f7d35e026fc61f6582e365d43 Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Mon, 9 Sep 2024 00:01:55 -0700 Subject: [PATCH 1/8] fixed GH#59303 --- pandas/io/parsers/readers.py | 2 +- pandas/tests/io/parser/test_na_values.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6e933f94cf0ba..2b17f20536d4f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T if keep_default_na: v = set(v) | STR_NA_VALUES - na_values[k] = v + na_values[k] = _stringify_na_values(v, floatify) na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 360a5feebe073..b5bd66c053eee 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -812,3 +812,22 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + +@xfail_pyarrow +def test_na_values_dict_without_dtype(all_parsers): + # GH#59303 + parser = all_parsers + data = """A +-99 +-99 +-99.0 +-99.0""" + # this would FAIL BEFORE this fix + result_1 = parser.read_csv(StringIO(data), na_values={"A": [-99.0, -99]}) + expected_1 = DataFrame.from_dict({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result_1, expected_1) + + # this would PASS even BEFORE this fix + result_2 = parser.read_csv(StringIO(data), na_values={"A": [-99, -99.0]}) + expected_2 = DataFrame.from_dict({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result_2, expected_2) \ No newline at end of file From 24b158d6622cb925adf51fc11a93111a70976cf7 Mon Sep 17 00:00:00 
2001 From: sshu2017 Date: Mon, 9 Sep 2024 00:21:37 -0700 Subject: [PATCH 2/8] pre-commit done --- pandas/tests/io/parser/test_na_values.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index b5bd66c053eee..39f233c683570 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -813,6 +813,7 @@ def test_bool_and_nan_to_float(all_parsers): expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + @xfail_pyarrow def test_na_values_dict_without_dtype(all_parsers): # GH#59303 @@ -830,4 +831,4 @@ def test_na_values_dict_without_dtype(all_parsers): # this would PASS even BEFORE this fix result_2 = parser.read_csv(StringIO(data), na_values={"A": [-99, -99.0]}) expected_2 = DataFrame.from_dict({"A": [np.nan, np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(result_2, expected_2) \ No newline at end of file + tm.assert_frame_equal(result_2, expected_2) From f2b8c867b9e16a8302a552a7bba72330e7374d0f Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Mon, 9 Sep 2024 00:33:27 -0700 Subject: [PATCH 3/8] updated v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f25edd39cf7da..b7ab6bd57e96b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -695,6 +695,7 @@ Other - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. 
(:issue:`58425`) +- Bug in :meth:`_clean_na_values` in :class:`TextFileReader` that was not properly handling ``na_values`` when it is a list of strings. (:issue:`59303`) .. ***DO NOT USE THIS SECTION*** From d97418cb770d4f50576abf7e7b9ed7caa4fd6e62 Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Mon, 9 Sep 2024 00:44:44 -0700 Subject: [PATCH 4/8] sort my entry in v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b7ab6bd57e96b..4c871e687a715 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -693,9 +693,9 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`_clean_na_values` in :class:`TextFileReader` that was not properly handling ``na_values`` when it is a list of strings. (:issue:`59303`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) -- Bug in :meth:`_clean_na_values` in :class:`TextFileReader` that was not properly handling ``na_values`` when it is a list of strings. (:issue:`59303`) .. 
***DO NOT USE THIS SECTION*** From f6c1c94dfd5949803fd69190fd5c779547ca6c92 Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Tue, 10 Sep 2024 17:09:05 -0700 Subject: [PATCH 5/8] changes based on comments on PR --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/io/parser/test_na_values.py | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4c871e687a715..70fcc45e7be1c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -618,6 +618,7 @@ I/O - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) @@ -693,7 +694,6 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present.
(:issue:`56599`) -- Bug in :meth:`_clean_na_values` in :class:`TextFileReader` that was not properly handling ``na_values`` when it is a list of strings. (:issue:`59303`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 39f233c683570..4970cf9f9950c 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -815,20 +815,21 @@ def test_bool_and_nan_to_float(all_parsers): @xfail_pyarrow -def test_na_values_dict_without_dtype(all_parsers): - # GH#59303 +@pytest.mark.parametrize( + "na_values, expected_result, test_id", + [ + ({"A": [-99.0, -99]}, DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), "float_first"), + ({"A": [-99, -99.0]}, DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), "int_first"), + ], + ids=["float_first", "int_first"] +) +def test_na_values_dict_without_dtype(all_parsers, na_values, expected_result, test_id): parser = all_parsers data = """A -99 -99 -99.0 -99.0""" - # this would FAIL BEFORE this fix - result_1 = parser.read_csv(StringIO(data), na_values={"A": [-99.0, -99]}) - expected_1 = DataFrame.from_dict({"A": [np.nan, np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(result_1, expected_1) - - # this would PASS even BEFORE this fix - result_2 = parser.read_csv(StringIO(data), na_values={"A": [-99, -99.0]}) - expected_2 = DataFrame.from_dict({"A": [np.nan, np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(result_2, expected_2) + + result = parser.read_csv(StringIO(data), na_values=na_values) + tm.assert_frame_equal(result, expected_result) From 0fcf3c236589359492632deefcd1712f7e216b46 Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Tue, 10 Sep 2024 18:04:49 -0700 Subject: [PATCH 6/8] 
reformat long lines --- pandas/tests/io/parser/test_na_values.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 4970cf9f9950c..3b8de6123571e 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -818,8 +818,12 @@ def test_bool_and_nan_to_float(all_parsers): @pytest.mark.parametrize( "na_values, expected_result, test_id", [ - ({"A": [-99.0, -99]}, DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), "float_first"), - ({"A": [-99, -99.0]}, DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), "int_first"), + ({"A": [-99.0, -99]}, + DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), + "float_first"), + ({"A": [-99, -99.0]}, + DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), + "int_first"), ], ids=["float_first", "int_first"] ) From d3b26eaef6e8e9acae2ebe3437185137f0482bbe Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Tue, 10 Sep 2024 20:10:28 -0700 Subject: [PATCH 7/8] reformat test_na_values.py --- pandas/tests/io/parser/test_na_values.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 3b8de6123571e..63e130d350080 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -818,14 +818,18 @@ def test_bool_and_nan_to_float(all_parsers): @pytest.mark.parametrize( "na_values, expected_result, test_id", [ - ({"A": [-99.0, -99]}, - DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), - "float_first"), - ({"A": [-99, -99.0]}, - DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), - "int_first"), + ( + {"A": [-99.0, -99]}, + DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), + "float_first", + ), + ( + {"A": [-99, -99.0]}, + DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), + "int_first", + ), ], - ids=["float_first", "int_first"] + 
ids=["float_first", "int_first"], ) def test_na_values_dict_without_dtype(all_parsers, na_values, expected_result, test_id): parser = all_parsers From 9b3df1ae3ce28ed067cf5ec1cfc76d3ccd33762e Mon Sep 17 00:00:00 2001 From: sshu2017 Date: Wed, 11 Sep 2024 15:51:26 -0700 Subject: [PATCH 8/8] reformat test_na_values.py again --- pandas/tests/io/parser/test_na_values.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 63e130d350080..b612e60c959b1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -816,22 +816,10 @@ def test_bool_and_nan_to_float(all_parsers): @xfail_pyarrow @pytest.mark.parametrize( - "na_values, expected_result, test_id", - [ - ( - {"A": [-99.0, -99]}, - DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), - "float_first", - ), - ( - {"A": [-99, -99.0]}, - DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}), - "int_first", - ), - ], - ids=["float_first", "int_first"], + "na_values", + [[-99.0, -99], [-99, -99.0]], ) -def test_na_values_dict_without_dtype(all_parsers, na_values, expected_result, test_id): +def test_na_values_dict_without_dtype(all_parsers, na_values): parser = all_parsers data = """A -99 @@ -840,4 +828,5 @@ def test_na_values_dict_without_dtype(all_parsers, na_values, expected_result, t -99.0""" result = parser.read_csv(StringIO(data), na_values=na_values) - tm.assert_frame_equal(result, expected_result) + expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected)