From 74995402c0273cce7fd883e701b6e1d9b214f401 Mon Sep 17 00:00:00 2001 From: Kevin Date: Thu, 10 Aug 2023 12:28:15 -0700 Subject: [PATCH 1/3] Changed read_csv type hint for na_values and added tests Issue#53813 --- pandas/io/parsers/readers.py | 26 +++++++++--- pandas/tests/io/parser/test_na_values.py | 51 ++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7fad2b779ab28..d8ea3c10d6f90 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -65,6 +65,7 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, + Iterable, Mapping, Sequence, ) @@ -637,7 +638,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -694,7 +698,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -751,7 +758,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -808,7 +818,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -878,7 +891,10 @@ def read_csv( skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9a16ec5a50d36..35d8171a7f203 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -20,6 +20,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -38,6 +39,7 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -93,6 +95,7 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -295,6 +298,7 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_no_na_values_no_keep_default(all_parsers): # see gh-4318: passing na_values=None and # keep_default_na=False yields 'None" as a na_value @@ -498,6 +502,7 @@ def test_na_values_uint64(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@skip_pyarrow def test_empty_na_values_no_default_with_index(all_parsers): # see gh-15835 data = "a,1\nb,2" @@ -646,6 +651,7 @@ def test_bool_and_nan_to_bool(all_parsers): parser.read_csv(StringIO(data), dtype="bool") +@skip_pyarrow def test_bool_and_nan_to_int(all_parsers): # GH#42808 parser = all_parsers @@ -658,6 +664,7 @@ def test_bool_and_nan_to_int(all_parsers): parser.read_csv(StringIO(data), dtype="int") +@skip_pyarrow def test_bool_and_nan_to_float(all_parsers): # GH#42808 parser = all_parsers @@ -669,3 +676,47 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_boolean_na_values(all_parsers): + # GH#53813 + parser = all_parsers + names = ["0"] + data = """True\nFalse""" + result = parser.read_csv(StringIO(data), names=names, na_values=True) + expected = DataFrame.from_dict({"0": [np.nan, False]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_tuple_na_values(all_parsers): + # GH#53813 + parser = all_parsers + names = ["0", "1"] + data = "1, 2\n3, 4" + result = parser.read_csv(StringIO(data), names=names, na_values=(1, 3)) + expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_list_na_values(all_parsers): + # GH#53813 + parser = all_parsers + names = ["0", "1"] + data = "1, 2\n3, 4" + result = parser.read_csv(StringIO(data), names=names, na_values=[1, 3]) + expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_dict_na_values(all_parsers): + # GH#53813 + parser = all_parsers + names = ["0", "1"] + data = "1, 2\n3, 4" + result = parser.read_csv(StringIO(data), names=names, na_values={0: [1, 3]}) + expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) + tm.assert_frame_equal(result, expected) From 9712e04edcda8696eb1ad64cf6227733b3b925b2 Mon Sep 17 00:00:00 2001 From: Kevin Date: Tue, 15 Aug 2023 13:41:02 -0700 Subject: [PATCH 2/3] mend --- pandas/io/parsers/readers.py | 11 +++++------ pandas/tests/io/parser/test_na_values.py | 19 +------------------ 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d8ea3c10d6f90..d3b5125042f66 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -641,8 +641,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., + | None = None, na_filter: bool = ..., verbose: bool = ..., skip_blank_lines: bool = ..., @@ -701,7 +700,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = ..., + | None = None, keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -761,7 +760,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = ..., + | None = None, keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -821,7 +820,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = ..., + | None = None, keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -894,7 +893,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = ..., + | None = None, keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 35d8171a7f203..be92525971664 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -20,7 +20,6 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -39,7 +38,6 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -95,7 +93,6 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -298,7 +295,6 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_no_na_values_no_keep_default(all_parsers): # see gh-4318: passing na_values=None and # keep_default_na=False yields 'None" as a na_value @@ -502,7 +498,6 @@ def test_na_values_uint64(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_empty_na_values_no_default_with_index(all_parsers): # see gh-15835 data = "a,1\nb,2" @@ -664,7 +659,6 @@ def test_bool_and_nan_to_int(all_parsers): parser.read_csv(StringIO(data), dtype="int") -@skip_pyarrow def test_bool_and_nan_to_float(all_parsers): # GH#42808 parser = all_parsers @@ -678,17 +672,6 @@ def test_bool_and_nan_to_float(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_boolean_na_values(all_parsers): - # GH#53813 - parser = all_parsers - names = ["0"] - data = """True\nFalse""" - result = parser.read_csv(StringIO(data), names=names, na_values=True) - expected = DataFrame.from_dict({"0": [np.nan, False]}) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow def test_tuple_na_values(all_parsers): # GH#53813 @@ -705,7 +688,7 @@ def test_list_na_values(all_parsers): # GH#53813 parser = all_parsers names = ["0", "1"] - data = "1, 2\n3, 4" + data = "1,2\n3,4" result = parser.read_csv(StringIO(data), names=names, na_values=[1, 3]) expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) tm.assert_frame_equal(result, expected) From 399ed511dfb01cb43c581c643c9c44e14d0ed082 Mon Sep 17 00:00:00 2001 From: Kevin Date: Sat, 19 Aug 2023 21:12:49 -0700 Subject: [PATCH 3/3] Changed read_csv type hint for na_values and added tests Issue#53813 --- pandas/io/parsers/readers.py | 8 +++--- pandas/tests/io/parser/test_na_values.py | 34 ------------------------ 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d3b5125042f66..8f37721541fa0 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -641,7 +641,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = None, + | None = ..., na_filter: bool = ..., verbose: bool = ..., skip_blank_lines: bool = ..., @@ -700,7 +700,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = None, + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -760,7 +760,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = None, + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -820,7 +820,7 @@ def read_csv( na_values: Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] - | None = None, + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index be92525971664..9a16ec5a50d36 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -646,7 +646,6 @@ def test_bool_and_nan_to_bool(all_parsers): parser.read_csv(StringIO(data), dtype="bool") -@skip_pyarrow def test_bool_and_nan_to_int(all_parsers): # GH#42808 parser = all_parsers @@ -670,36 +669,3 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_tuple_na_values(all_parsers): - # GH#53813 - parser = all_parsers - names = ["0", "1"] - data = "1, 2\n3, 4" - result = parser.read_csv(StringIO(data), names=names, na_values=(1, 3)) - expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_list_na_values(all_parsers): - # GH#53813 - parser = all_parsers - names = ["0", "1"] - data = "1,2\n3,4" - result = parser.read_csv(StringIO(data), names=names, na_values=[1, 3]) - expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) - tm.assert_frame_equal(result, expected) - - -@skip_pyarrow -def test_dict_na_values(all_parsers): - # GH#53813 - parser = all_parsers - names = ["0", "1"] - data = "1, 2\n3, 4" - result = parser.read_csv(StringIO(data), names=names, na_values={0: [1, 3]}) - expected = DataFrame.from_dict({"0": [np.nan, np.nan], "1": [2, 4]}) - tm.assert_frame_equal(result, expected)