From a2c0163d0e5fc81022d48b5ef96bbc0436fb35b6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:53:41 -0500 Subject: [PATCH 1/6] test reorg --- pandas/tests/io/parser/test_dtypes.py | 956 +++++++++++++------------- 1 file changed, 472 insertions(+), 484 deletions(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1e68e54b413b0..d8f819d896e55 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -17,589 +17,577 @@ import pandas._testing as tm -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) +class TestParserDtypesBasic: + @pytest.mark.parametrize("dtype", [str, object]) + @pytest.mark.parametrize("check_orig", [True, False]) + def test_dtype_all_columns(self, all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) -def test_dtype_all_columns_empty(all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) + result = parser.read_csv(path, dtype=dtype, index_col=0) - expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) - tm.assert_frame_equal(result, expected) + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + tm.assert_frame_equal(result, expected) -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) - expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ + def test_invalid_dtype_per_column(self, all_parsers): + parser = all_parsers + data = """\ one,two 1,2.5 2,3.5 3,4.5 4,5.5""" - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) - - -@pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], -) -def test_categorical_dtype(all_parsers, dtype): - # see gh-10153 - parser = 
all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + def test_raise_on_passed_int_dtype_with_nas(self, all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a + 2001,106380451,10 + 2001,,11 + 2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True + ) + + def test_dtype_with_converters(self, all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) + def test_numeric_dtype(self, all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + def test_boolean_dtype(self, all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c +class TestParserDtypesCategorical1: + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], + ) + def test_categorical_dtype(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) + def test_categorical_dtype_single(self, all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + actual = 
parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) -def test_categorical_dtype_unsorted(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_unsorted(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,b,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) -def test_categorical_dtype_missing(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c + def test_categorical_dtype_missing(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c 1,b,3.4 1,nan,3.4 2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_latin1(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_utf16(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) + def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" - actual = parser.read_csv(pth, sep=sep, encoding=encoding, 
dtype="category") - tm.assert_frame_equal(actual, expected) + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) -def test_categorical_dtype_chunksize_infer_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + def test_categorical_dtype_chunksize_infer_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_chunksize_explicit_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], -) -def test_categorical_category_dtype(all_parsers, categories, ordered): - parser = all_parsers - data = """a,b + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + with parser.read_csv( + StringIO(data), dtype={"b": dtype}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +class TestParserDtypesCategorical2: + def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv( + pth, header=None, encoding=encoding, dtype={1: "category"} + ) + tm.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], + ) + def test_categorical_category_dtype(self, all_parsers, categories, ordered): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", 
"c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_categorical_category_dtype_unsorted(all_parsers): - parser = all_parsers - data = """a,b + def test_categorical_category_dtype_unsorted(self, all_parsers): + parser = all_parsers + data = """a,b 1,a 1,b 1,b 2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_numeric(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_datetime(all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_timestamp(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} - - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - + dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) -def test_categorical_coerces_timedelta(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + def test_categorical_coerces_numeric(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], -) -def test_categorical_dtype_coerces_boolean(all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) + def test_categorical_coerces_datetime(self, all_parsers): + parser = all_parsers + dti = 
pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_categorical_unexpected_categories(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + def test_categorical_coerces_timestamp(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + def test_categorical_coerces_timedelta(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} -def test_empty_pass_dtype(all_parsers): - parser = all_parsers + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - data = "one,two" - result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - index=Index([], dtype=object), + @pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], ) - tm.assert_frame_equal(result, expected) + def test_categorical_dtype_coerces_boolean(self, all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_empty_with_index_pass_dtype(all_parsers): - parser = all_parsers + def test_categorical_unexpected_categories(self, all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) -def test_empty_with_multi_index_pass_dtype(all_parsers): - parser = all_parsers +class TestParserDtypesEmpty: + def test_dtype_all_columns_empty(self, all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) - - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, 
index=exp_idx) - tm.assert_frame_equal(result, expected) + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) + def test_empty_pass_dtype(self, all_parsers): + parser = all_parsers -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): - parser = all_parsers + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + def test_empty_with_index_pass_dtype(self, all_parsers): + parser = all_parsers + data = "one,two" + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) -def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): - parser = all_parsers + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + def test_empty_with_multi_index_pass_dtype(self, all_parsers): + parser = all_parsers - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) -def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + def test_empty_with_mangled_column_pass_dtype_by_names(self, all_parsers): + parser = all_parsers - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) -def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) + def test_empty_with_mangled_column_pass_dtype_by_indexes(self, all_parsers): + parser = all_parsers - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, 
expected) -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" + def test_empty_with_dup_column_pass_dtype_by_indexes(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv( + StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} + ) - # Dtype spec ignored if converted specified. - with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - index=[], + @pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], + ( + {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), ), - ), - ], -) -def test_empty_dtype(all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) 
- - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], ) + def test_empty_dtype(self, all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) From 347751f4e073e6a5c65566169ad2c029741fed6f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sat, 19 Dec 2020 02:54:51 -0500 Subject: [PATCH 2/6] test reorg --- pandas/tests/io/parser/test_usecols.py | 1005 ++++++++++++------------ 1 file changed, 507 insertions(+), 498 deletions(-) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 7cdfb7d11ed83..e75046d3017f2 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -22,544 +22,553 @@ ) -def test_raise_on_mixed_dtype_usecols(all_parsers): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - usecols = [0, "b", 2] - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) -def test_usecols(all_parsers, usecols): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_names(all_parsers): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] -) -def test_usecols_relative_to_names(all_parsers, names, usecols): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_relative_to_names2(all_parsers): - # see gh-5766 - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - 
result = parser.read_csv( - StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] +class TestParserUsecolsBasic: + def test_raise_on_mixed_dtype_usecols(self, all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) + def test_usecols(self, all_parsers, usecols): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + def test_usecols_with_names(self, all_parsers): + data = """\ + a,b,c + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) + def test_usecols_relative_to_names(self, all_parsers, names, usecols): + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=names, header=None, usecols=usecols + ) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) - expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) + def test_usecols_relative_to_names2(self, all_parsers): + # see gh-5766 + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + def test_usecols_name_length_conflict(self, all_parsers): + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12""" + parser = all_parsers + msg = "Number of passed names did not match number of header fields in the file" -def test_usecols_name_length_conflict(all_parsers): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - parser = all_parsers - msg = "Number of passed names did not match number of header fields in the file" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) + def test_usecols_single_string(self, all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz + 1000, 2000, 3000 + 4000, 5000, 6000""" + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") -def test_usecols_single_string(all_parsers): - # see gh-20558 - parser = all_parsers - data = """foo, bar, baz -1000, 2000, 3000 -4000, 5000, 6000""" + @pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] + ) + def test_usecols_index_col_false(self, all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) - with pytest.raises(ValueError, 
match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols="foo") + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("index_col", ["b", 0]) + @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) + def test_usecols_index_col_conflict(self, all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) -@pytest.mark.parametrize( - "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] -) -def test_usecols_index_col_false(all_parsers, data): - # see gh-9082 - parser = all_parsers - usecols = ["a", "c", "d"] - expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) - tm.assert_frame_equal(result, expected) + def test_usecols_index_col_conflict2(self, all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) -@pytest.mark.parametrize("index_col", ["b", 0]) -@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) - tm.assert_frame_equal(result, expected) + def test_usecols_implicit_index_col(self, all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) -def test_usecols_index_col_conflict2(all_parsers): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + def test_usecols_regex_sep(self, all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) - expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) - expected = expected.set_index(["b", "c"]) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] - ) - tm.assert_frame_equal(result, expected) + def test_usecols_with_whitespace(self, all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + expected = DataFrame( + {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] + ) + tm.assert_frame_equal(result, 
expected) -def test_usecols_implicit_index_col(all_parsers): - # see gh-2654 - parser = all_parsers - data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + @pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. + ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), + ], + ) + def test_usecols_with_integer_like_header(self, all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000""" - result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + def test_empty_usecols(self, all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers -def test_usecols_regex_sep(all_parsers): - # see gh-2733 - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, expected) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + def test_np_array_usecols(self, all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) -def test_usecols_with_whitespace(all_parsers): - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + @pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], + ) + def test_callable_usecols(self, all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) + def test_incomplete_first_row(self, all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "usecols,expected", - [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), - # Column selection by name. 
- (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), - ], -) -def test_usecols_with_integer_like_header(all_parsers, usecols, expected): - parser = all_parsers - data = """2,0,1 -1000,2000,3000 -4000,5000,6000""" - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates2(all_parsers): - # see gh-13604 - parser = all_parsers - data = """2008-02-07 09:40,1032.43 -2008-02-07 09:50,1042.54 -2008-02-07 10:00,1051.65""" - - names = ["date", "values"] - usecols = names[:] - parse_dates = [0] - - index = Index( + @pytest.mark.parametrize( + "data,usecols,kwargs,expected", [ - Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00"), + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + {"header": None}, + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + {}, + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), ], - name="date", ) - cols = {"values": [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - result = parser.read_csv( - StringIO(data), - parse_dates=parse_dates, - index_col=0, - usecols=usecols, - header=None, - names=names, - ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates3(all_parsers): - # see gh-14792 - parser = all_parsers - data = """a,b,c,d,e,f,g,h,i,j -2016/09/21,1,1,2,3,4,5,6,7,8""" - - usecols = list("abcdefghij") - parse_dates = [0] - - cols = { - "a": Timestamp("2016-09-21"), - "b": [1], - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=usecols) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
- ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): - # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_unicode_strings(all_parsers): - # see gh-13219 - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "BBB": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_single_byte_unicode_strings(all_parsers): - # see gh-13219 - data = """A,B,C,D -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "B": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) -def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) -def test_usecols_with_multi_byte_characters(all_parsers, usecols): - data = """あああ,いい,ううう,ええええ -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "いい": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -def test_empty_usecols(all_parsers): - data = "a,b,c\n1,2,3\n4,5,6" - expected = DataFrame() - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=set()) - tm.assert_frame_equal(result, expected) - - -def test_np_array_usecols(all_parsers): - # see gh-12546 - parser = all_parsers - data = "a,b,c\n1,2,3" - usecols = np.array(["a", "b"]) - - expected = DataFrame([[1, 2]], columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "usecols,expected", - [ - ( - lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame( - { - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"}, - } - ), - ), - (lambda x: False, DataFrame()), - ], -) -def test_callable_usecols(all_parsers, usecols, expected): - # see gh-14154 - data = """AaA,bBb,CCC,ddd -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers + def test_uneven_length_cols(self, all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result 
= parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + {}, + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + {}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + {}, + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + {"header": 0, "names": ["A", "B", "C", "D"]}, + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + {"header": 0, "names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + {"names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], + ) + def test_raises_on_usecols_names_mismatch( + self, all_parsers, usecols, kwargs, expected, msg + ): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) + def test_usecols_subset_names_mismatch_orig_columns(self, all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) -def test_incomplete_first_row(all_parsers, usecols): - # see gh-6710 - data = "1,2\n1,2,3" - parser = all_parsers - names = ["a", "b", "c"] - expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) +class TestUsecolsParseDates: + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + def test_usecols_with_parse_dates(self, all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates2(self, all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 + 2008-02-07 09:50,1042.54 + 2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + 
tm.assert_frame_equal(result, expected) + def test_usecols_with_parse_dates3(self, all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j + 2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "data,usecols,kwargs,expected", - [ - # see gh-8985 - ( - "19,29,39\n" * 2 + "10,20,30,40", - [0, 1, 2], - {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), - ), - # see gh-9549 - ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), - ["A", "B", "C"], - {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), - ), - ], -) -def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): - # see gh-8985 - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "usecols,kwargs,expected,msg", - [ - ( - ["a", "b", "c", "d"], - {}, - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), - None, - ), - ( - ["a", "b", "c", "f"], - {}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), - ( - ["a", "b", "f", "g"], - {}, - None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), - ), - # see gh-14671 - ( - None, - {"header": 0, "names": ["A", "B", "C", "D"]}, - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), - None, - ), - ( - ["A", "B", "C", "f"], - {"header": 0, "names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ( - ["A", "B", "f"], - {"names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ], -) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - kwargs.update(usecols=usecols) - parser = all_parsers + def test_usecols_with_parse_dates4(self, all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + tm.assert_frame_equal(result, expected) - if expected is None: - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) + @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) + @pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. 
+ ], + ) + def test_usecols_with_parse_dates_and_names(self, all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - names = ["A", "B", "C", "D"] - parser = all_parsers +class TestUsecolsStrings: + def test_usecols_with_unicode_strings(self, all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) + tm.assert_frame_equal(result, expected) + + def test_usecols_with_single_byte_unicode_strings(self, all_parsers): + # see gh-13219 + data = """A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) - expected = DataFrame({"A": [1, 5], "C": [3, 7]}) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) + def test_usecols_with_mixed_encoding_strings(self, all_parsers, usecols): + data = """AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + @pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) + def test_usecols_with_multi_byte_characters(self, all_parsers, usecols): + data = """あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) From 6f92c2399fa96cbb38d362eb1ff2cd9c0afc44e3 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 30 Dec 2020 23:31:09 -0500 Subject: [PATCH 3/6] split test_dtypes.py into multiple files --- pandas/tests/io/parser/dtypes/test_basic.py | 167 +++++ .../io/parser/dtypes/test_categorical.py | 294 +++++++++ pandas/tests/io/parser/dtypes/test_empty.py | 172 +++++ pandas/tests/io/parser/test_dtypes.py | 593 ------------------ 4 files changed, 633 insertions(+), 593 deletions(-) create mode 100644 pandas/tests/io/parser/dtypes/test_basic.py create mode 100644 pandas/tests/io/parser/dtypes/test_categorical.py create mode 100644 pandas/tests/io/parser/dtypes/test_empty.py delete mode 
100644 pandas/tests/io/parser/test_dtypes.py diff --git a/pandas/tests/io/parser/dtypes/test_basic.py b/pandas/tests/io/parser/dtypes/test_basic.py new file mode 100644 index 0000000000000..e416d8dcdd905 --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_basic.py @@ -0,0 +1,167 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserWarning + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
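# A minimal standalone sketch of the behaviour exercised by this test: when
# both ``dtype`` and ``converters`` target the same column, pandas applies the
# converter, emits a ParserWarning, and the dtype request is ignored.
import warnings
from io import StringIO
import pandas as pd

csv = "a,b\n1.1,2.2\n1.2,2.3"
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # suppress the expected ParserWarning
    frame = pd.read_csv(StringIO(csv), dtype={"a": "i8"}, converters={"a": str})
assert list(frame["a"]) == ["1.1", "1.2"]  # converter output (strings), not int64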
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py new file mode 100644 index 0000000000000..2f569424a82f5 --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -0,0 +1,294 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) +def test_categorical_dtype(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) +def test_categorical_dtype_single(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_unsorted(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_missing(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.slow +def 
test_categorical_dtype_high_cardinality_numeric(all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_utf16(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" + + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_infer_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_explicit_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_latin1(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) +def test_categorical_category_dtype(all_parsers, categories, ordered): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_category_dtype_unsorted(all_parsers): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) + + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_numeric(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} + + data = "b\n1\n1\n2\n3" + 
expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_datetime(all_parsers): + parser = all_parsers + dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} + + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timestamp(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} + + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timedelta(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) +def test_categorical_dtype_coerces_boolean(all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_unexpected_categories(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py new file mode 100644 index 0000000000000..57d729fb4b7fc --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -0,0 +1,172 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm + + +def test_dtype_all_columns_empty(all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) + + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) + + +def test_empty_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, 
expected) + + +def test_empty_with_multi_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + + +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ( + {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], +) +def test_empty_dtype(all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py deleted file mode 100644 index d8f819d896e55..0000000000000 --- a/pandas/tests/io/parser/test_dtypes.py +++ /dev/null @@ -1,593 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers 
defined in parsers.py -""" -from io import StringIO -import os - -import numpy as np -import pytest - -from pandas.errors import ParserWarning - -from pandas.core.dtypes.dtypes import CategoricalDtype - -import pandas as pd -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat -import pandas._testing as tm - - -class TestParserDtypesBasic: - @pytest.mark.parametrize("dtype", [str, object]) - @pytest.mark.parametrize("check_orig", [True, False]) - def test_dtype_all_columns(self, all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) - - def test_dtype_per_column(self, all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) - expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) - - def test_invalid_dtype_per_column(self, all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) - - def test_raise_on_passed_int_dtype_with_nas(self, all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a - 2001,106380451,10 - 2001,,11 - 2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv( - StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True - ) - - def test_dtype_with_converters(self, all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if converted specified. 
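# A minimal standalone sketch of the failure mode covered by
# test_raise_on_passed_int_dtype_with_nas above: a NumPy integer dtype cannot
# represent missing values, so read_csv raises ValueError when such a column
# contains NAs (a nullable extension dtype such as "Int64" may avoid this,
# depending on the pandas version).
from io import StringIO
import numpy as np
import pandas as pd

csv = "YEAR,DOY,a\n2001,106380451,10\n2001,,11\n2001,106380451,67"
try:
    pd.read_csv(StringIO(csv), dtype={"DOY": np.int64})
except ValueError as err:
    print(err)  # "Integer column has NA values ..." with the C engine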
- with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) - ) - def test_numeric_dtype(self, all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - def test_boolean_dtype(self, all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected) - - -class TestParserDtypesCategorical1: - @pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], - ) - def test_categorical_dtype(self, all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) - def test_categorical_dtype_single(self, all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_unsorted(self, all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_missing(self, all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - @pytest.mark.slow - def test_categorical_dtype_high_cardinality_numeric(self, all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_utf16(self, all_parsers, csv_dir_path): - 
# see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" - - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) - - actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize_infer_categories(self, all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize_explicit_categories(self, all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, - index=[2, 3], - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv( - StringIO(data), dtype={"b": dtype}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -class TestParserDtypesCategorical2: - def test_categorical_dtype_latin1(self, all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv( - pth, header=None, encoding=encoding, dtype={1: "category"} - ) - tm.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], - ) - def test_categorical_category_dtype(self, all_parsers, categories, ordered): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", "c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_category_dtype_unsorted(self, all_parsers): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - def test_categorical_coerces_numeric(self, all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_coerces_datetime(self, all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = 
DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_coerces_timestamp(self, all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} - - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_coerces_timedelta(self, all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} - - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], - ) - def test_categorical_dtype_coerces_boolean(self, all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_unexpected_categories(self, all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -class TestParserDtypesEmpty: - def test_dtype_all_columns_empty(self, all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) - - expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) - tm.assert_frame_equal(result, expected) - - def test_empty_pass_dtype(self, all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - def test_empty_with_index_pass_dtype(self, all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) - - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) - - def test_empty_with_multi_index_pass_dtype(self, all_parsers): - parser = all_parsers - - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) - - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], - names=["one", "two"], - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) - tm.assert_frame_equal(result, expected) - - def test_empty_with_mangled_column_pass_dtype_by_names(self, all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - def 
test_empty_with_mangled_column_pass_dtype_by_indexes(self, all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self, all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) - - def test_empty_with_dup_column_pass_dtype_by_indexes_raises(self, all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) - - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv( - StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"} - ) - - @pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ( - "category", - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - index=[], - ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ], - ) - def test_empty_dtype(self, all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) From e17ddfddcf22eb0ad790d5cc44e640599d1ef826 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 30 Dec 2020 23:56:58 -0500 Subject: [PATCH 4/6] split test_usecols.py into multiple files --- pandas/tests/io/parser/test_usecols.py | 574 ------------------ pandas/tests/io/parser/usecols/test_basic.py | 372 ++++++++++++ .../io/parser/usecols/test_parse_dates.py | 149 +++++ .../tests/io/parser/usecols/test_strings.py | 97 +++ 4 files changed, 618 insertions(+), 574 deletions(-) delete mode 100644 pandas/tests/io/parser/test_usecols.py create mode 100644 pandas/tests/io/parser/usecols/test_basic.py create mode 100644 pandas/tests/io/parser/usecols/test_parse_dates.py create mode 100644 pandas/tests/io/parser/usecols/test_strings.py diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py deleted file mode 100644 index e75046d3017f2..0000000000000 --- a/pandas/tests/io/parser/test_usecols.py +++ /dev/null @@ -1,574 +0,0 @@ -""" -Tests the usecols functionality during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np 
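# A minimal standalone sketch of the three forms of ``usecols`` that the
# usecols tests below exercise: a list of column labels, a list of positional
# indices, or a callable applied to each column name.
from io import StringIO
import pandas as pd

csv = "a,b,c\n1,2,3\n4,5,6"
by_name = pd.read_csv(StringIO(csv), usecols=["a", "c"])
by_position = pd.read_csv(StringIO(csv), usecols=[0, 2])
by_callable = pd.read_csv(StringIO(csv), usecols=lambda name: name != "b")
assert (
    list(by_name.columns)
    == list(by_position.columns)
    == list(by_callable.columns)
    == ["a", "c"]
)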
-import pytest - -from pandas._libs.tslib import Timestamp - -from pandas import DataFrame, Index -import pandas._testing as tm - -_msg_validate_usecols_arg = ( - "'usecols' must either be list-like " - "of all strings, all unicode, all " - "integers or a callable." -) -_msg_validate_usecols_names = ( - "Usecols do not match columns, columns expected but not found: {0}" -) - - -class TestParserUsecolsBasic: - def test_raise_on_mixed_dtype_usecols(self, all_parsers): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - usecols = [0, "b", 2] - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) - def test_usecols(self, all_parsers, usecols): - data = """\ - a,b,c - 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12""" - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_names(self, all_parsers): - data = """\ - a,b,c - 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12""" - parser = all_parsers - names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] - ) - def test_usecols_relative_to_names(self, all_parsers, names, usecols): - data = """\ - 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12""" - parser = all_parsers - result = parser.read_csv( - StringIO(data), names=names, header=None, usecols=usecols - ) - - expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) - tm.assert_frame_equal(result, expected) - - def test_usecols_relative_to_names2(self, all_parsers): - # see gh-5766 - data = """\ - 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12""" - parser = all_parsers - result = parser.read_csv( - StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] - ) - - expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - def test_usecols_name_length_conflict(self, all_parsers): - data = """\ - 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12""" - parser = all_parsers - msg = "Number of passed names did not match number of header fields in the file" - - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) - - def test_usecols_single_string(self, all_parsers): - # see gh-20558 - parser = all_parsers - data = """foo, bar, baz - 1000, 2000, 3000 - 4000, 5000, 6000""" - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols="foo") - - @pytest.mark.parametrize( - "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] - ) - def test_usecols_index_col_false(self, all_parsers, data): - # see gh-9082 - parser = all_parsers - usecols = ["a", "c", "d"] - expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) - - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("index_col", ["b", 0]) - @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) - def test_usecols_index_col_conflict(self, all_parsers, usecols, index_col): - # see gh-4201: test that 
index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - - result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) - tm.assert_frame_equal(result, expected) - - def test_usecols_index_col_conflict2(self, all_parsers): - # see gh-4201: test that index_col as integer reflects usecols - parser = all_parsers - data = "a,b,c,d\nA,a,1,one\nB,b,2,two" - - expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) - expected = expected.set_index(["b", "c"]) - - result = parser.read_csv( - StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_implicit_index_col(self, all_parsers): - # see gh-2654 - parser = all_parsers - data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" - - result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame( - {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_regex_sep(self, all_parsers): - # see gh-2733 - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) - - expected = DataFrame( - {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_whitespace(self, all_parsers): - parser = all_parsers - data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - - result = parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) - expected = DataFrame( - {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8] - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "usecols,expected", - [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), - # Column selection by name. 
- ( - ["0", "1"], - DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), - ), - ], - ) - def test_usecols_with_integer_like_header(self, all_parsers, usecols, expected): - parser = all_parsers - data = """2,0,1 - 1000,2000,3000 - 4000,5000,6000""" - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - def test_empty_usecols(self, all_parsers): - data = "a,b,c\n1,2,3\n4,5,6" - expected = DataFrame() - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=set()) - tm.assert_frame_equal(result, expected) - - def test_np_array_usecols(self, all_parsers): - # see gh-12546 - parser = all_parsers - data = "a,b,c\n1,2,3" - usecols = np.array(["a", "b"]) - - expected = DataFrame([[1, 2]], columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "usecols,expected", - [ - ( - lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame( - { - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"}, - } - ), - ), - (lambda x: False, DataFrame()), - ], - ) - def test_callable_usecols(self, all_parsers, usecols, expected): - # see gh-14154 - data = """AaA,bBb,CCC,ddd - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a""" - parser = all_parsers - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) - def test_incomplete_first_row(self, all_parsers, usecols): - # see gh-6710 - data = "1,2\n1,2,3" - parser = all_parsers - names = ["a", "b", "c"] - expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) - - result = parser.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "data,usecols,kwargs,expected", - [ - # see gh-8985 - ( - "19,29,39\n" * 2 + "10,20,30,40", - [0, 1, 2], - {"header": None}, - DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), - ), - # see gh-9549 - ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), - ["A", "B", "C"], - {}, - DataFrame( - { - "A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7], - } - ), - ), - ], - ) - def test_uneven_length_cols(self, all_parsers, data, usecols, kwargs, expected): - # see gh-8985 - parser = all_parsers - result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "usecols,kwargs,expected,msg", - [ - ( - ["a", "b", "c", "d"], - {}, - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), - None, - ), - ( - ["a", "b", "c", "f"], - {}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), - ( - ["a", "b", "f", "g"], - {}, - None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), - ), - # see gh-14671 - ( - None, - {"header": 0, "names": ["A", "B", "C", "D"]}, - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), - None, - ), - ( - ["A", "B", "C", "f"], - {"header": 0, "names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ( - ["A", "B", "f"], - {"names": ["A", "B", "C", "D"]}, - None, - _msg_validate_usecols_names.format(r"\['f'\]"), - ), - ], - ) - def 
test_raises_on_usecols_names_mismatch( - self, all_parsers, usecols, kwargs, expected, msg - ): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - kwargs.update(usecols=usecols) - parser = all_parsers - - if expected is None: - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - else: - result = parser.read_csv(StringIO(data), **kwargs) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) - def test_usecols_subset_names_mismatch_orig_columns(self, all_parsers, usecols): - data = "a,b,c,d\n1,2,3,4\n5,6,7,8" - names = ["A", "B", "C", "D"] - parser = all_parsers - - result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) - expected = DataFrame({"A": [1, 5], "C": [3, 7]}) - tm.assert_frame_equal(result, expected) - - -class TestUsecolsParseDates: - @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) - def test_usecols_with_parse_dates(self, all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e - 0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_parse_dates2(self, all_parsers): - # see gh-13604 - parser = all_parsers - data = """2008-02-07 09:40,1032.43 - 2008-02-07 09:50,1042.54 - 2008-02-07 10:00,1051.65""" - - names = ["date", "values"] - usecols = names[:] - parse_dates = [0] - - index = Index( - [ - Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00"), - ], - name="date", - ) - cols = {"values": [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - result = parser.read_csv( - StringIO(data), - parse_dates=parse_dates, - index_col=0, - usecols=usecols, - header=None, - names=names, - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_parse_dates3(self, all_parsers): - # see gh-14792 - parser = all_parsers - data = """a,b,c,d,e,f,g,h,i,j - 2016/09/21,1,1,2,3,4,5,6,7,8""" - - usecols = list("abcdefghij") - parse_dates = [0] - - cols = { - "a": Timestamp("2016-09-21"), - "b": [1], - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=usecols) - - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_parse_dates4(self, all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) - @pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
- ], - ) - def test_usecols_with_parse_dates_and_names(self, all_parsers, usecols, names): - # see gh-9755 - s = """0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) - - -class TestUsecolsStrings: - def test_usecols_with_unicode_strings(self, all_parsers): - # see gh-13219 - data = """AAA,BBB,CCC,DDD - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "AAA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "BBB": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_single_byte_unicode_strings(self, all_parsers): - # see gh-13219 - data = """A,B,C,D - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "A": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "B": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["A", "B"]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) - def test_usecols_with_mixed_encoding_strings(self, all_parsers, usecols): - data = """AAA,BBB,CCC,DDD - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a""" - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - @pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) - def test_usecols_with_multi_byte_characters(self, all_parsers, usecols): - data = """あああ,いい,ううう,ええええ - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "あああ": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002, - }, - "いい": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_basic.py b/pandas/tests/io/parser/usecols/test_basic.py new file mode 100644 index 0000000000000..7d81a88e09012 --- /dev/null +++ b/pandas/tests/io/parser/usecols/test_basic.py @@ -0,0 +1,372 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, Index +import pandas._testing as tm + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) + + +def test_raise_on_mixed_dtype_usecols(all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) +def test_usecols(all_parsers, usecols): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_names(all_parsers): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] +) +def test_usecols_relative_to_names(all_parsers, names, usecols): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_relative_to_names2(all_parsers): + # see gh-5766 + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_name_length_conflict(all_parsers): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + msg = "Number of passed names did not match number of header fields in the file" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) + + +def test_usecols_single_string(all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz +1000, 2000, 3000 +4000, 5000, 6000""" + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") + + +@pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] +) +def test_usecols_index_col_false(all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", ["b", 0]) +@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) +def test_usecols_index_col_conflict(all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_conflict2(all_parsers): + # see gh-4201: test that 
index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) + + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_implicit_index_col(all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_regex_sep(all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_whitespace(all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. + ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), + ], +) +def test_usecols_with_integer_like_header(all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 +1000,2000,3000 +4000,5000,6000""" + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +def test_empty_usecols(all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, expected) + + +def test_np_array_usecols(all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], +) +def test_callable_usecols(all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) +def test_incomplete_first_row(all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + 
"data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + {"header": None}, + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + {}, + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], +) +def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + {}, + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + {}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + {}, + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + {"header": 0, "names": ["A", "B", "C", "D"]}, + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + {"header": 0, "names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + {"names": ["A", "B", "C", "D"]}, + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], +) +def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py new file mode 100644 index 0000000000000..c6b700c0adfff --- /dev/null +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -0,0 +1,149 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import pytest + +from pandas._libs.tslib import Timestamp + +from pandas import DataFrame, Index +import pandas._testing as tm + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +def test_usecols_with_parse_dates(all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates2(all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 +2008-02-07 09:50,1042.54 +2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates3(all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j +2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates4(all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. 
+    ],
+)
+def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
+    # see gh-9755
+    s = """0,1,20140101,0900,4
+0,1,20140102,1000,4"""
+    parse_dates = [[1, 2]]
+    parser = all_parsers
+
+    cols = {
+        "a": [0, 0],
+        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
+    }
+    expected = DataFrame(cols, columns=["c_d", "a"])
+
+    result = parser.read_csv(
+        StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/usecols/test_strings.py b/pandas/tests/io/parser/usecols/test_strings.py
new file mode 100644
index 0000000000000..8cecf1fc981ee
--- /dev/null
+++ b/pandas/tests/io/parser/usecols/test_strings.py
@@ -0,0 +1,97 @@
+"""
+Tests the usecols functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+from io import StringIO
+
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+_msg_validate_usecols_arg = (
+    "'usecols' must either be list-like "
+    "of all strings, all unicode, all "
+    "integers or a callable."
+)
+_msg_validate_usecols_names = (
+    "Usecols do not match columns, columns expected but not found: {0}"
+)
+
+
+def test_usecols_with_unicode_strings(all_parsers):
+    # see gh-13219
+    data = """AAA,BBB,CCC,DDD
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+    parser = all_parsers
+
+    exp_data = {
+        "AAA": {
+            0: 0.056674972999999997,
+            1: 2.6132309819999997,
+            2: 3.5689350380000002,
+        },
+        "BBB": {0: 8, 1: 2, 2: 7},
+    }
+    expected = DataFrame(exp_data)
+
+    result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_with_single_byte_unicode_strings(all_parsers):
+    # see gh-13219
+    data = """A,B,C,D
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+    parser = all_parsers
+
+    exp_data = {
+        "A": {
+            0: 0.056674972999999997,
+            1: 2.6132309819999997,
+            2: 3.5689350380000002,
+        },
+        "B": {0: 8, 1: 2, 2: 7},
+    }
+    expected = DataFrame(exp_data)
+
+    result = parser.read_csv(StringIO(data), usecols=["A", "B"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
+def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
+    data = """AAA,BBB,CCC,DDD
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+    parser = all_parsers
+
+    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
+        parser.read_csv(StringIO(data), usecols=usecols)
+
+
+@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
+def test_usecols_with_multi_byte_characters(all_parsers, usecols):
+    data = """あああ,いい,ううう,ええええ
+0.056674973,8,True,a
+2.613230982,2,False,b
+3.568935038,7,False,a"""
+    parser = all_parsers
+
+    exp_data = {
+        "あああ": {
+            0: 0.056674972999999997,
+            1: 2.6132309819999997,
+            2: 3.5689350380000002,
+        },
+        "いい": {0: 8, 1: 2, 2: 7},
+    }
+    expected = DataFrame(exp_data)
+
+    result = parser.read_csv(StringIO(data), usecols=usecols)
+    tm.assert_frame_equal(result, expected)
From a88314d3d867506ee8fc943df166ae5416950d2a Mon Sep 17 00:00:00 2001
From: Andrew Wieteska
Date: Thu, 31 Dec 2020 14:18:49 -0500
Subject: [PATCH 5/6] deduplicate base filenames

---
 .../io/parser/dtypes/test_dtypes_basic.py | 167 ++++++++++++++++++
 .../{test_basic.py => test_usecols_basic.py} | 0
 2 files changed, 167 insertions(+)
 create mode 100644 pandas/tests/io/parser/dtypes/test_dtypes_basic.py
 rename
pandas/tests/io/parser/usecols/{test_basic.py => test_usecols_basic.py} (100%) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py new file mode 100644 index 0000000000000..e416d8dcdd905 --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -0,0 +1,167 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserWarning + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py similarity index 100% rename from pandas/tests/io/parser/usecols/test_basic.py rename to pandas/tests/io/parser/usecols/test_usecols_basic.py From bc43d16c10d7eb88e24d7f6b6c5dc6242365f0b4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 31 Dec 2020 14:55:39 -0500 Subject: [PATCH 6/6] complete file renaming --- pandas/tests/io/parser/dtypes/test_basic.py | 167 -------------------- 1 file changed, 167 deletions(-) delete mode 100644 pandas/tests/io/parser/dtypes/test_basic.py diff --git a/pandas/tests/io/parser/dtypes/test_basic.py b/pandas/tests/io/parser/dtypes/test_basic.py deleted file mode 100644 index e416d8dcdd905..0000000000000 --- a/pandas/tests/io/parser/dtypes/test_basic.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO - -import numpy as np -import pytest - -from pandas.errors import ParserWarning - -import pandas as pd -from pandas import DataFrame -import pandas._testing as tm - - -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) - - -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) - expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) - - -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - with pytest.raises(TypeError, 
match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) - - -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) - - -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if converted specified. - with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - "NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected)