diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py new file mode 100644 index 0000000000000..2f569424a82f5 --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -0,0 +1,294 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) +def test_categorical_dtype(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) +def test_categorical_dtype_single(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_unsorted(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +def 
test_categorical_dtype_missing(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.slow +def test_categorical_dtype_high_cardinality_numeric(all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_utf16(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "\t" + + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_infer_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + with parser.read_csv( + StringIO(data), dtype={"b": "category"}, chunksize=2 + ) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_explicit_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], 
"b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, + index=[2, 3], + ), + ] + dtype = CategoricalDtype(cats) + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_latin1(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) +def test_categorical_category_dtype(all_parsers, categories, ordered): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_category_dtype_unsorted(all_parsers): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) + + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_numeric(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} + + data = "b\n1\n1\n2\n3" + expected = 
DataFrame({"b": Categorical([1, 1, 2, 3])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_datetime(all_parsers): + parser = all_parsers + dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) + dtype = {"b": CategoricalDtype(dti)} + + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timestamp(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} + + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timedelta(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) +def test_categorical_dtype_coerces_boolean(all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_unexpected_categories(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), 
dtype=dtype["b"])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py new file mode 100644 index 0000000000000..e416d8dcdd905 --- /dev/null +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -0,0 +1,167 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserWarning + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + 
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converter specified. + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "1", + "1.0", + "False", + "FALSE", + "false", + "0", + "0.0", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = DataFrame( + { + "a": pd.array( + [ + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py new file mode 100644 index 0000000000000..57d729fb4b7fc --- /dev/null +++ 
b/pandas/tests/io/parser/dtypes/test_empty.py @@ -0,0 +1,172 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm + + +def test_dtype_all_columns_empty(all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) + + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) + + +def test_empty_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_multi_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], + names=["one", "two"], + ) + expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, 
dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + + +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ( + "category", + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ( + {"a": "category", "b": "category"}, + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + {"a": np.int64, "b": np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( 
+ {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], +) +def test_empty_dtype(all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py deleted file mode 100644 index 1e68e54b413b0..0000000000000 --- a/pandas/tests/io/parser/test_dtypes.py +++ /dev/null @@ -1,605 +0,0 @@ -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" -from io import StringIO -import os - -import numpy as np -import pytest - -from pandas.errors import ParserWarning - -from pandas.core.dtypes.dtypes import CategoricalDtype - -import pandas as pd -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat -import pandas._testing as tm - - -@pytest.mark.parametrize("dtype", [str, object]) -@pytest.mark.parametrize("check_orig", [True, False]) -def test_dtype_all_columns(all_parsers, dtype, check_orig): - # see gh-3795, gh-6607 - parser = all_parsers - - df = DataFrame( - np.random.rand(5, 2).round(4), - columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"], - ) - - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) - - result = parser.read_csv(path, dtype=dtype, index_col=0) - - if check_orig: - expected = df.copy() - result = result.astype(float) - else: - expected = df.astype(str) - - tm.assert_frame_equal(result, expected) - - -def test_dtype_all_columns_empty(all_parsers): - # see gh-12048 - parser = all_parsers - result = parser.read_csv(StringIO("A,B"), dtype=str) - - expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) - 
tm.assert_frame_equal(result, expected) - - -def test_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - expected = DataFrame( - [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] - ) - expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) - - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) - tm.assert_frame_equal(result, expected) - - -def test_invalid_dtype_per_column(all_parsers): - parser = all_parsers - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): - parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) - - -@pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - {"a": "category", "b": "category", "c": CategoricalDtype()}, - ], -) -def test_categorical_dtype(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = DataFrame( - {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} - ) - actual = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_unsorted(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": 
Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_missing(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = DataFrame( - { - "a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"]), - } - ) - actual = parser.read_csv(StringIO(data), dtype="category") - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.slow -def test_categorical_dtype_high_cardinality_numeric(all_parsers): - # see gh-18186 - parser = all_parsers - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({"a": Categorical(data, ordered=True)}) - - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True - ) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_latin1(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - encoding = "latin-1" - - expected = parser.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - - actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_utf16(all_parsers, csv_dir_path): - # see gh-10153 - pth = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - encoding = "utf-16" - sep = "\t" - - expected = parser.read_csv(pth, sep=sep, encoding=encoding) - expected = expected.apply(Categorical) - - actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_chunksize_infer_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b 
-1,a -1,b -1,b -2,c""" - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), - ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -def test_categorical_dtype_chunksize_explicit_categories(all_parsers): - # see gh-10153 - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - cats = ["a", "b", "c"] - expecteds = [ - DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), - DataFrame( - {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] - ), - ] - dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - -@pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize( - "categories", - [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], -) -def test_categorical_category_dtype(all_parsers, categories, ordered): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical( - ["a", "b", "b", "c"], categories=categories, ordered=ordered - ), - } - ) - - dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_category_dtype_unsorted(all_parsers): - parser = all_parsers - data = """a,b -1,a -1,b -1,b -2,c""" - dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame( - { - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), - } - ) - - result = parser.read_csv(StringIO(data), dtype={"b": dtype}) - tm.assert_frame_equal(result, expected) - - -def 
test_categorical_coerces_numeric(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([1, 2, 3])} - - data = "b\n1\n1\n2\n3" - expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_datetime(all_parsers): - parser = all_parsers - dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) - dtype = {"b": CategoricalDtype(dti)} - - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_timestamp(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype([Timestamp("2014")])} - - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_coerces_timedelta(all_parsers): - parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} - - data = "b\n1H\n2H\n3H" - expected = DataFrame({"b": Categorical(dtype["b"].categories)}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", - ], -) -def test_categorical_dtype_coerces_boolean(all_parsers, data): - # see gh-20498 - parser = all_parsers - dtype = {"b": CategoricalDtype([False, True])} - expected = DataFrame({"b": Categorical([True, False, None, False])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_categorical_unexpected_categories(all_parsers): - parser = all_parsers - 
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} - - data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) - - result = parser.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - -def test_empty_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_index_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two" - result = parser.read_csv( - StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} - ) - - expected = DataFrame( - {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_multi_index_pass_dtype(all_parsers): - parser = all_parsers - - data = "one,two,three" - result = parser.read_csv( - StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} - ) - - exp_idx = MultiIndex.from_arrays( - [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] - ) - expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - - expected = DataFrame( - {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): - parser = all_parsers - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - - expected = DataFrame( - {"one": 
np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object), - ) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) - - data = "one,one" - result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) - - -def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): - # see gh-9424 - parser = all_parsers - expected = concat( - [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], - axis=1, - ) - expected.index = expected.index.astype(object) - - with pytest.raises(ValueError, match="Duplicate names"): - data = "" - parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) - - -def test_raise_on_passed_int_dtype_with_nas(all_parsers): - # see gh-2631 - parser = all_parsers - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) - - -def test_dtype_with_converters(all_parsers): - parser = all_parsers - data = """a,b -1.1,2.2 -1.2,2.3""" - - # Dtype spec ignored if converted specified. 
- with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv( - StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} - ) - expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype,expected", - [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), - ( - {"a": "category", "b": "category"}, - DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), - ), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ( - "timedelta64[ns]", - DataFrame( - { - "a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]"), - }, - index=[], - ), - ), - ( - {"a": np.int64, "b": np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {0: np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ( - {"a": np.int64, 1: np.int32}, - DataFrame( - {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, - index=[], - ), - ), - ], -) -def test_empty_dtype(all_parsers, dtype, expected): - # see gh-14712 - parser = all_parsers - data = "a,b" - - result = parser.read_csv(StringIO(data), header=0, dtype=dtype) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) -) -def test_numeric_dtype(all_parsers, dtype): - data = "0\n1" - parser = all_parsers - expected = DataFrame([0, 1], dtype=dtype) - - result = parser.read_csv(StringIO(data), header=None, dtype=dtype) - tm.assert_frame_equal(expected, result) - - -def test_boolean_dtype(all_parsers): - parser = all_parsers - data = "\n".join( - [ - "a", - "True", - "TRUE", - "true", - "1", - "1.0", - "False", - "FALSE", - "false", - "0", - "0.0", - 
"NaN", - "nan", - "NA", - "null", - "NULL", - ] - ) - - result = parser.read_csv(StringIO(data), dtype="boolean") - expected = DataFrame( - { - "a": pd.array( - [ - True, - True, - True, - True, - True, - False, - False, - False, - False, - False, - None, - None, - None, - None, - None, - ], - dtype="boolean", - ) - } - ) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py new file mode 100644 index 0000000000000..c6b700c0adfff --- /dev/null +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -0,0 +1,149 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import pytest + +from pandas._libs.tslib import Timestamp + +from pandas import DataFrame, Index +import pandas._testing as tm + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +def test_usecols_with_parse_dates(all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates2(all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 +2008-02-07 09:50,1042.54 +2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates3(all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j +2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates4(all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + 
usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. + ], +) +def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_strings.py b/pandas/tests/io/parser/usecols/test_strings.py new file mode 100644 index 0000000000000..8cecf1fc981ee --- /dev/null +++ b/pandas/tests/io/parser/usecols/test_strings.py @@ -0,0 +1,97 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) + + +def test_usecols_with_unicode_strings(all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_single_byte_unicode_strings(all_parsers): + # see gh-13219 + data = """A,B,C,D +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) +def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) +def test_usecols_with_multi_byte_characters(all_parsers, usecols): + data = """あああ,いい,ううう,ええええ +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, 
expected) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py similarity index 66% rename from pandas/tests/io/parser/test_usecols.py rename to pandas/tests/io/parser/usecols/test_usecols_basic.py index 7cdfb7d11ed83..7d81a88e09012 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._libs.tslib import Timestamp - from pandas import DataFrame, Index import pandas._testing as tm @@ -195,7 +193,10 @@ def test_usecols_with_whitespace(all_parsers): # Column selection by index. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), + ( + ["0", "1"], + DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]), + ), ], ) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): @@ -208,200 +209,6 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates2(all_parsers): - # see gh-13604 - parser = all_parsers - data = """2008-02-07 09:40,1032.43 -2008-02-07 09:50,1042.54 -2008-02-07 10:00,1051.65""" - - names = ["date", "values"] - usecols = names[:] - parse_dates = [0] - - index = Index( - [ - Timestamp("2008-02-07 09:40"), 
- Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00"), - ], - name="date", - ) - cols = {"values": [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - result = parser.read_csv( - StringIO(data), - parse_dates=parse_dates, - index_col=0, - usecols=usecols, - header=None, - names=names, - ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates3(all_parsers): - # see gh-14792 - parser = all_parsers - data = """a,b,c,d,e,f,g,h,i,j -2016/09/21,1,1,2,3,4,5,6,7,8""" - - usecols = list("abcdefghij") - parse_dates = [0] - - cols = { - "a": Timestamp("2016-09-21"), - "b": [1], - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=usecols) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
- ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): - # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_unicode_strings(all_parsers): - # see gh-13219 - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "BBB": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) - tm.assert_frame_equal(result, expected) - - -def test_usecols_with_single_byte_unicode_strings(all_parsers): - # see gh-13219 - data = """A,B,C,D -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "B": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) -def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): - data = """AAA,BBB,CCC,DDD -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - with pytest.raises(ValueError, match=_msg_validate_usecols_arg): - parser.read_csv(StringIO(data), usecols=usecols) - - -@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) -def test_usecols_with_multi_byte_characters(all_parsers, usecols): - data = 
"""あああ,いい,ううう,ええええ -0.056674973,8,True,a -2.613230982,2,False,b -3.568935038,7,False,a""" - parser = all_parsers - - exp_data = { - "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, - "いい": {0: 8, 1: 2, 2: 7}, - } - expected = DataFrame(exp_data) - - result = parser.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame()