Skip to content

Commit 9b47091

Browse files
authored
TST/REF: io/parser/(test_dtypes.py, test_usecols.py) (#38578)
* test reorg * test reorg * split test_dtypes.py into multiple files * split test_usecols.py into multiple files * deduplicate base filenames * complete file renaming
1 parent bdc5a67 commit 9b47091

File tree

7 files changed

+883
-802
lines changed

7 files changed

+883
-802
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
"""
2+
Tests dtype specification during parsing
3+
for all of the parsers defined in parsers.py
4+
"""
5+
from io import StringIO
6+
import os
7+
8+
import numpy as np
9+
import pytest
10+
11+
from pandas.core.dtypes.dtypes import CategoricalDtype
12+
13+
import pandas as pd
14+
from pandas import Categorical, DataFrame, Timestamp
15+
import pandas._testing as tm
16+
17+
18+
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    """Check that each spelling of a categorical dtype applies to all columns.

    The dtype is given as the string ``"category"``, as a bare
    ``CategoricalDtype()``, and as a per-column mapping mixing both. In
    every case all three columns are expected to come back as
    ``Categorical`` built from the string form of the values.
    """
    # see gh-10153
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected_columns = {
        "a": Categorical(["1", "1", "2"]),
        "b": Categorical(["a", "a", "b"]),
        "c": Categorical(["3.4", "3.4", "4.5"]),
    }
    expected = DataFrame(expected_columns)

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(result, expected)
42+
43+
44+
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    """Check that only the column named in ``dtype`` becomes categorical.

    The target column is addressed once by label (``"b"``) and once by
    integer key (``1``). The remaining columns are expected as ordinary
    integer and float columns.
    """
    # see gh-10153
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected_columns = {
        "a": [1, 1, 2],
        "b": Categorical(["a", "a", "b"]),
        "c": [3.4, 3.4, 4.5],
    }
    expected = DataFrame(expected_columns)

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(result, expected)
57+
58+
59+
def test_categorical_dtype_unsorted(all_parsers):
    """Check ``dtype="category"`` when values appear in non-sorted order.

    Column ``b`` lists ``"b"`` before ``"a"``; the parsed frame must
    equal one built from ``Categorical`` over the same values.
    """
    # see gh-10153
    csv_text = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected_columns = {
        "a": Categorical(["1", "1", "2"]),
        "b": Categorical(["b", "b", "a"]),
        "c": Categorical(["3.4", "3.4", "4.5"]),
    }
    expected = DataFrame(expected_columns)

    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(result, expected)
75+
76+
77+
def test_categorical_dtype_missing(all_parsers):
    """Check ``dtype="category"`` when a cell holds the text ``nan``.

    The ``nan`` entry in column ``b`` is expected to match ``np.nan``
    inside the expected ``Categorical``.
    """
    # see gh-10153
    csv_text = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected_columns = {
        "a": Categorical(["1", "1", "2"]),
        "b": Categorical(["b", np.nan, "a"]),
        "c": Categorical(["3.4", "3.4", "4.5"]),
    }
    expected = DataFrame(expected_columns)

    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(result, expected)
93+
94+
95+
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    """Check ``dtype="category"`` on a column with 524289 distinct values.

    The parsed categories are re-sorted and marked ordered before the
    comparison, so the assertion does not depend on the order in which
    the parser emitted them.
    """
    # see gh-18186
    # 524289 == 2**19 + 1 distinct numeric strings, sorted lexically.
    values = np.sort([str(i) for i in range(524289)])
    expected = DataFrame({"a": Categorical(values, ordered=True)})

    csv_text = "a\n" + "\n".join(values)
    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    sorted_categories = np.sort(result["a"].cat.categories)
    result["a"] = result["a"].cat.reorder_categories(
        sorted_categories, ordered=True
    )
    tm.assert_frame_equal(result, expected)
107+
108+
109+
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    """Check ``dtype="category"`` on a tab-separated UTF-16 file.

    The file is read once without a dtype and converted column by column
    with ``Categorical``. That frame is the reference for a second read
    done with ``dtype="category"``.
    """
    # see gh-10153
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    read_kwargs = {"sep": "\t", "encoding": "utf-16"}

    plain = all_parsers.read_csv(path, **read_kwargs)
    expected = plain.apply(Categorical)

    result = all_parsers.read_csv(path, dtype="category", **read_kwargs)
    tm.assert_frame_equal(result, expected)
121+
122+
123+
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    """Check chunked reads where each chunk infers its own categories.

    With ``chunksize=2`` the four data rows are expected in two chunks.
    Column ``b`` of each chunk is compared against a ``Categorical``
    built from that chunk's values alone, and the second chunk keeps the
    running index ``[2, 3]``.
    """
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]
    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as actuals:
        chunks = list(actuals)

    # zip() stops at the shorter input, so a reader that yields too few
    # chunks would otherwise pass without every expectation being checked.
    assert len(chunks) == len(expecteds)
    for actual, expected in zip(chunks, expecteds):
        tm.assert_frame_equal(actual, expected)
140+
141+
142+
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    """Check chunked reads with an explicit ``CategoricalDtype``.

    With ``chunksize=2`` the four data rows are expected in two chunks.
    Every chunk must carry the full category list ``["a", "b", "c"]``
    given in the dtype, and the second chunk keeps the running index
    ``[2, 3]``.
    """
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
            index=[2, 3],
        ),
    ]
    dtype = CategoricalDtype(cats)
    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
        chunks = list(actuals)

    # zip() stops at the shorter input, so a reader that yields too few
    # chunks would otherwise pass without every expectation being checked.
    assert len(chunks) == len(expecteds)
    for actual, expected in zip(chunks, expecteds):
        tm.assert_frame_equal(actual, expected)
162+
163+
164+
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    """Check a per-column ``"category"`` dtype on a headerless latin-1 file.

    The file is read once without a dtype and column ``1`` is converted
    with ``Categorical``. That frame is the reference for a second read
    done with ``dtype={1: "category"}``.
    """
    # see gh-10153
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    read_kwargs = {"header": None, "encoding": "latin-1"}

    expected = all_parsers.read_csv(path, **read_kwargs)
    expected[1] = Categorical(expected[1])

    result = all_parsers.read_csv(path, dtype={1: "category"}, **read_kwargs)
    tm.assert_frame_equal(result, expected)
175+
176+
177+
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    """Check that a ``CategoricalDtype`` is applied exactly as given.

    Column ``b`` must come back with the same category list and
    ``ordered`` flag as the dtype. The parametrization covers reordered
    lists and a list with an extra category not present in the data.
    """
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    expected_b = Categorical(
        ["a", "b", "b", "c"], categories=categories, ordered=ordered
    )
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})

    cat_dtype = CategoricalDtype(categories=categories, ordered=ordered)
    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
201+
202+
203+
def test_categorical_category_dtype_unsorted(all_parsers):
    """Check that a dtype with reverse-ordered categories is kept as given.

    The dtype lists its categories as ``["c", "b", "a"]``; column ``b``
    must compare equal to a ``Categorical`` using that same order.
    """
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    cat_dtype = CategoricalDtype(["c", "b", "a"])
    expected_b = Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
220+
221+
222+
def test_categorical_coerces_numeric(all_parsers):
    """Check that CSV text is converted to integer categories.

    The dtype lists the categories ``[1, 2, 3]``; the parsed column is
    expected to equal ``Categorical([1, 1, 2, 3])``.
    """
    cat_dtype = CategoricalDtype([1, 2, 3])
    csv_text = "b\n1\n1\n2\n3"
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
231+
232+
233+
def test_categorical_coerces_datetime(all_parsers):
    """Check that CSV text is converted to datetime categories.

    The dtype's categories are a ``DatetimeIndex`` of three dates, and the
    CSV lists those same dates in order. The expected column is therefore
    a ``Categorical`` built directly from the dtype's categories.
    """
    dates = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    cat_dtype = CategoricalDtype(dates)
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    expected = DataFrame({"b": Categorical(cat_dtype.categories)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
243+
244+
245+
def test_categorical_coerces_timestamp(all_parsers):
    """Check that two spellings of one instant map to one Timestamp category.

    The CSV writes the same moment as ``2014-01-01`` and as
    ``2014-01-01T00:00:00``; both rows are expected to equal the single
    category ``Timestamp("2014")``.
    """
    cat_dtype = CategoricalDtype([Timestamp("2014")])
    csv_text = "b\n2014-01-01\n2014-01-01T00:00:00"
    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
254+
255+
256+
def test_categorical_coerces_timedelta(all_parsers):
    """Check that CSV text is converted to timedelta categories.

    The dtype's categories come from ``pd.to_timedelta`` over three
    strings, and the CSV lists the same three strings in order. The
    expected column is therefore a ``Categorical`` built directly from
    the dtype's categories.
    """
    cat_dtype = CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))
    csv_text = "b\n1H\n2H\n3H"
    expected = DataFrame({"b": Categorical(cat_dtype.categories)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
265+
266+
267+
@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """Check that boolean text in any casing maps to boolean categories.

    Each input spells true/false in a different (or mixed) casing with an
    ``NA`` row in third position. All are expected to parse to
    ``[True, False, None, False]`` under ``CategoricalDtype([False, True])``.
    """
    # see gh-20498
    cat_dtype = CategoricalDtype([False, True])
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = all_parsers.read_csv(StringIO(data), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)
284+
285+
286+
def test_categorical_unexpected_categories(all_parsers):
    """Check parsing when the data holds a value outside the dtype's categories.

    The dtype lists ``["a", "b", "d", "e"]`` but the CSV also contains
    ``"c"``. The parsed column must equal whatever ``Categorical``
    produces for the same values under that dtype.
    """
    cat_dtype = CategoricalDtype(["a", "b", "d", "e"])
    csv_text = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"), dtype=cat_dtype)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)