|
| 1 | +""" |
| 2 | +Tests dtype specification during parsing |
| 3 | +for all of the parsers defined in parsers.py |
| 4 | +""" |
| 5 | +from io import StringIO |
| 6 | +import os |
| 7 | + |
| 8 | +import numpy as np |
| 9 | +import pytest |
| 10 | + |
| 11 | +from pandas.core.dtypes.dtypes import CategoricalDtype |
| 12 | + |
| 13 | +import pandas as pd |
| 14 | +from pandas import Categorical, DataFrame, Timestamp |
| 15 | +import pandas._testing as tm |
| 16 | + |
| 17 | + |
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    """
    Read a three-column CSV with every column requested as categorical.

    The expected frame holds the raw field text of each column wrapped in
    ``Categorical``.  See gh-10153.
    """
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)

    # Field values are spelled as strings, exactly as they appear in the CSV.
    raw_columns = {
        "a": ["1", "1", "2"],
        "b": ["a", "a", "b"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {label: Categorical(values) for label, values in raw_columns.items()}
    )
    tm.assert_frame_equal(result, expected)
| 42 | + |
| 43 | + |
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    """
    Request a categorical dtype for a single column only.

    The dtype mapping is keyed either by the label ``"b"`` or by the integer
    ``1``; in both cases the expected frame has a categorical ``b`` and plain
    numeric ``a`` and ``c``.  See gh-10153.
    """
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)

    expected = DataFrame(
        {
            "a": [1, 1, 2],
            "b": Categorical(["a", "a", "b"]),
            "c": [3.4, 3.4, 4.5],
        }
    )
    tm.assert_frame_equal(result, expected)
| 57 | + |
| 58 | + |
def test_categorical_dtype_unsorted(all_parsers):
    """
    Read all columns as categorical when column ``b`` is not in sorted order.

    In the data ``"b"`` appears before ``"a"``.  See gh-10153.
    """
    csv_text = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    raw_columns = {
        "a": ["1", "1", "2"],
        "b": ["b", "b", "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {label: Categorical(values) for label, values in raw_columns.items()}
    )
    tm.assert_frame_equal(result, expected)
| 75 | + |
| 76 | + |
def test_categorical_dtype_missing(all_parsers):
    """
    Read all columns as categorical when one field contains ``nan``.

    The literal text ``nan`` in column ``b`` is expected to come back as
    ``np.nan`` in the resulting categorical.  See gh-10153.
    """
    csv_text = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    col_a = Categorical(["1", "1", "2"])
    col_b = Categorical(["b", np.nan, "a"])
    col_c = Categorical(["3.4", "3.4", "4.5"])
    expected = DataFrame({"a": col_a, "b": col_b, "c": col_c})
    tm.assert_frame_equal(result, expected)
| 93 | + |
| 94 | + |
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    """
    Read a single categorical column holding 524289 distinct numeric strings.

    The parsed categories are re-sorted and marked ordered before comparing,
    so the check does not depend on the category order the parser produced.
    See gh-18186.
    """
    values = np.sort([str(number) for number in range(524289)])
    csv_text = "a\n" + "\n".join(values)

    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    sorted_categories = np.sort(result.a.cat.categories)
    result["a"] = result["a"].cat.reorder_categories(sorted_categories, ordered=True)

    expected = DataFrame({"a": Categorical(values, ordered=True)})
    tm.assert_frame_equal(result, expected)
| 107 | + |
| 108 | + |
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    """
    Read the tab-separated, UTF-16 encoded ``utf16_ex.txt`` as categorical.

    The expected frame is the same file read without a dtype, with
    ``Categorical`` applied to every column afterwards.  See gh-10153.
    """
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    read_options = {"sep": "\t", "encoding": "utf-16"}

    expected = all_parsers.read_csv(path, **read_options).apply(Categorical)
    result = all_parsers.read_csv(path, dtype="category", **read_options)

    tm.assert_frame_equal(result, expected)
| 121 | + |
| 122 | + |
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    """
    Check chunked reads where column ``b`` is requested as ``"category"``.

    The four data rows are read with ``chunksize=2``.  Each expected chunk
    builds ``b`` from only the values present in that chunk, and the second
    chunk carries index labels ``[2, 3]``.  See gh-10153.
    """
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]
    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as reader:
        actuals = list(reader)

    # zip() stops at the shorter iterable, so a missing or extra chunk would
    # go unnoticed without an explicit count check.
    assert len(actuals) == len(expecteds)
    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)
| 140 | + |
| 141 | + |
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    """
    Check chunked reads where column ``b`` uses an explicit ``CategoricalDtype``.

    The four data rows are read with ``chunksize=2``.  Every expected chunk
    uses the full category list ``["a", "b", "c"]``, regardless of which
    values occur in that chunk.  See gh-10153.
    """
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
            index=[2, 3],
        ),
    ]
    dtype = CategoricalDtype(cats)
    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as reader:
        actuals = list(reader)

    # zip() stops at the shorter iterable, so a missing or extra chunk would
    # go unnoticed without an explicit count check.
    assert len(actuals) == len(expecteds)
    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)
| 162 | + |
| 163 | + |
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    """
    Read the headerless, latin-1 encoded ``unicode_series.csv`` with column
    ``1`` requested as categorical.

    The expected frame is the same file read without a dtype, with column
    ``1`` converted through ``Categorical`` afterwards.  See gh-10153.
    """
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    read_options = {"header": None, "encoding": "latin-1"}

    expected = all_parsers.read_csv(path, **read_options)
    expected[1] = Categorical(expected[1])

    result = all_parsers.read_csv(path, dtype={1: "category"}, **read_options)
    tm.assert_frame_equal(result, expected)
| 175 | + |
| 176 | + |
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    """
    Read column ``b`` with a ``CategoricalDtype`` built from the given
    ``categories`` and ``ordered`` flag.

    The expected column is a ``Categorical`` constructed from the same values
    with those same ``categories`` and ``ordered`` settings.
    """
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    requested = CategoricalDtype(categories=categories, ordered=ordered)
    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})

    expected_b = Categorical(
        ["a", "b", "b", "c"], categories=categories, ordered=ordered
    )
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})
    tm.assert_frame_equal(result, expected)
| 201 | + |
| 202 | + |
def test_categorical_category_dtype_unsorted(all_parsers):
    """
    Read column ``b`` with a ``CategoricalDtype`` whose categories are listed
    as ``["c", "b", "a"]``.

    The expected column keeps that category order.
    """
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    requested = CategoricalDtype(["c", "b", "a"])
    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})

    expected_b = Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})
    tm.assert_frame_equal(result, expected)
| 220 | + |
| 221 | + |
def test_categorical_coerces_numeric(all_parsers):
    """
    Read column ``b`` with integer categories ``[1, 2, 3]``.

    The expected column is a ``Categorical`` of integers, not of the raw
    field text.
    """
    csv_text = "b\n1\n1\n2\n3"
    requested = {"b": CategoricalDtype([1, 2, 3])}

    result = all_parsers.read_csv(StringIO(csv_text), dtype=requested)

    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
    tm.assert_frame_equal(result, expected)
| 231 | + |
| 232 | + |
def test_categorical_coerces_datetime(all_parsers):
    """
    Read column ``b`` with categories given as a ``DatetimeIndex``.

    The CSV lists the same three dates as text; the expected column is a
    ``Categorical`` built directly from the dtype's categories.
    """
    categories = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    requested = CategoricalDtype(categories)
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})

    expected = DataFrame({"b": Categorical(requested.categories)})
    tm.assert_frame_equal(result, expected)
| 243 | + |
| 244 | + |
def test_categorical_coerces_timestamp(all_parsers):
    """
    Read column ``b`` with a single ``Timestamp("2014")`` category.

    The CSV spells the value two different ways (date only, and date with a
    midnight time); both rows are expected to equal ``Timestamp("2014")``.
    """
    csv_text = "b\n2014-01-01\n2014-01-01T00:00:00"
    requested = {"b": CategoricalDtype([Timestamp("2014")])}

    result = all_parsers.read_csv(StringIO(csv_text), dtype=requested)

    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
    tm.assert_frame_equal(result, expected)
| 254 | + |
| 255 | + |
def test_categorical_coerces_timedelta(all_parsers):
    """
    Read column ``b`` with categories produced by ``pd.to_timedelta``.

    The CSV lists the same three durations as text; the expected column is a
    ``Categorical`` built directly from the dtype's categories.
    """
    requested = CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))
    csv_text = "b\n1H\n2H\n3H"

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})

    expected = DataFrame({"b": Categorical(requested.categories)})
    tm.assert_frame_equal(result, expected)
| 265 | + |
| 266 | + |
@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """
    Read column ``b`` with boolean categories ``[False, True]``.

    Each parametrized input spells the booleans with different letter casing
    and includes one ``NA`` row; all are expected to give
    ``[True, False, None, False]``.  See gh-20498.
    """
    requested = {"b": CategoricalDtype([False, True])}

    result = all_parsers.read_csv(StringIO(data), dtype=requested)

    expected = DataFrame({"b": Categorical([True, False, None, False])})
    tm.assert_frame_equal(result, expected)
| 284 | + |
| 285 | + |
def test_categorical_unexpected_categories(all_parsers):
    """
    Read column ``b`` when the data holds a value outside the declared
    categories.

    The dtype declares ``["a", "b", "d", "e"]`` but the data contains ``c``.
    The expected column is what ``Categorical`` yields for the same raw values
    under that dtype.
    """
    requested = CategoricalDtype(["a", "b", "d", "e"])
    csv_text = "b\nd\na\nc\nd"  # "c" is not a declared category

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})

    expected = DataFrame({"b": Categorical(list("dacd"), dtype=requested)})
    tm.assert_frame_equal(result, expected)
0 commit comments