Skip to content

TST/REF: io/parser/(test_dtypes.py, test_usecols.py) #38578

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 294 additions & 0 deletions pandas/tests/io/parser/dtypes/test_categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import Categorical, DataFrame, Timestamp
import pandas._testing as tm


@pytest.mark.parametrize(
"dtype",
[
"category",
CategoricalDtype(),
{"a": "category", "b": "category", "c": CategoricalDtype()},
],
)
def test_categorical_dtype(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["a", "a", "b"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_unsorted(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", "b", "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_missing(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", np.nan, "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)


@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
# see gh-18186
parser = all_parsers
data = np.sort([str(i) for i in range(524289)])
expected = DataFrame({"a": Categorical(data, ordered=True)})

actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
actual["a"] = actual["a"].cat.reorder_categories(
np.sort(actual.a.cat.categories), ordered=True
)
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
encoding = "utf-16"
sep = "\t"

expected = parser.read_csv(pth, sep=sep, encoding=encoding)
expected = expected.apply(Categorical)

actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
with parser.read_csv(
StringIO(data), dtype={"b": "category"}, chunksize=2
) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ["a", "b", "c"]
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
DataFrame(
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
index=[2, 3],
),
]
dtype = CategoricalDtype(cats)
with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
encoding = "latin-1"

expected = parser.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])

actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
"categories",
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(
["a", "b", "b", "c"], categories=categories, ordered=ordered
),
}
)

dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(["c", "b", "a"])
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
}
)

result = parser.read_csv(StringIO(data), dtype={"b": dtype})
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([1, 2, 3])}

data = "b\n1\n1\n2\n3"
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
dtype = {"b": CategoricalDtype(dti)}

data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([Timestamp("2014")])}

data = "b\n2014-01-01\n2014-01-01T00:00:00"
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}

data = "b\n1H\n2H\n3H"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"data",
[
"b\nTrue\nFalse\nNA\nFalse",
"b\ntrue\nfalse\nNA\nfalse",
"b\nTRUE\nFALSE\nNA\nFALSE",
"b\nTrue\nFalse\nNA\nFALSE",
],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
# see gh-20498
parser = all_parsers
dtype = {"b": CategoricalDtype([False, True])}
expected = DataFrame({"b": Categorical([True, False, None, False])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

data = "b\nd\na\nc\nd" # Unexpected c
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
Loading