Skip to content

Commit 4b588cd

Browse files
committed
Added casting
1 parent c5f6e04 commit 4b588cd

File tree

3 files changed

+59
-22
lines changed

3 files changed

+59
-22
lines changed

pandas/_libs/parsers.pyx

+21 -3
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ from pandas.core.dtypes.common import (
4848
from pandas.core.categorical import Categorical
4949
from pandas.core.algorithms import take_1d
5050
from pandas.core.dtypes.concat import union_categoricals
51-
from pandas import Index
51+
from pandas import Index, to_numeric, to_datetime, to_timedelta
5252

5353
import pandas.io.common as com
5454

@@ -1267,12 +1267,30 @@ cdef class TextReader:
12671267
return self._string_convert(i, start, end, na_filter,
12681268
na_hashset)
12691269
elif is_categorical_dtype(dtype):
1270-
# TODO: I suspect that this could be optimized when dtype
1271-
# is an instance of CategoricalDtype
1270+
# TODO: I suspect that _categorical_convert could be
1271+
# optimized when dtype is an instance of CategoricalDtype
12721272
codes, cats, na_count = _categorical_convert(
12731273
self.parser, i, start, end, na_filter,
12741274
na_hashset, self.c_encoding)
12751275
cats = Index(cats)
1276+
1277+
# Here is where we'll do the casting...
1278+
if (isinstance(dtype, CategoricalDtype) and
1279+
dtype.categories is not None):
1280+
if dtype.categories.is_numeric():
1281+
# is ignore correct?
1282+
cats = to_numeric(cats, errors='ignore')
1283+
elif dtype.categories.is_all_dates:
1284+
# is ignore correct?
1285+
if is_datetime64_dtype(dtype.categories):
1286+
print("before", cats)
1287+
cats = to_datetime(cats, errors='ignore')
1288+
print("after", cats)
1289+
else:
1290+
print("before", cats)
1291+
cats = to_timedelta(cats, errors='ignore')
1292+
print("after", cats)
1293+
12761294
if (isinstance(dtype, CategoricalDtype) and
12771295
dtype.categories is not None):
12781296
# recode for dtype.categories

pandas/io/parsers.py

+16 -5
Original file line number | Diff line number | Diff line change
@@ -12,15 +12,16 @@
1212

1313
import numpy as np
1414

15-
from pandas import compat
15+
from pandas import compat, to_numeric, to_timedelta
1616
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
1717
zip, string_types, map, u)
1818
from pandas.core.dtypes.common import (
1919
is_integer, _ensure_object,
2020
is_list_like, is_integer_dtype,
2121
is_float, is_dtype_equal,
2222
is_object_dtype, is_string_dtype,
23-
is_scalar, is_categorical_dtype)
23+
is_scalar, is_categorical_dtype,
24+
is_datetime64_dtype, is_timedelta64_dtype)
2425
from pandas.core.dtypes.dtypes import CategoricalDtype
2526
from pandas.core.dtypes.missing import isna
2627
from pandas.core.dtypes.cast import astype_nansafe
@@ -1606,11 +1607,21 @@ def _cast_types(self, values, cast_type, column):
16061607
# XXX this is for consistency with
16071608
# c-parser which parses all categories
16081609
# as strings
1609-
if not is_object_dtype(values):
1610-
values = astype_nansafe(values, str)
1611-
if isinstance(cast_type, CategoricalDtype):
1610+
known_cats = (isinstance(cast_type, CategoricalDtype) and
1611+
cast_type.categories is not None)
1612+
str_values = is_object_dtype(values)
1613+
1614+
if known_cats and str_values:
1615+
if cast_type.categories.is_numeric():
1616+
values = to_numeric(values, errors='ignore')
1617+
elif is_datetime64_dtype(cast_type.categories):
1618+
values = tools.to_datetime(values, errors='ignore')
1619+
elif is_timedelta64_dtype(cast_type.categories):
1620+
values = to_timedelta(values, errors='ignore')
16121621
values = Categorical(values, categories=cast_type.categories,
16131622
ordered=cast_type.ordered)
1623+
elif not is_object_dtype(values):
1624+
values = astype_nansafe(values, str)
16141625
else:
16151626
values = Categorical(values)
16161627
else:

pandas/tests/io/parser/dtypes.py

+22 -14
Original file line number | Diff line number | Diff line change
@@ -187,20 +187,28 @@ def test_categorical_categoricaldtype_unsorted(self):
187187
result = self.read_csv(StringIO(data), dtype={'b': dtype})
188188
tm.assert_frame_equal(result, expected)
189189

190-
# @pytest.mark.parametrize('ordered', [True, False])
191-
# def test_categoricaldtype_coerces(self, ordered):
192-
# dtype = {'b': CategoricalDtype([10, 11, 12, 13], ordered=ordered)}
193-
# data = """a,b
194-
# 1,10
195-
# 1,11
196-
# 1,12
197-
# 2,13"""
198-
# expected = pd.DataFrame({
199-
# 'a': [1, 1, 1, 2],
200-
# 'b': Categorical([10, 11, 12, 13], ordered=ordered),
201-
# }, columns=['a', 'b'])
202-
# result = self.read_csv(StringIO(data), dtype=dtype)
203-
# tm.assert_frame_equal(result, expected)
190+
def test_categoricaldtype_coerces_numeric(self):
191+
dtype = {'b': CategoricalDtype([1, 2, 3])}
192+
data = "b\n1\n1\n2\n3"
193+
expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
194+
result = self.read_csv(StringIO(data), dtype=dtype)
195+
tm.assert_frame_equal(result, expected)
196+
197+
def test_categoricaldtype_coerces_datetime(self):
198+
dtype = {
199+
'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
200+
}
201+
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
202+
expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
203+
result = self.read_csv(StringIO(data), dtype=dtype)
204+
tm.assert_frame_equal(result, expected)
205+
206+
def test_categoricaldtype_coerces_timedelta(self):
207+
dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
208+
data = "b\n1H\n2H\n3H"
209+
expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
210+
result = self.read_csv(StringIO(data), dtype=dtype)
211+
tm.assert_frame_equal(result, expected)
204212

205213
def test_categorical_categoricaldtype_chunksize(self):
206214
# GH 10153

0 commit comments

Comments
 (0)