Skip to content

Commit 079289c

Browse files
authored
BUG: to_csv casting datetimes in categorical to int (#44930)
1 parent d64df84 commit 079289c

File tree

6 files changed

+50
-9
lines changed

6 files changed

+50
-9
lines changed

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,7 @@ Categorical
613613
^^^^^^^^^^^
614614
- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
615615
- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
616+
- Bug in :meth:`Categorical.astype` casting datetimes and :class:`Timestamp` to int for dtype ``object`` (:issue:`44930`)
616617
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
617618
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
618619
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
@@ -760,6 +761,7 @@ I/O
760761
- Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
761762
- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
762763
- Bug in :func:`read_csv` not raising a ``ValueError`` when ``\n`` was specified as ``delimiter`` or ``sep``, which conflicts with ``lineterminator`` (:issue:`43528`)
764+
- Bug in :meth:`Series.to_csv` converting datetimes in categorical :class:`Series` to integers (:issue:`40754`)
763765
- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
764766
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
765767
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)

pandas/core/arrays/categorical.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@
108108
)
109109
import pandas.core.common as com
110110
from pandas.core.construction import (
111+
ensure_wrapped_if_datetimelike,
111112
extract_array,
112113
sanitize_array,
113114
)
@@ -538,8 +539,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
538539

539540
else:
540541
# GH8628 (PERF): astype category codes instead of astyping array
541-
try:
542+
if is_datetime64_dtype(self.categories):
543+
new_cats = ensure_wrapped_if_datetimelike(self.categories._values)
544+
else:
542545
new_cats = np.asarray(self.categories)
546+
547+
try:
543548
new_cats = new_cats.astype(dtype=dtype, copy=copy)
544549
fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))
545550
except (

pandas/core/internals/blocks.py

+10
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
soft_convert_objects,
4747
)
4848
from pandas.core.dtypes.common import (
49+
ensure_platform_int,
4950
is_1d_only_ea_dtype,
5051
is_1d_only_ea_obj,
5152
is_dtype_equal,
@@ -88,6 +89,7 @@
8889
replace_regex,
8990
should_use_regex,
9091
)
92+
from pandas.core.array_algos.take import take_nd
9193
from pandas.core.array_algos.transforms import shift
9294
from pandas.core.arrays import (
9395
Categorical,
@@ -2087,6 +2089,14 @@ def to_native_types(
20872089
**kwargs,
20882090
) -> np.ndarray:
20892091
"""convert to our native types format"""
2092+
if isinstance(values, Categorical):
2093+
# GH#40754 Convert categorical datetimes to datetime array
2094+
values = take_nd(
2095+
values.categories._values,
2096+
ensure_platform_int(values._codes),
2097+
fill_value=na_rep,
2098+
)
2099+
20902100
values = ensure_wrapped_if_datetimelike(values)
20912101

20922102
if isinstance(values, (DatetimeArray, TimedeltaArray)):

pandas/tests/arrays/categorical/test_dtypes.py

+16
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
Categorical,
88
CategoricalIndex,
99
Index,
10+
NaT,
1011
Series,
1112
Timestamp,
13+
to_datetime,
1214
)
1315
import pandas._testing as tm
1416

@@ -176,6 +178,20 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
176178
expected = cat
177179
tm.assert_categorical_equal(result, expected)
178180

181+
def test_astype_object_datetime_categories(self):
182+
# GH#40754
183+
cat = Categorical(to_datetime(["2021-03-27", NaT]))
184+
result = cat.astype(object)
185+
expected = np.array([Timestamp("2021-03-27 00:00:00"), np.nan], dtype="object")
186+
tm.assert_numpy_array_equal(result, expected)
187+
188+
def test_astype_object_timestamp_categories(self):
189+
# GH#18024
190+
cat = Categorical([Timestamp("2014-01-01")])
191+
result = cat.astype(object)
192+
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
193+
tm.assert_numpy_array_equal(result, expected)
194+
179195
def test_iter_python_types(self):
180196
# GH-19909
181197
cat = Categorical([1, 2])

pandas/tests/io/formats/test_to_csv.py

+15
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,21 @@ def test_to_csv_date_format(self):
282282
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
283283
assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
284284

285+
def test_to_csv_date_format_in_categorical(self):
286+
# GH#40754
287+
ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
288+
ser = ser.astype("category")
289+
expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
290+
assert ser.to_csv(index=False) == expected
291+
292+
ser = pd.Series(
293+
pd.date_range(
294+
start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
295+
).append(pd.DatetimeIndex([pd.NaT]))
296+
)
297+
ser = ser.astype("category")
298+
assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected
299+
285300
def test_to_csv_multi_index(self):
286301
# see gh-6618
287302
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

pandas/tests/series/indexing/test_where.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
import pandas.util._test_decorators as td
5-
64
from pandas.core.dtypes.common import is_integer
75

86
import pandas as pd
@@ -440,15 +438,10 @@ def test_where_categorical(frame_or_series):
440438
tm.assert_equal(exp, res)
441439

442440

443-
def test_where_datetimelike_categorical(request, tz_naive_fixture, using_array_manager):
441+
def test_where_datetimelike_categorical(request, tz_naive_fixture):
444442
# GH#37682
445443
tz = tz_naive_fixture
446444

447-
if using_array_manager and tz is None:
448-
# TODO(ArrayManager) DataFrame.values not yet correctly returning datetime array
449-
# for categorical with datetime categories
450-
td.mark_array_manager_not_yet_implemented(request)
451-
452445
dr = date_range("2001-01-01", periods=3, tz=tz)._with_freq(None)
453446
lvals = pd.DatetimeIndex([dr[0], dr[1], pd.NaT])
454447
rvals = pd.Categorical([dr[0], pd.NaT, dr[2]])

0 commit comments

Comments
 (0)