Skip to content

Commit d9e39f7

Browse files
jbrockmendel authored and TLouf committed
BUG: lib.infer_dtype with mixed-freq Periods (pandas-dev#41526)
1 parent f78f047 commit d9e39f7

File tree

8 files changed, with 112 additions and 52 deletions.

doc/source/whatsnew/v1.3.0.rst

+1 -1
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,7 @@ Other enhancements
228228
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
229229
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
230230
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
231-
-
231+
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
232232

233233
.. ---------------------------------------------------------------------------
234234

pandas/_libs/lib.pyi

+14 -5
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@ def is_integer(val: object) -> bool: ...
4040
def is_float(val: object) -> bool: ...
4141

4242
def is_interval_array(values: np.ndarray) -> bool: ...
43-
def is_period_array(values: np.ndarray) -> bool: ...
4443
def is_datetime64_array(values: np.ndarray) -> bool: ...
4544
def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
4645
def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
@@ -67,50 +66,60 @@ def map_infer(
6766
@overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray
6867
def maybe_convert_objects(
6968
objects: np.ndarray, # np.ndarray[object]
69+
*,
7070
try_float: bool = ...,
7171
safe: bool = ...,
7272
convert_datetime: Literal[False] = ...,
7373
convert_timedelta: bool = ...,
74+
convert_period: Literal[False] = ...,
7475
convert_to_nullable_integer: Literal[False] = ...,
7576
) -> np.ndarray: ...
7677

7778
@overload
7879
def maybe_convert_objects(
7980
objects: np.ndarray, # np.ndarray[object]
81+
*,
8082
try_float: bool = ...,
8183
safe: bool = ...,
82-
convert_datetime: Literal[False] = False,
84+
convert_datetime: bool = ...,
8385
convert_timedelta: bool = ...,
86+
convert_period: bool = ...,
8487
convert_to_nullable_integer: Literal[True] = ...,
8588
) -> ArrayLike: ...
8689

8790
@overload
8891
def maybe_convert_objects(
8992
objects: np.ndarray, # np.ndarray[object]
93+
*,
9094
try_float: bool = ...,
9195
safe: bool = ...,
9296
convert_datetime: Literal[True] = ...,
9397
convert_timedelta: bool = ...,
94-
convert_to_nullable_integer: Literal[False] = ...,
98+
convert_period: bool = ...,
99+
convert_to_nullable_integer: bool = ...,
95100
) -> ArrayLike: ...
96101

97102
@overload
98103
def maybe_convert_objects(
99104
objects: np.ndarray, # np.ndarray[object]
105+
*,
100106
try_float: bool = ...,
101107
safe: bool = ...,
102-
convert_datetime: Literal[True] = ...,
108+
convert_datetime: bool = ...,
103109
convert_timedelta: bool = ...,
104-
convert_to_nullable_integer: Literal[True] = ...,
110+
convert_period: Literal[True] = ...,
111+
convert_to_nullable_integer: bool = ...,
105112
) -> ArrayLike: ...
106113

107114
@overload
108115
def maybe_convert_objects(
109116
objects: np.ndarray, # np.ndarray[object]
117+
*,
110118
try_float: bool = ...,
111119
safe: bool = ...,
112120
convert_datetime: bool = ...,
113121
convert_timedelta: bool = ...,
122+
convert_period: bool = ...,
114123
convert_to_nullable_integer: bool = ...,
115124
) -> ArrayLike: ...
116125

pandas/_libs/lib.pyx

+54 -23
Original file line number | Diff line number | Diff line change
@@ -1186,6 +1186,7 @@ cdef class Seen:
11861186
bint coerce_numeric # coerce data to numeric
11871187
bint timedelta_ # seen_timedelta
11881188
bint datetimetz_ # seen_datetimetz
1189+
bint period_ # seen_period
11891190

11901191
def __cinit__(self, bint coerce_numeric=False):
11911192
"""
@@ -1210,6 +1211,7 @@ cdef class Seen:
12101211
self.datetime_ = False
12111212
self.timedelta_ = False
12121213
self.datetimetz_ = False
1214+
self.period_ = False
12131215
self.coerce_numeric = coerce_numeric
12141216

12151217
cdef inline bint check_uint64_conflict(self) except -1:
@@ -1996,18 +1998,35 @@ cpdef bint is_time_array(ndarray values, bint skipna=False):
19961998
return validator.validate(values)
19971999

19982000

1999-
cdef class PeriodValidator(TemporalValidator):
2000-
cdef inline bint is_value_typed(self, object value) except -1:
2001-
return is_period_object(value)
2001+
cdef bint is_period_array(ndarray[object] values):
2002+
"""
2003+
Is this an ndarray of Period objects (or NaT) with a single `freq`?
2004+
"""
2005+
cdef:
2006+
Py_ssize_t i, n = len(values)
2007+
int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
2008+
object val
20022009

2003-
cdef inline bint is_valid_null(self, object value) except -1:
2004-
return checknull_with_nat(value)
2010+
if len(values) == 0:
2011+
return False
20052012

2013+
for val in values:
2014+
if is_period_object(val):
2015+
if dtype_code == -10000:
2016+
dtype_code = val._dtype._dtype_code
2017+
elif dtype_code != val._dtype._dtype_code:
2018+
# mismatched freqs
2019+
return False
2020+
elif checknull_with_nat(val):
2021+
pass
2022+
else:
2023+
# Not a Period or NaT-like
2024+
return False
20062025

2007-
cpdef bint is_period_array(ndarray values):
2008-
cdef:
2009-
PeriodValidator validator = PeriodValidator(len(values), skipna=True)
2010-
return validator.validate(values)
2026+
if dtype_code == -10000:
2027+
# we saw all-NaTs, no actual Periods
2028+
return False
2029+
return True
20112030

20122031

20132032
cdef class IntervalValidator(Validator):
@@ -2249,9 +2268,13 @@ def maybe_convert_numeric(
22492268

22502269
@cython.boundscheck(False)
22512270
@cython.wraparound(False)
2252-
def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
2253-
bint safe=False, bint convert_datetime=False,
2271+
def maybe_convert_objects(ndarray[object] objects,
2272+
*,
2273+
bint try_float=False,
2274+
bint safe=False,
2275+
bint convert_datetime=False,
22542276
bint convert_timedelta=False,
2277+
bint convert_period=False,
22552278
bint convert_to_nullable_integer=False) -> "ArrayLike":
22562279
"""
22572280
Type inference function-- convert object array to proper dtype
@@ -2272,6 +2295,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
22722295
convert_timedelta : bool, default False
22732296
If an array-like object contains only timedelta values or NaT is
22742297
encountered, whether to convert and return an array of m8[ns] dtype.
2298+
convert_period : bool, default False
2299+
If an array-like object contains only (homogeneous-freq) Period values
2300+
or NaT, whether to convert and return a PeriodArray.
22752301
convert_to_nullable_integer : bool, default False
22762302
If an array-like object contains only integer values (and NaN) is
22772303
encountered, whether to convert and return an IntegerArray.
@@ -2292,7 +2318,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
22922318
int64_t[:] itimedeltas
22932319
Seen seen = Seen()
22942320
object val
2295-
float64_t fval, fnan
2321+
float64_t fval, fnan = np.nan
22962322

22972323
n = len(objects)
22982324

@@ -2311,8 +2337,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
23112337
timedeltas = np.empty(n, dtype='m8[ns]')
23122338
itimedeltas = timedeltas.view(np.int64)
23132339

2314-
fnan = np.nan
2315-
23162340
for i in range(n):
23172341
val = objects[i]
23182342
if itemsize_max != -1:
@@ -2330,7 +2354,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
23302354
idatetimes[i] = NPY_NAT
23312355
if convert_timedelta:
23322356
itimedeltas[i] = NPY_NAT
2333-
if not (convert_datetime or convert_timedelta):
2357+
if not (convert_datetime or convert_timedelta or convert_period):
23342358
seen.object_ = True
23352359
break
23362360
elif val is np.nan:
@@ -2343,14 +2367,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
23432367
elif util.is_float_object(val):
23442368
floats[i] = complexes[i] = val
23452369
seen.float_ = True
2346-
elif util.is_datetime64_object(val):
2347-
if convert_datetime:
2348-
idatetimes[i] = convert_to_tsobject(
2349-
val, None, None, 0, 0).value
2350-
seen.datetime_ = True
2351-
else:
2352-
seen.object_ = True
2353-
break
23542370
elif is_timedelta(val):
23552371
if convert_timedelta:
23562372
itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8")
@@ -2396,6 +2412,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
23962412
else:
23972413
seen.object_ = True
23982414
break
2415+
elif is_period_object(val):
2416+
if convert_period:
2417+
seen.period_ = True
2418+
break
2419+
else:
2420+
seen.object_ = True
2421+
break
23992422
elif try_float and not isinstance(val, str):
24002423
# this will convert Decimal objects
24012424
try:
@@ -2419,6 +2442,14 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
24192442
return dti._data
24202443
seen.object_ = True
24212444

2445+
if seen.period_:
2446+
if is_period_array(objects):
2447+
from pandas import PeriodIndex
2448+
pi = PeriodIndex(objects)
2449+
2450+
# unbox to PeriodArray
2451+
return pi._data
2452+
24222453
if not seen.object_:
24232454
result = None
24242455
if not safe:

pandas/core/arrays/datetimelike.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -261,7 +261,7 @@ def _box_values(self, values) -> np.ndarray:
261261
"""
262262
apply box func to passed values
263263
"""
264-
return lib.map_infer(values, self._box_func)
264+
return lib.map_infer(values, self._box_func, convert=False)
265265

266266
def __iter__(self):
267267
if self.ndim > 1:

pandas/core/dtypes/cast.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -1315,6 +1315,7 @@ def soft_convert_objects(
13151315
datetime: bool = True,
13161316
numeric: bool = True,
13171317
timedelta: bool = True,
1318+
period: bool = True,
13181319
copy: bool = True,
13191320
) -> ArrayLike:
13201321
"""
@@ -1327,6 +1328,7 @@ def soft_convert_objects(
13271328
datetime : bool, default True
13281329
numeric: bool, default True
13291330
timedelta : bool, default True
1331+
period : bool, default True
13301332
copy : bool, default True
13311333
13321334
Returns
@@ -1348,7 +1350,10 @@ def soft_convert_objects(
13481350
# bound of nanosecond-resolution 64-bit integers.
13491351
try:
13501352
converted = lib.maybe_convert_objects(
1351-
values, convert_datetime=datetime, convert_timedelta=timedelta
1353+
values,
1354+
convert_datetime=datetime,
1355+
convert_timedelta=timedelta,
1356+
convert_period=period,
13521357
)
13531358
except (OutOfBoundsDatetime, ValueError):
13541359
return values

pandas/tests/dtypes/test_inference.py

+14 -2
Original file line number | Diff line number | Diff line change
@@ -1105,8 +1105,9 @@ def test_infer_dtype_period(self):
11051105
arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="D")])
11061106
assert lib.infer_dtype(arr, skipna=True) == "period"
11071107

1108+
# non-homogeneous freqs -> mixed
11081109
arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")])
1109-
assert lib.infer_dtype(arr, skipna=True) == "period"
1110+
assert lib.infer_dtype(arr, skipna=True) == "mixed"
11101111

11111112
@pytest.mark.parametrize("klass", [pd.array, Series, Index])
11121113
@pytest.mark.parametrize("skipna", [True, False])
@@ -1121,6 +1122,18 @@ def test_infer_dtype_period_array(self, klass, skipna):
11211122
)
11221123
assert lib.infer_dtype(values, skipna=skipna) == "period"
11231124

1125+
# periods but mixed freq
1126+
values = klass(
1127+
[
1128+
Period("2011-01-01", freq="D"),
1129+
Period("2011-01-02", freq="M"),
1130+
pd.NaT,
1131+
]
1132+
)
1133+
# with pd.array this becomes PandasArray which ends up as "unknown-array"
1134+
exp = "unknown-array" if klass is pd.array else "mixed"
1135+
assert lib.infer_dtype(values, skipna=skipna) == exp
1136+
11241137
def test_infer_dtype_period_mixed(self):
11251138
arr = np.array(
11261139
[Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object
@@ -1319,7 +1332,6 @@ def test_is_datetimelike_array_all_nan_nat_like(self):
13191332
"is_date_array",
13201333
"is_time_array",
13211334
"is_interval_array",
1322-
"is_period_array",
13231335
],
13241336
)
13251337
def test_other_dtypes_for_array(self, func):

pandas/tests/frame/methods/test_replace.py

+3 -5
Original file line number | Diff line number | Diff line change
@@ -1021,11 +1021,9 @@ def test_replace_period(self):
10211021
columns=["fname"],
10221022
)
10231023
assert set(df.fname.values) == set(d["fname"].keys())
1024-
# We don't support converting object -> specialized EA in
1025-
# replace yet.
1026-
expected = DataFrame(
1027-
{"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object
1028-
)
1024+
1025+
expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]})
1026+
assert expected.dtypes[0] == "Period[M]"
10291027
result = df.replace(d)
10301028
tm.assert_frame_equal(result, expected)
10311029

0 commit comments

Comments
 (0)