Skip to content

Commit 7403240

Browse files
committed
Merge remote-tracking branch 'upstream/main' into 47216
# Conflicts: # doc/source/whatsnew/v1.5.0.rst
2 parents 3c614a9 + 7d2f9b8 commit 7403240

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+1228
-315
lines changed

doc/source/reference/testing.rst

+2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ Exceptions and warnings
4343
errors.ParserError
4444
errors.ParserWarning
4545
errors.PerformanceWarning
46+
errors.PyperclipException
47+
errors.PyperclipWindowsException
4648
errors.SettingWithCopyError
4749
errors.SettingWithCopyWarning
4850
errors.SpecificationError

doc/source/whatsnew/v1.4.4.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Fixed regressions
2323

2424
Bug fixes
2525
~~~~~~~~~
26-
-
26+
- The :class:`FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
2727
-
2828

2929
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.5.0.rst

+35-25
Large diffs are not rendered by default.

pandas/_libs/tslibs/conversion.pyx

+3-4
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ from pandas._libs.tslibs.np_datetime cimport (
4040
NPY_FR_ns,
4141
astype_overflowsafe,
4242
check_dts_bounds,
43-
dt64_to_dtstruct,
4443
dtstruct_to_dt64,
4544
get_datetime64_unit,
4645
get_datetime64_value,
@@ -248,7 +247,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
248247
elif is_datetime64_object(ts):
249248
obj.value = get_datetime64_nanos(ts)
250249
if obj.value != NPY_NAT:
251-
dt64_to_dtstruct(obj.value, &obj.dts)
250+
pandas_datetime_to_datetimestruct(obj.value, NPY_FR_ns, &obj.dts)
252251
elif is_integer_object(ts):
253252
try:
254253
ts = <int64_t>ts
@@ -266,7 +265,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
266265

267266
ts = ts * cast_from_unit(None, unit)
268267
obj.value = ts
269-
dt64_to_dtstruct(ts, &obj.dts)
268+
pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
270269
elif is_float_object(ts):
271270
if ts != ts or ts == NPY_NAT:
272271
obj.value = NPY_NAT
@@ -289,7 +288,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
289288

290289
ts = cast_from_unit(ts, unit)
291290
obj.value = ts
292-
dt64_to_dtstruct(ts, &obj.dts)
291+
pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
293292
elif PyDateTime_Check(ts):
294293
return convert_datetime_to_tsobject(ts, tz, nanos)
295294
elif PyDate_Check(ts):

pandas/_libs/tslibs/np_datetime.pxd

-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1
7676
cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?)
7777

7878
cdef int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil
79-
cdef void dt64_to_dtstruct(int64_t dt64, npy_datetimestruct* out) nogil
8079

8180
cdef int64_t pydatetime_to_dt64(datetime val, npy_datetimestruct *dts)
8281
cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts)

pandas/_libs/tslibs/np_datetime.pyx

-8
Original file line numberDiff line numberDiff line change
@@ -217,14 +217,6 @@ cdef inline int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil:
217217
return npy_datetimestruct_to_datetime(NPY_FR_ns, dts)
218218

219219

220-
cdef inline void dt64_to_dtstruct(int64_t dt64,
221-
npy_datetimestruct* out) nogil:
222-
"""Convenience function to call pandas_datetime_to_datetimestruct
223-
with the by-far-most-common frequency NPY_FR_ns"""
224-
pandas_datetime_to_datetimestruct(dt64, NPY_FR_ns, out)
225-
return
226-
227-
228220
# just exposed for testing at the moment
229221
def py_td64_to_tdstruct(int64_t td64, NPY_DATETIMEUNIT unit):
230222
cdef:

pandas/_libs/tslibs/period.pyx

+2-4
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ from pandas._libs.tslibs.np_datetime cimport (
4949
NPY_FR_us,
5050
astype_overflowsafe,
5151
check_dts_bounds,
52-
dt64_to_dtstruct,
53-
dtstruct_to_dt64,
5452
get_timedelta64_value,
5553
npy_datetimestruct,
5654
npy_datetimestruct_to_datetime,
@@ -813,7 +811,7 @@ cdef void get_date_info(int64_t ordinal, int freq, npy_datetimestruct *dts) nogi
813811

814812
pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, dts)
815813

816-
dt64_to_dtstruct(nanos, &dts2)
814+
pandas_datetime_to_datetimestruct(nanos, NPY_DATETIMEUNIT.NPY_FR_ns, &dts2)
817815
dts.hour = dts2.hour
818816
dts.min = dts2.min
819817
dts.sec = dts2.sec
@@ -1149,7 +1147,7 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1:
11491147
get_date_info(ordinal, freq, &dts)
11501148

11511149
check_dts_bounds(&dts)
1152-
return dtstruct_to_dt64(&dts)
1150+
return npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts)
11531151

11541152

11551153
cdef str period_format(int64_t value, int freq, object fmt=None):

pandas/_libs/tslibs/vectorized.pyx

-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ from .nattype cimport (
3030
from .np_datetime cimport (
3131
NPY_DATETIMEUNIT,
3232
NPY_FR_ns,
33-
dt64_to_dtstruct,
3433
npy_datetimestruct,
3534
pandas_datetime_to_datetimestruct,
3635
)

pandas/core/algorithms.py

+97-13
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
from __future__ import annotations
66

7+
import inspect
78
import operator
89
from textwrap import dedent
910
from typing import (
@@ -14,7 +15,7 @@
1415
cast,
1516
final,
1617
)
17-
from warnings import warn
18+
import warnings
1819

1920
import numpy as np
2021

@@ -57,6 +58,7 @@
5758
is_numeric_dtype,
5859
is_object_dtype,
5960
is_scalar,
61+
is_signed_integer_dtype,
6062
is_timedelta64_dtype,
6163
needs_i8_conversion,
6264
)
@@ -446,7 +448,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
446448
)
447449

448450
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
449-
values = _ensure_arraylike(list(values))
451+
if not is_signed_integer_dtype(comps):
452+
# GH#46485 Use object to avoid upcast to float64 later
453+
# TODO: Share with _find_common_type_compat
454+
values = construct_1d_object_array_from_listlike(list(values))
455+
else:
456+
values = _ensure_arraylike(list(values))
450457
elif isinstance(values, ABCMultiIndex):
451458
# Avoid raising in extract_array
452459
values = np.array(values)
@@ -580,7 +587,8 @@ def factorize_array(
580587
def factorize(
581588
values,
582589
sort: bool = False,
583-
na_sentinel: int | None = -1,
590+
na_sentinel: int | None | lib.NoDefault = lib.no_default,
591+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
584592
size_hint: int | None = None,
585593
) -> tuple[np.ndarray, np.ndarray | Index]:
586594
"""
@@ -598,7 +606,19 @@ def factorize(
598606
Value to mark "not found". If None, will not drop the NaN
599607
from the uniques of the values.
600608
609+
.. deprecated:: 1.5.0
610+
The na_sentinel argument is deprecated and
611+
will be removed in a future version of pandas. Specify use_na_sentinel as
612+
either True or False.
613+
601614
.. versionchanged:: 1.1.2
615+
616+
use_na_sentinel : bool, default True
617+
If True, the sentinel -1 will be used for NaN values. If False,
618+
NaN values will be encoded as non-negative integers and NaN will be
619+
kept in the uniques of the values.
620+
621+
.. versionadded:: 1.5.0
602622
{size_hint}\
603623
604624
Returns
@@ -646,8 +666,8 @@ def factorize(
646666
>>> uniques
647667
array(['a', 'b', 'c'], dtype=object)
648668
649-
Missing values are indicated in `codes` with `na_sentinel`
650-
(``-1`` by default). Note that missing values are never
669+
When ``use_na_sentinel=True`` (the default), missing values are indicated in
670+
the `codes` with the sentinel value ``-1`` and missing values are not
651671
included in `uniques`.
652672
653673
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -682,16 +702,16 @@ def factorize(
682702
Index(['a', 'c'], dtype='object')
683703
684704
If NaN is in the values, and we want to include NaN in the uniques of the
685-
values, it can be achieved by setting ``na_sentinel=None``.
705+
values, it can be achieved by setting ``use_na_sentinel=False``.
686706
687707
>>> values = np.array([1, 2, 1, np.nan])
688-
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
708+
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
689709
>>> codes
690710
array([ 0, 1, 0, -1])
691711
>>> uniques
692712
array([1., 2.])
693713
694-
>>> codes, uniques = pd.factorize(values, na_sentinel=None)
714+
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
695715
>>> codes
696716
array([0, 1, 0, 2])
697717
>>> uniques
@@ -706,6 +726,7 @@ def factorize(
706726
# responsible only for factorization. All data coercion, sorting and boxing
707727
# should happen here.
708728

729+
na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
709730
if isinstance(values, ABCRangeIndex):
710731
return values.factorize(sort=sort)
711732

@@ -730,9 +751,22 @@ def factorize(
730751
codes, uniques = values.factorize(sort=sort)
731752
return _re_wrap_factorize(original, uniques, codes)
732753

733-
if not isinstance(values.dtype, np.dtype):
734-
# i.e. ExtensionDtype
735-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
754+
elif not isinstance(values.dtype, np.dtype):
755+
if (
756+
na_sentinel == -1
757+
and "use_na_sentinel" in inspect.signature(values.factorize).parameters
758+
):
759+
# Avoid using catch_warnings when possible
760+
# GH#46910 - TimelikeOps has deprecated signature
761+
codes, uniques = values.factorize( # type: ignore[call-arg]
762+
use_na_sentinel=True
763+
)
764+
else:
765+
with warnings.catch_warnings():
766+
# We've already warned above
767+
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
768+
codes, uniques = values.factorize(na_sentinel=na_sentinel)
769+
736770
else:
737771
values = np.asarray(values) # convert DTA/TDA/MultiIndex
738772
codes, uniques = factorize_array(
@@ -757,6 +791,56 @@ def factorize(
757791
return _re_wrap_factorize(original, uniques, codes)
758792

759793

794+
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel. When ``na_sentinel`` is not given,
        this is ``-1`` if ``use_na_sentinel`` is unspecified or truthy and
        ``None`` otherwise. When ``na_sentinel`` is given, it is returned
        unchanged.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        Whenever ``na_sentinel`` is explicitly specified.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        # The two keywords express the same choice; accepting both would be
        # ambiguous, so refuse rather than silently prefer one.
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Only the new keyword (or neither) was given: no deprecation warning.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # The deprecated keyword was given: pick the message that tells the
        # caller which ``use_na_sentinel`` value replaces what they passed.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
842+
843+
760844
def _re_wrap_factorize(original, uniques, codes: np.ndarray):
761845
"""
762846
Wrap factorize results in Series or Index depending on original type.
@@ -950,7 +1034,7 @@ def mode(
9501034
try:
9511035
npresult = np.sort(npresult)
9521036
except TypeError as err:
953-
warn(f"Unable to sort modes: {err}")
1037+
warnings.warn(f"Unable to sort modes: {err}")
9541038

9551039
result = _reconstruct_data(npresult, original.dtype, original)
9561040
return result
@@ -1570,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
15701654
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
15711655
return op(arr, arr.shift(n))
15721656
else:
1573-
warn(
1657+
warnings.warn(
15741658
"dtype lost in 'diff()'. In the future this will raise a "
15751659
"TypeError. Convert to a suitable dtype prior to calling 'diff'.",
15761660
FutureWarning,

0 commit comments

Comments
 (0)