
Commit b767d02

Merge branch 'main' into issue-50977
2 parents: fd0e408 + a254bcf

Note: this is a large commit, so some diffs are hidden by default and not all changed files appear below.

42 files changed: +160 -256 lines

doc/source/whatsnew/v2.0.0.rst

+1

@@ -620,6 +620,7 @@ Other API changes
   new DataFrame (shallow copy) instead of the original DataFrame, consistent with other
   methods to get a full slice (for example ``df.loc[:]`` or ``df[:]``) (:issue:`49469`)
 - Disallow computing ``cumprod`` for :class:`Timedelta` object; previously this returned incorrect values (:issue:`50246`)
+- :class:`DataFrame` objects read from a :class:`HDFStore` file without an index now have a :class:`RangeIndex` instead of an ``int64`` index (:issue:`51076`)
 - Instantiating an :class:`Index` with an numeric numpy dtype with data containing :class:`NA` and/or :class:`NaT` now raises a ``ValueError``. Previously a ``TypeError`` was raised (:issue:`51050`)
 - Loading a JSON file with duplicate columns using ``read_json(orient='split')`` renames columns to avoid duplicates, as :func:`read_csv` and the other readers do (:issue:`50370`)
 - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`)
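The new entry for :issue:`51076` is easy to check with a small round trip. A hedged sketch, not part of the commit: the file name and keyword arguments below are illustrative, and it requires the optional PyTables dependency.

    # Expectation on pandas 2.0: a frame stored without an index comes back
    # with a RangeIndex rather than an int64 index (GH 51076).
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    with pd.HDFStore("example.h5") as store:
        store.put("df", df, format="table", index=False)
        roundtripped = store.get("df")

    print(type(roundtripped.index))  # expected: RangeIndex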

pandas/_libs/index.pyx

-2

@@ -238,8 +238,6 @@ cdef class IndexEngine:
         return self.unique == 1
 
     cdef _do_unique_check(self):
-
-        # this de-facto the same
         self._ensure_mapping_populated()
 
     @property

pandas/_libs/indexing.pyi

+1 -1

@@ -9,7 +9,7 @@ _IndexingMixinT = TypeVar("_IndexingMixinT", bound=IndexingMixin)
 
 class NDFrameIndexerBase(Generic[_IndexingMixinT]):
     name: str
-    # in practise obj is either a DataFrame or a Series
+    # in practice obj is either a DataFrame or a Series
     obj: _IndexingMixinT
 
     def __init__(self, name: str, obj: _IndexingMixinT) -> None: ...

pandas/_libs/internals.pyx

+3 -4

@@ -33,7 +33,6 @@ from pandas._libs.util cimport (
 @cython.final
 @cython.freelist(32)
 cdef class BlockPlacement:
-    # __slots__ = '_as_slice', '_as_array', '_len'
     cdef:
         slice _as_slice
         ndarray _as_array  # Note: this still allows `None`; will be intp_t
@@ -621,7 +620,7 @@ cdef class NumpyBlock(SharedBlock):
         public ndarray values
 
     def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         # set placement and ndim
         self.values = values
 
@@ -643,7 +642,7 @@ cdef class NDArrayBackedBlock(SharedBlock):
         NDArrayBacked values
 
     def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         # set placement and ndim
         self.values = values
 
@@ -662,7 +661,7 @@ cdef class Block(SharedBlock):
         public object values
 
     def __cinit__(self, object values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         # set placement and ndim
         self.values = values
 

pandas/_libs/lib.pyx

+1 -1

@@ -492,7 +492,7 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-# Can add const once https://github.com/cython/cython/issues/1772 resolved
+# TODO(cython3): Can add const once cython#1772 is resolved
 def has_infs(floating[:] arr) -> bool:
     cdef:
         Py_ssize_t i, n = len(arr)

pandas/_libs/parsers.pyx

+4 -4

@@ -46,6 +46,7 @@ from libc.string cimport (
 
 
 cdef extern from "Python.h":
+    # TODO(cython3): get this from cpython.unicode
     object PyUnicode_FromString(char *v)
 
 
@@ -453,14 +454,12 @@ cdef class TextReader:
 
         self.skipfooter = skipfooter
 
-        # suboptimal
         if usecols is not None:
             self.has_usecols = 1
             # GH-20558, validate usecols at higher level and only pass clean
             # usecols into TextReader.
             self.usecols = usecols
 
-        # TODO: XXX?
         if skipfooter > 0:
             self.parser.on_bad_lines = SKIP
 
@@ -501,7 +500,6 @@ cdef class TextReader:
         self.dtype = dtype
         self.use_nullable_dtypes = use_nullable_dtypes
 
-        # XXX
         self.noconvert = set()
 
         self.index_col = index_col
@@ -761,7 +759,7 @@ cdef class TextReader:
             # Corner case, not enough lines in the file
             if self.parser.lines < data_line + 1:
                 field_count = len(header[0])
-            else:  # not self.has_usecols:
+            else:
 
                 field_count = self.parser.line_fields[data_line]
 
@@ -1409,6 +1407,8 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
         The casted array.
     """
    if is_extension_array_dtype(arr.dtype):
+        # TODO: the docstring says arr is an ndarray, in which case this cannot
+        # be reached. Is that incorrect?
        return arr
 
    na_value = na_values[arr.dtype]

pandas/_libs/reduction.pyi

+2 -4

@@ -1,8 +1,6 @@
 from typing import Any
 
-import numpy as np
+from pandas._typing import DtypeObj
 
-from pandas._typing import ExtensionDtype
-
-def check_result_array(obj: object, dtype: np.dtype | ExtensionDtype) -> None: ...
+def check_result_array(obj: object, dtype: DtypeObj) -> None: ...
 def extract_result(res: object) -> Any: ...

pandas/_libs/sparse.pyx

-3

@@ -301,9 +301,6 @@ cdef class BlockIndex(SparseIndex):
         self.nblocks = np.int32(len(self.blocs))
         self.npoints = self.blengths.sum()
 
-        # self.block_start = blocs
-        # self.block_end = blocs + blengths
-
         self.check_integrity()
 
     def __reduce__(self):

pandas/_libs/sparse_op_helper.pxi.in

+5 -5

@@ -137,16 +137,16 @@ cdef tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
                                          {{dtype}}_t[:] y_,
                                          BlockIndex yindex,
                                          {{dtype}}_t yfill):
-    '''
+    """
     Binary operator on BlockIndex objects with fill values
-    '''
+    """
 
     cdef:
         BlockIndex out_index
-        Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
-        int32_t xbp = 0, ybp = 0 # block positions
+        Py_ssize_t xi = 0, yi = 0, out_i = 0  # fp buf indices
+        int32_t xbp = 0, ybp = 0  # block positions
         int32_t xloc, yloc
-        Py_ssize_t xblock = 0, yblock = 0 # block numbers
+        Py_ssize_t xblock = 0, yblock = 0  # block numbers
 
         {{dtype}}_t[:] x, y
         ndarray[{{rdtype}}_t, ndim=1] out

pandas/_libs/tslib.pyx

+7 -7

@@ -115,7 +115,7 @@ def format_array_from_datetime(
 
     Parameters
     ----------
-    values : a 1-d i8 array
+    values : ndarray[int64_t], arbitrary ndim
     tz : tzinfo or None, default None
     format : str or None, default None
         a strftime capable string
@@ -260,9 +260,9 @@ def array_with_unit_to_datetime(
     cdef:
         Py_ssize_t i, n=len(values)
         int64_t mult
-        bint is_ignore = errors=="ignore"
-        bint is_coerce = errors=="coerce"
-        bint is_raise = errors=="raise"
+        bint is_ignore = errors == "ignore"
+        bint is_coerce = errors == "coerce"
+        bint is_raise = errors == "raise"
         ndarray[int64_t] iresult
         tzinfo tz = None
         float fval
@@ -446,9 +446,9 @@ cpdef array_to_datetime(
         npy_datetimestruct dts
         bint utc_convert = bool(utc)
         bint seen_datetime_offset = False
-        bint is_raise = errors=="raise"
-        bint is_ignore = errors=="ignore"
-        bint is_coerce = errors=="coerce"
+        bint is_raise = errors == "raise"
+        bint is_ignore = errors == "ignore"
+        bint is_coerce = errors == "coerce"
         bint is_same_offsets
         _TSObject _ts
         float tz_offset

pandas/_libs/tslibs/conversion.pyx

+1 -3

@@ -53,7 +53,6 @@ from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
 from pandas._libs.tslibs.timezones cimport (
     get_utcoffset,
     is_utc,
-    maybe_get_tz,
 )
 from pandas._libs.tslibs.util cimport (
     is_datetime64_object,
@@ -124,7 +123,7 @@ cdef int64_t cast_from_unit(object ts, str unit) except? -1:
         dt64obj = np.datetime64(ts, unit)
         return get_datetime64_nanos(dt64obj, NPY_FR_ns)
 
-    # cast the unit, multiply base/frace separately
+    # cast the unit, multiply base/frac separately
     # to avoid precision issues from float -> int
     try:
         base = <int64_t>ts
@@ -380,7 +379,6 @@ cdef _TSObject convert_datetime_to_tsobject(
     obj.creso = reso
     obj.fold = ts.fold
     if tz is not None:
-        tz = maybe_get_tz(tz)
 
         if ts.tzinfo is not None:
             # Convert the current timezone to the passed timezone

pandas/_libs/tslibs/period.pyx

+1 -5

@@ -2612,11 +2612,7 @@ class Period(_Period):
 
             if freq is None and ordinal != NPY_NAT:
                 # Skip NaT, since it doesn't have a resolution
-                try:
-                    freq = attrname_to_abbrevs[reso]
-                except KeyError:
-                    raise ValueError(f"Invalid frequency or could not "
-                                     f"infer: {reso}")
+                freq = attrname_to_abbrevs[reso]
                 freq = to_offset(freq)
 
         elif PyDateTime_Check(value):

pandas/_libs/tslibs/timedeltas.pxd

-1

@@ -10,7 +10,6 @@ cpdef int64_t delta_to_nanoseconds(
 ) except? -1
 cdef convert_to_timedelta64(object ts, str unit)
 cdef bint is_any_td_scalar(object obj)
-cdef object ensure_td64ns(object ts)
 
 
 cdef class _Timedelta(timedelta):

pandas/_libs/tslibs/timedeltas.pyx

-4

@@ -691,10 +691,6 @@ cdef timedelta_from_spec(object number, object frac, object unit):
             "values and are not supported."
         )
 
-    if unit == "M":
-        # To parse ISO 8601 string, 'M' should be treated as minute,
-        # not month
-        unit = "m"
     unit = parse_timedelta_unit(unit)
 
     n = "".join(number) + "." + "".join(frac)

pandas/_libs/tslibs/tzconversion.pyx

+1 -1

@@ -545,7 +545,7 @@ cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso):
 
         pandas_datetime_to_datetimestruct(val, creso, &dts)
         # casting to pydatetime drops nanoseconds etc, which we will
-        # need to re-add later as 'extra''
+        # need to re-add later as 'extra'
         extra = (dts.ps // 1000) * (pps // 1_000_000_000)
 
         dt = datetime_new(dts.year, dts.month, dts.day, dts.hour,

pandas/_libs/tslibs/util.pxd

-9

@@ -2,15 +2,6 @@
 from cpython.object cimport PyTypeObject
 
 
-cdef extern from *:
-    """
-    PyObject* char_to_string(const char* data) {
-        return PyUnicode_FromString(data);
-    }
-    """
-    object char_to_string(const char* data)
-
-
 cdef extern from "Python.h":
     # Note: importing extern-style allows us to declare these as nogil
     # functions, whereas `from cpython cimport` does not.

pandas/_libs/writers.pyi

-1

@@ -17,5 +17,4 @@ def word_len(val: object) -> int: ...
 def string_array_replace_from_nan_rep(
     arr: np.ndarray,  # np.ndarray[object, ndim=1]
     nan_rep: object,
-    replace: object = ...,
 ) -> None: ...

pandas/_libs/writers.pyx

+2 -4

@@ -161,15 +161,13 @@ cpdef inline Py_ssize_t word_len(object val):
 def string_array_replace_from_nan_rep(
     ndarray[object, ndim=1] arr,
     object nan_rep,
-    object replace=np.nan
 ) -> None:
     """
-    Replace the values in the array with 'replacement' if
-    they are 'nan_rep'. Return the same array.
+    Replace the values in the array with np.nan if they are nan_rep.
     """
     cdef:
         Py_ssize_t length = len(arr), i = 0
 
     for i in range(length):
         if arr[i] == nan_rep:
-            arr[i] = replace
+            arr[i] = np.nan
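For readers skimming the Cython diff above, a rough pure-Python equivalent of what the simplified routine now does (an illustrative sketch mirroring the diff, not the compiled implementation):

    import numpy as np

    def string_array_replace_from_nan_rep(arr: np.ndarray, nan_rep: object) -> None:
        # Replace occurrences of `nan_rep` with np.nan in place; the former
        # `replace` parameter is gone, so np.nan is always the replacement.
        for i in range(len(arr)):
            if arr[i] == nan_rep:
                arr[i] = np.nan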

pandas/core/arrays/datetimes.py

+5 -11

@@ -2030,17 +2030,11 @@ def _sequence_to_dt64ns(
         )
     if tz and inferred_tz:
         # two timezones: convert to intended from base UTC repr
-        if data.dtype == "i8":
-            # GH#42505
-            # by convention, these are _already_ UTC, e.g
-            return data.view(DT64NS_DTYPE), tz, None
-
-        if timezones.is_utc(tz):
-            # Fastpath, avoid copy made in tzconversion
-            utc_vals = data.view("i8")
-        else:
-            utc_vals = tz_convert_from_utc(data.view("i8"), tz)
-        data = utc_vals.view(DT64NS_DTYPE)
+        assert data.dtype == "i8"
+        # GH#42505
+        # by convention, these are _already_ UTC, e.g
+        return data.view(DT64NS_DTYPE), tz, None
+
     elif inferred_tz:
         tz = inferred_tz
 
pandas/core/arrays/timedeltas.py

+1 -2

@@ -769,8 +769,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]:
                        dtype='timedelta64[ns]', freq=None)
 
        >>> idx.total_seconds()
-        NumericIndex([0.0, 86400.0, 172800.0, 259200.0, 345600.0],
-                     dtype='float64')
+        Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')
        """
        pps = periods_per_second(self._creso)
        return self._maybe_mask_results(self.asi8 / pps, fill_value=None)
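The doctest change above only affects the rendered output (NumericIndex is gone in 2.0). A quick check against the documented values, assuming a daily TimedeltaIndex like the one in the docstring:

    import pandas as pd

    idx = pd.to_timedelta(["0 days", "1 days", "2 days", "3 days", "4 days"])
    print(idx.total_seconds())
    # Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')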

pandas/core/dtypes/base.py

+1 -1

@@ -134,7 +134,7 @@ def __eq__(self, other: Any) -> bool:
 
     def __hash__(self) -> int:
         # for python>=3.10, different nan objects have different hashes
-        # we need to avoid that und thus use hash function with old behavior
+        # we need to avoid that and thus use hash function with old behavior
         return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
 
     def __ne__(self, other: Any) -> bool:

pandas/core/dtypes/dtypes.py

+2 -8

@@ -28,8 +28,8 @@
     tz_compare,
 )
 from pandas._libs.tslibs.dtypes import (
-    NpyDatetimeUnit,
     PeriodDtypeBase,
+    abbrev_to_npy_unit,
 )
 from pandas._typing import (
     Dtype,
@@ -722,13 +722,7 @@ def _creso(self) -> int:
         """
         The NPY_DATETIMEUNIT corresponding to this dtype's resolution.
         """
-        reso = {
-            "s": NpyDatetimeUnit.NPY_FR_s,
-            "ms": NpyDatetimeUnit.NPY_FR_ms,
-            "us": NpyDatetimeUnit.NPY_FR_us,
-            "ns": NpyDatetimeUnit.NPY_FR_ns,
-        }[self.unit]
-        return reso.value
+        return abbrev_to_npy_unit(self.unit)
 
     @property
     def unit(self) -> str_type:

pandas/core/groupby/generic.py

+1

@@ -503,6 +503,7 @@ def _cython_transform(
                 "transform", obj._values, how, axis, **kwargs
             )
         except NotImplementedError as err:
+            # e.g. test_groupby_raises_string
             raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err
 
         return obj._constructor(result, index=self.obj.index, name=obj.name)
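The added comment points to the test that exercises this error path. As a hedged illustration (the example data is invented here and the exact wording can vary), an unsupported Cython transform on an object-dtype column surfaces as a TypeError:

    import pandas as pd

    df = pd.DataFrame({"key": [1, 1, 2], "val": ["a", "b", "c"]})
    try:
        df.groupby("key")["val"].cumprod()
    except TypeError as err:
        print(err)  # e.g. "cumprod is not supported for object dtype"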
