Skip to content

Commit a329387

Browse files
authored
Merge branch 'main' into pandas-devgh-10446
2 parents 30b29be + 6bcd303 commit a329387

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+405
-250
lines changed

.pre-commit-config.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ repos:
106106
hooks:
107107
- id: meson-fmt
108108
args: ['--inplace']
109+
- repo: https://github.com/shellcheck-py/shellcheck-py
110+
rev: v0.10.0.1
111+
hooks:
112+
- id: shellcheck
113+
args: ["--severity=warning"]
109114
- repo: local
110115
hooks:
111116
- id: pyright

ci/code_checks.sh

+9-9
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ else
2424
fi
2525

2626
[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
27-
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
27+
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; }
2828

29-
BASE_DIR="$(dirname $0)/.."
29+
BASE_DIR="$(dirname "$0")/.."
3030
RET=0
3131

3232
### CODE ###
3333
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
3434

35-
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG
35+
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG"
3636
python -W error -c "
3737
import sys
3838
import pandas
@@ -49,24 +49,24 @@ if mods:
4949
sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
5050
sys.exit(len(mods))
5151
"
52-
RET=$(($RET + $?)) ; echo $MSG "DONE"
52+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
5353

5454
fi
5555

5656
### DOCTESTS ###
5757
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
5858

59-
MSG='Python and Cython Doctests' ; echo $MSG
59+
MSG='Python and Cython Doctests' ; echo "$MSG"
6060
python -c 'import pandas as pd; pd.test(run_doctests=True)'
61-
RET=$(($RET + $?)) ; echo $MSG "DONE"
61+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
6262

6363
fi
6464

6565
### DOCSTRINGS ###
6666
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
6767

68-
MSG='Validate Docstrings' ; echo $MSG
69-
$BASE_DIR/scripts/validate_docstrings.py \
68+
MSG='Validate Docstrings' ; echo "$MSG"
69+
"$BASE_DIR"/scripts/validate_docstrings.py \
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
@@ -265,7 +265,7 @@ fi
265265
if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then
266266

267267
MSG='Notebooks' ; echo $MSG
268-
jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook
268+
jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook
269269
RET=$(($RET + $?)) ; echo $MSG "DONE"
270270

271271
fi

ci/run_tests.sh

+3-5
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set)
44
# https://github.com/pytest-dev/pytest/issues/920
55
# https://github.com/pytest-dev/pytest/issues/1075
6-
export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7-
8-
# May help reproduce flaky CI builds if set in subsequent runs
9-
echo PYTHONHASHSEED=$PYTHONHASHSEED
6+
PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7+
export PYTHONHASHSEED
108

119
COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"
1210

@@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then
1614
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
1715
fi
1816

19-
echo $PYTEST_CMD
17+
echo "$PYTEST_CMD"
2018
sh -c "$PYTEST_CMD"

ci/upload_wheels.sh

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/bin/bash
12
# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh
23

34
set_upload_vars() {
@@ -19,20 +20,20 @@ set_upload_vars() {
1920
fi
2021
}
2122
upload_wheels() {
22-
echo ${PWD}
23+
echo "${PWD}"
2324
if [[ ${ANACONDA_UPLOAD} == true ]]; then
24-
if [ -z ${TOKEN} ]; then
25+
if [ -z "${TOKEN}" ]; then
2526
echo no token set, not uploading
2627
else
2728
# sdists are located under dist folder when built through setup.py
2829
if compgen -G "./dist/*.gz"; then
2930
echo "Found sdist"
30-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz
31+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz
3132
echo "Uploaded sdist"
3233
fi
3334
if compgen -G "./wheelhouse/*.whl"; then
3435
echo "Found wheel"
35-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl
36+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl
3637
echo "Uploaded wheel"
3738
fi
3839
echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple"

doc/source/development/contributing_codebase.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ be located.
344344
- tests.scalar
345345
- tests.tseries.offsets
346346

347-
2. Does your test depend only on code in pd._libs?
347+
2. Does your test depend only on code in ``pd._libs``?
348348
This test likely belongs in one of:
349349

350350
- tests.libs

doc/source/development/developer.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ Column metadata
9999
* Boolean: ``'bool'``
100100
* Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
101101
* Floats: ``'float16', 'float32', 'float64'``
102-
* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'``
102+
* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'``
103103
* String: ``'unicode', 'bytes'``
104104
* Categorical: ``'categorical'``
105105
* Other Python objects: ``'object'``

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Attributes
2525
Series.array
2626
Series.values
2727
Series.dtype
28+
Series.info
2829
Series.shape
2930
Series.nbytes
3031
Series.ndim

doc/source/user_guide/window.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -356,11 +356,11 @@ See :ref:`enhancing performance with Numba <enhancingperf.numba>` for general us
356356

357357
Numba will be applied in potentially two routines:
358358

359-
#. If ``func`` is a standard Python function, the engine will `JIT <https://numba.pydata.org/numba-doc/latest/user/overview.html>`__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
359+
#. If ``func`` is a standard Python function, the engine will `JIT <https://numba.readthedocs.io/en/stable/user/overview.html>`__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
360360
#. The engine will JIT the for loop where the apply function is applied to each window.
361361

362362
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
363-
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
363+
`numba.jit decorator <https://numba.readthedocs.io/en/stable/user/jit.html>`__.
364364
These keyword arguments will be applied to *both* the passed function (if a standard Python function)
365365
and the apply for loop over each window.
366366

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Other enhancements
3232
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
3333
- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
3434
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
35+
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
3536
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
3637
- :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3738
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
@@ -70,6 +71,7 @@ Other enhancements
7071
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
7172
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
7273
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
74+
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
7375
- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
7476
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
7577
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)

pandas/_libs/groupby.pyx

+63-34
Original file line numberDiff line numberDiff line change
@@ -753,16 +753,20 @@ def group_sum(
753753

754754
if uses_mask:
755755
isna_entry = mask[i, j]
756-
isna_result = result_mask[lab, j]
757756
else:
758757
isna_entry = _treat_as_na(val, is_datetimelike)
759-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
760758

761-
if not skipna and isna_result:
762-
# If sum is already NA, don't add to it. This is important for
763-
# datetimelike because adding a value to NPY_NAT may not result
764-
# in a NPY_NAT
765-
continue
759+
if not skipna:
760+
if uses_mask:
761+
isna_result = result_mask[lab, j]
762+
else:
763+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
764+
765+
if isna_result:
766+
# If sum is already NA, don't add to it. This is important for
767+
# datetimelikebecause adding a value to NPY_NAT may not result
768+
# in a NPY_NAT
769+
continue
766770

767771
if not isna_entry:
768772
nobs[lab, j] += 1
@@ -845,14 +849,18 @@ def group_prod(
845849

846850
if uses_mask:
847851
isna_entry = mask[i, j]
848-
isna_result = result_mask[lab, j]
849852
else:
850853
isna_entry = _treat_as_na(val, False)
851-
isna_result = _treat_as_na(prodx[lab, j], False)
852854

853-
if not skipna and isna_result:
854-
# If prod is already NA, no need to update it
855-
continue
855+
if not skipna:
856+
if uses_mask:
857+
isna_result = result_mask[lab, j]
858+
else:
859+
isna_result = _treat_as_na(prodx[lab, j], False)
860+
861+
if isna_result:
862+
# If prod is already NA, no need to update it
863+
continue
856864

857865
if not isna_entry:
858866
nobs[lab, j] += 1
@@ -919,22 +927,30 @@ def group_var(
919927

920928
if uses_mask:
921929
isna_entry = mask[i, j]
922-
isna_result = result_mask[lab, j]
923930
elif is_datetimelike:
924931
# With group_var, we cannot just use _treat_as_na bc
925932
# datetimelike dtypes get cast to float64 instead of
926933
# to int64.
927934
isna_entry = val == NPY_NAT
928-
isna_result = out[lab, j] == NPY_NAT
929935
else:
930936
isna_entry = _treat_as_na(val, is_datetimelike)
931-
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
932937

933-
if not skipna and isna_result:
934-
# If aggregate is already NA, don't add to it. This is important for
935-
# datetimelike because adding a value to NPY_NAT may not result
936-
# in a NPY_NAT
937-
continue
938+
if not skipna:
939+
if uses_mask:
940+
isna_result = result_mask[lab, j]
941+
elif is_datetimelike:
942+
# With group_var, we cannot just use _treat_as_na bc
943+
# datetimelike dtypes get cast to float64 instead of
944+
# to int64.
945+
isna_result = out[lab, j] == NPY_NAT
946+
else:
947+
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
948+
949+
if isna_result:
950+
# If aggregate is already NA, don't add to it. This is
951+
# important for datetimelike because adding a value to NPY_NAT
952+
# may not result in a NPY_NAT
953+
continue
938954

939955
if not isna_entry:
940956
nobs[lab, j] += 1
@@ -1232,22 +1248,30 @@ def group_mean(
12321248

12331249
if uses_mask:
12341250
isna_entry = mask[i, j]
1235-
isna_result = result_mask[lab, j]
12361251
elif is_datetimelike:
12371252
# With group_mean, we cannot just use _treat_as_na bc
12381253
# datetimelike dtypes get cast to float64 instead of
12391254
# to int64.
12401255
isna_entry = val == NPY_NAT
1241-
isna_result = sumx[lab, j] == NPY_NAT
12421256
else:
12431257
isna_entry = _treat_as_na(val, is_datetimelike)
1244-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
12451258

1246-
if not skipna and isna_result:
1247-
# If sum is already NA, don't add to it. This is important for
1248-
# datetimelike because adding a value to NPY_NAT may not result
1249-
# in NPY_NAT
1250-
continue
1259+
if not skipna:
1260+
if uses_mask:
1261+
isna_result = result_mask[lab, j]
1262+
elif is_datetimelike:
1263+
# With group_mean, we cannot just use _treat_as_na bc
1264+
# datetimelike dtypes get cast to float64 instead of
1265+
# to int64.
1266+
isna_result = sumx[lab, j] == NPY_NAT
1267+
else:
1268+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
1269+
1270+
if isna_result:
1271+
# If sum is already NA, don't add to it. This is important for
1272+
# datetimelike because adding a value to NPY_NAT may not result
1273+
# in NPY_NAT
1274+
continue
12511275

12521276
if not isna_entry:
12531277
nobs[lab, j] += 1
@@ -1909,15 +1933,20 @@ cdef group_min_max(
19091933

19101934
if uses_mask:
19111935
isna_entry = mask[i, j]
1912-
isna_result = result_mask[lab, j]
19131936
else:
19141937
isna_entry = _treat_as_na(val, is_datetimelike)
1915-
isna_result = _treat_as_na(group_min_or_max[lab, j],
1916-
is_datetimelike)
19171938

1918-
if not skipna and isna_result:
1919-
# If current min/max is already NA, it will always be NA
1920-
continue
1939+
if not skipna:
1940+
if uses_mask:
1941+
isna_result = result_mask[lab, j]
1942+
else:
1943+
isna_result = _treat_as_na(
1944+
group_min_or_max[lab, j], is_datetimelike
1945+
)
1946+
1947+
if isna_result:
1948+
# If current min/max is already NA, it will always be NA
1949+
continue
19211950

19221951
if not isna_entry:
19231952
nobs[lab, j] += 1

pandas/_libs/lib.pyx

+5
Original file line numberDiff line numberDiff line change
@@ -1522,6 +1522,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15221522
"""
15231523
Return a string label of the type of a scalar or list-like of values.
15241524

1525+
This method inspects the elements of the provided input and determines
1526+
classification of its data type. It is particularly useful for
1527+
handling heterogeneous data inputs where explicit dtype conversion may not
1528+
be possible or necessary.
1529+
15251530
Parameters
15261531
----------
15271532
value : scalar, list, ndarray, or pandas type

pandas/_libs/tslibs/strptime.pyx

+7
Original file line numberDiff line numberDiff line change
@@ -924,6 +924,13 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday)
924924

925925
correction = date(iso_year, 1, 4).isoweekday() + 3
926926
ordinal = (iso_week * 7) + iso_weekday - correction
927+
928+
if iso_week == 53:
929+
now = date.fromordinal(date(iso_year, 1, 1).toordinal() + ordinal - iso_weekday)
930+
jan_4th = date(iso_year+1, 1, 4)
931+
if (jan_4th - now).days < 7:
932+
raise ValueError(f"Week 53 does not exist in ISO year {iso_year}.")
933+
927934
# ordinal may be negative or 0 now, which means the date is in the previous
928935
# calendar year
929936
if ordinal < 1:

pandas/core/_numba/kernels/min_max_.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def grouped_min_max(
9898
for i in range(N):
9999
lab = labels[i]
100100
val = values[i]
101-
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
101+
if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])):
102102
continue
103103

104104
if values.dtype.kind == "i" or not np.isnan(val):

pandas/core/arrays/sparse/accessor.py

+5
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
279279
"""
280280
DataFrame accessor for sparse data.
281281
282+
It allows users to interact with a `DataFrame` that contains sparse data types
283+
(`SparseDtype`). It provides methods and attributes to efficiently work with sparse
284+
storage, reducing memory usage while maintaining compatibility with standard pandas
285+
operations.
286+
282287
Parameters
283288
----------
284289
data : scipy.sparse.spmatrix

pandas/core/dtypes/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1836,7 +1836,7 @@ def pandas_dtype(dtype) -> DtypeObj:
18361836
# raise a consistent TypeError if failed
18371837
try:
18381838
with warnings.catch_warnings():
1839-
# TODO: warnings.catch_warnings can be removed when numpy>2.2.2
1839+
# TODO: warnings.catch_warnings can be removed when numpy>2.3.0
18401840
# is the minimum version
18411841
# GH#51523 - Series.astype(np.integer) doesn't show
18421842
# numpy deprecation warning of np.integer

0 commit comments

Comments
 (0)