Skip to content

Commit 9e63a41

Browse files
authored
Merge pull request #247 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents f4a530f + 7963422 commit 9e63a41

27 files changed

+405
-173
lines changed

ci/code_checks.sh

+2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
121121
pandas/io/parsers/ \
122122
pandas/io/sas/ \
123123
pandas/io/sql.py \
124+
pandas/io/formats/format.py \
125+
pandas/io/formats/style.py \
124126
pandas/tseries/
125127
RET=$(($RET + $?)) ; echo $MSG "DONE"
126128

doc/source/_static/style/df_pipe.png

8.47 KB
Loading

doc/source/whatsnew/v1.3.2.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
2020
- Fixed regression in :meth:`DataFrame.shift` where a ``TypeError`` occurred when shifting a :class:`DataFrame` created by concatenation of slices and filling with values (:issue:`42719`)
2121
- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`)
22+
- Fixed regression in :meth:`DataFrame.drop` doing nothing if the :class:`MultiIndex` has duplicates and the indexer is a tuple or list of tuples (:issue:`42771`)
2223
- Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to None (:issue:`42387`)
2324
- Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`)
2425
-
@@ -29,7 +30,7 @@ Fixed regressions
2930

3031
Bug fixes
3132
~~~~~~~~~
32-
-
33+
- Fixed bug where 1D slices over extension types were turned into N-dimensional slices over ExtensionArrays (:issue:`42430`)
3334
-
3435

3536
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

+5
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
- :meth:`Series.ewm` and :meth:`DataFrame.ewm` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
3838
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
3939
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
40+
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
4041
-
4142

4243
.. ---------------------------------------------------------------------------
@@ -170,6 +171,8 @@ Performance improvements
170171
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
171172
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
172173
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
174+
- Performance improvement in :meth:`DataFrame.corr` for ``method="pearson"`` on data without missing values (:issue:`40956`)
175+
-
173176

174177
.. ---------------------------------------------------------------------------
175178
@@ -229,6 +232,8 @@ Indexing
229232
- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
230233
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
231234
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
235+
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` failing to raise, or incorrectly raising, ``ValueError`` depending on the value passed for ``ascending`` (:issue:`41634`)
236+
- Bug in updating values of a :class:`pandas.Series` using a boolean index when the :class:`pandas.Series` was created by :meth:`pandas.DataFrame.pop` (:issue:`42530`)
232237

233238
Missing
234239
^^^^^^^

pandas/_libs/algos.pyx

+49-13
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,12 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
326326
Py_ssize_t i, j, xi, yi, N, K
327327
bint minpv
328328
float64_t[:, ::1] result
329+
# Initialize to None since we only use in the no missing value case
330+
float64_t[::1] means=None, ssqds=None
329331
ndarray[uint8_t, ndim=2] mask
332+
bint no_nans
330333
int64_t nobs = 0
334+
float64_t mean, ssqd, val
331335
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
332336

333337
N, K = (<object>mat).shape
@@ -339,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
339343

340344
result = np.empty((K, K), dtype=np.float64)
341345
mask = np.isfinite(mat).view(np.uint8)
346+
no_nans = mask.all()
347+
348+
# Computing the online means and variances is expensive - so if possible we can
349+
# precompute these and avoid repeating the computations each time we handle
350+
# an (xi, yi) pair
351+
if no_nans:
352+
means = np.empty(K, dtype=np.float64)
353+
ssqds = np.empty(K, dtype=np.float64)
354+
355+
with nogil:
356+
for j in range(K):
357+
ssqd = mean = 0
358+
for i in range(N):
359+
val = mat[i, j]
360+
dx = val - mean
361+
mean += 1 / (i + 1) * dx
362+
ssqd += (val - mean) * dx
363+
364+
means[j] = mean
365+
ssqds[j] = ssqd
342366

343367
with nogil:
344368
for xi in range(K):
345369
for yi in range(xi + 1):
346-
# Welford's method for the variance-calculation
347-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
348-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
349-
for i in range(N):
350-
if mask[i, xi] and mask[i, yi]:
370+
covxy = 0
371+
if no_nans:
372+
for i in range(N):
351373
vx = mat[i, xi]
352374
vy = mat[i, yi]
353-
nobs += 1
354-
dx = vx - meanx
355-
dy = vy - meany
356-
meanx += 1 / nobs * dx
357-
meany += 1 / nobs * dy
358-
ssqdmx += (vx - meanx) * dx
359-
ssqdmy += (vy - meany) * dy
360-
covxy += (vx - meanx) * dy
375+
covxy += (vx - means[xi]) * (vy - means[yi])
376+
377+
ssqdmx = ssqds[xi]
378+
ssqdmy = ssqds[yi]
379+
nobs = N
380+
381+
else:
382+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
383+
for i in range(N):
384+
# Welford's method for the variance-calculation
385+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
386+
if mask[i, xi] and mask[i, yi]:
387+
vx = mat[i, xi]
388+
vy = mat[i, yi]
389+
nobs += 1
390+
dx = vx - meanx
391+
dy = vy - meany
392+
meanx += 1 / nobs * dx
393+
meany += 1 / nobs * dy
394+
ssqdmx += (vx - meanx) * dx
395+
ssqdmy += (vy - meany) * dy
396+
covxy += (vx - meanx) * dy
361397

362398
if nobs < minpv:
363399
result[xi, yi] = result[yi, xi] = NaN

pandas/_libs/groupby.pyx

+61-25
Original file line numberDiff line numberDiff line change
@@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13171317
const intp_t[:] labels,
13181318
int ngroups,
13191319
bint is_datetimelike,
1320+
bint skipna,
13201321
bint compute_max):
13211322
"""
13221323
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13361337
Number of groups, larger than all entries of `labels`.
13371338
is_datetimelike : bool
13381339
True if `values` contains datetime-like entries.
1340+
skipna : bool
1341+
If True, ignore nans in `values`.
13391342
compute_max : bool
13401343
True if cumulative maximum should be computed, False
13411344
if cumulative minimum should be computed
@@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13561359
accum[:] = -np.inf if compute_max else np.inf
13571360

13581361
if mask is not None:
1359-
masked_cummin_max(out, values, mask, labels, accum, compute_max)
1362+
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
13601363
else:
1361-
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
1364+
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)
13621365

13631366

13641367
@cython.boundscheck(False)
@@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out,
13671370
ndarray[groupby_t, ndim=2] values,
13681371
const intp_t[:] labels,
13691372
groupby_t[:, ::1] accum,
1373+
bint skipna,
13701374
bint is_datetimelike,
13711375
bint compute_max):
13721376
"""
@@ -1375,8 +1379,24 @@ cdef cummin_max(groupby_t[:, ::1] out,
13751379
"""
13761380
cdef:
13771381
Py_ssize_t i, j, N, K
1378-
groupby_t val, mval
1382+
groupby_t val, mval, na_val
1383+
uint8_t[:, ::1] seen_na
13791384
intp_t lab
1385+
bint na_possible
1386+
1387+
if groupby_t is float64_t or groupby_t is float32_t:
1388+
na_val = NaN
1389+
na_possible = True
1390+
elif is_datetimelike:
1391+
na_val = NPY_NAT
1392+
na_possible = True
1393+
# Will never be used, just to avoid uninitialized warning
1394+
else:
1395+
na_val = 0
1396+
na_possible = False
1397+
1398+
if na_possible:
1399+
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
13801400

13811401
N, K = (<object>values).shape
13821402
with nogil:
@@ -1385,18 +1405,22 @@ cdef cummin_max(groupby_t[:, ::1] out,
13851405
if lab < 0:
13861406
continue
13871407
for j in range(K):
1388-
val = values[i, j]
1389-
if not _treat_as_na(val, is_datetimelike):
1390-
mval = accum[lab, j]
1391-
if compute_max:
1392-
if val > mval:
1393-
accum[lab, j] = mval = val
1394-
else:
1395-
if val < mval:
1396-
accum[lab, j] = mval = val
1397-
out[i, j] = mval
1408+
if not skipna and na_possible and seen_na[lab, j]:
1409+
out[i, j] = na_val
13981410
else:
1399-
out[i, j] = val
1411+
val = values[i, j]
1412+
if not _treat_as_na(val, is_datetimelike):
1413+
mval = accum[lab, j]
1414+
if compute_max:
1415+
if val > mval:
1416+
accum[lab, j] = mval = val
1417+
else:
1418+
if val < mval:
1419+
accum[lab, j] = mval = val
1420+
out[i, j] = mval
1421+
else:
1422+
seen_na[lab, j] = 1
1423+
out[i, j] = val
14001424

14011425

14021426
@cython.boundscheck(False)
@@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
14061430
uint8_t[:, ::1] mask,
14071431
const intp_t[:] labels,
14081432
groupby_t[:, ::1] accum,
1433+
bint skipna,
14091434
bint compute_max):
14101435
"""
14111436
Compute the cumulative minimum/maximum of columns of `values`, in row groups
@@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
14141439
cdef:
14151440
Py_ssize_t i, j, N, K
14161441
groupby_t val, mval
1442+
uint8_t[:, ::1] seen_na
14171443
intp_t lab
14181444

14191445
N, K = (<object>values).shape
1446+
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
14201447
with nogil:
14211448
for i in range(N):
14221449
lab = labels[i]
14231450
if lab < 0:
14241451
continue
14251452
for j in range(K):
1426-
if not mask[i, j]:
1427-
val = values[i, j]
1428-
mval = accum[lab, j]
1429-
if compute_max:
1430-
if val > mval:
1431-
accum[lab, j] = mval = val
1453+
if not skipna and seen_na[lab, j]:
1454+
mask[i, j] = 1
1455+
else:
1456+
if not mask[i, j]:
1457+
val = values[i, j]
1458+
mval = accum[lab, j]
1459+
if compute_max:
1460+
if val > mval:
1461+
accum[lab, j] = mval = val
1462+
else:
1463+
if val < mval:
1464+
accum[lab, j] = mval = val
1465+
out[i, j] = mval
14321466
else:
1433-
if val < mval:
1434-
accum[lab, j] = mval = val
1435-
out[i, j] = mval
1467+
seen_na[lab, j] = 1
14361468

14371469

14381470
@cython.boundscheck(False)
@@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out,
14421474
const intp_t[:] labels,
14431475
int ngroups,
14441476
bint is_datetimelike,
1445-
uint8_t[:, ::1] mask=None) -> None:
1477+
uint8_t[:, ::1] mask=None,
1478+
bint skipna=True) -> None:
14461479
"""See group_cummin_max.__doc__"""
14471480
group_cummin_max(
14481481
out,
@@ -1451,6 +1484,7 @@ def group_cummin(groupby_t[:, ::1] out,
14511484
labels,
14521485
ngroups,
14531486
is_datetimelike,
1487+
skipna,
14541488
compute_max=False
14551489
)
14561490

@@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out,
14621496
const intp_t[:] labels,
14631497
int ngroups,
14641498
bint is_datetimelike,
1465-
uint8_t[:, ::1] mask=None) -> None:
1499+
uint8_t[:, ::1] mask=None,
1500+
bint skipna=True) -> None:
14661501
"""See group_cummin_max.__doc__"""
14671502
group_cummin_max(
14681503
out,
@@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out,
14711506
labels,
14721507
ngroups,
14731508
is_datetimelike,
1509+
skipna,
14741510
compute_max=True
14751511
)

pandas/core/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
12211221
"""
12221222

12231223
@doc(_shared_docs["searchsorted"], klass="Index")
1224-
def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
1224+
def searchsorted(self, value, side="left", sorter=None) -> npt.NDArray[np.intp]:
12251225
return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)
12261226

12271227
def drop_duplicates(self, keep="first"):
@@ -1232,5 +1232,5 @@ def drop_duplicates(self, keep="first"):
12321232
@final
12331233
def _duplicated(
12341234
self, keep: Literal["first", "last", False] = "first"
1235-
) -> np.ndarray:
1235+
) -> npt.NDArray[np.bool_]:
12361236
return duplicated(self._values, keep=keep)

pandas/core/construction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def extract_array(
402402
>>> extract_array([1, 2, 3])
403403
[1, 2, 3]
404404
405-
For an ndarray-backed Series / Index a PandasArray is returned.
405+
For an ndarray-backed Series / Index the ndarray is returned.
406406
407407
>>> extract_array(pd.Series([1, 2, 3]))
408408
array([1, 2, 3])

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
rewrite_axis_style_signature,
8686
)
8787
from pandas.util._validators import (
88+
validate_ascending,
8889
validate_axis_style_args,
8990
validate_bool_kwarg,
9091
validate_percentile,
@@ -6202,7 +6203,7 @@ def sort_values( # type: ignore[override]
62026203
):
62036204
inplace = validate_bool_kwarg(inplace, "inplace")
62046205
axis = self._get_axis_number(axis)
6205-
6206+
ascending = validate_ascending(ascending)
62066207
if not isinstance(by, list):
62076208
by = [by]
62086209
if is_sequence(ascending) and len(by) != len(ascending):

pandas/core/generic.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,10 @@
9999
ABCDataFrame,
100100
ABCSeries,
101101
)
102-
from pandas.core.dtypes.inference import is_hashable
102+
from pandas.core.dtypes.inference import (
103+
is_hashable,
104+
is_nested_list_like,
105+
)
103106
from pandas.core.dtypes.missing import (
104107
isna,
105108
notna,
@@ -4182,6 +4185,7 @@ def _drop_axis(
41824185

41834186
# Case for non-unique axis
41844187
else:
4188+
is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
41854189
labels = ensure_object(com.index_labels_to_array(labels))
41864190
if level is not None:
41874191
if not isinstance(axis, MultiIndex):
@@ -4191,9 +4195,14 @@ def _drop_axis(
41914195
# GH 18561 MultiIndex.drop should raise if label is absent
41924196
if errors == "raise" and indexer.all():
41934197
raise KeyError(f"{labels} not found in axis")
4194-
elif isinstance(axis, MultiIndex) and labels.dtype == "object":
4198+
elif (
4199+
isinstance(axis, MultiIndex)
4200+
and labels.dtype == "object"
4201+
and not is_tuple_labels
4202+
):
41954203
# Set level to zero in case of MultiIndex and label is string,
41964204
# because isin can't handle strings for MultiIndexes GH#36293
4205+
# In case of tuples we get dtype object but have to use isin GH#42771
41974206
indexer = ~axis.get_level_values(0).isin(labels)
41984207
else:
41994208
indexer = ~axis.isin(labels)

0 commit comments

Comments
 (0)