
Commit 4c84247

Merge remote-tracking branch 'upstream/master' into truncate-decreasing
2 parents 85fcb0c + 0907d9e commit 4c84247


78 files changed: 1605 additions and 1108 deletions

asv_bench/benchmarks/arithmetic.py

Lines changed: 7 additions & 3 deletions
@@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
         self.ser.add(self.ser, fill_value=4)
 
 
-class MixedFrameWithSeriesAxis0:
+class MixedFrameWithSeriesAxis:
     params = [
         [
             "eq",
@@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
             "gt",
             "add",
             "sub",
-            "div",
+            "truediv",
             "floordiv",
             "mul",
             "pow",
@@ -87,15 +87,19 @@ class MixedFrameWithSeriesAxis0:
     param_names = ["opname"]
 
     def setup(self, opname):
-        arr = np.arange(10 ** 6).reshape(100, -1)
+        arr = np.arange(10 ** 6).reshape(1000, -1)
         df = DataFrame(arr)
         df["C"] = 1.0
         self.df = df
         self.ser = df[0]
+        self.row = df.iloc[0]
 
     def time_frame_op_with_series_axis0(self, opname):
         getattr(self.df, opname)(self.ser, axis=0)
 
+    def time_frame_op_with_series_axis1(self, opname):
+        getattr(operator, opname)(self.df, self.ser)
+
 
 class Ops:
 

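Note: the new time_frame_op_with_series_axis1 benchmark times the plain-operator path, in which a Series is aligned against the DataFrame's columns rather than its index. A minimal sketch of the two alignments with toy data (variable names here are illustrative):

    import operator

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(12).reshape(3, 4))
    col = df[0]        # a column: its index matches df's index
    row = df.iloc[0]   # a row: its index matches df's columns

    axis0 = df.add(col, axis=0)     # flex method broadcasting down the rows
    axis1 = operator.add(df, row)   # plain `+`: the Series aligns against the columns
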
asv_bench/benchmarks/stat_ops.py

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ class FrameOps:
     param_names = ["op", "dtype", "axis"]
 
     def setup(self, op, dtype, axis):
-        if op == "mad" and dtype == "Int64" and axis == 1:
-            # GH-33036
+        if op == "mad" and dtype == "Int64":
+            # GH-33036, GH#33600
             raise NotImplementedError
         values = np.random.randn(100000, 4)
         if dtype == "Int64":

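Note: asv skips any benchmark whose setup raises NotImplementedError, so widening the guard above skips the mad/Int64 combination for every axis instead of only axis=1. A minimal sketch of that skip pattern with a hypothetical benchmark class:

    import numpy as np


    class HypotheticalFrameOps:
        params = [["mean", "mad"], ["float64", "Int64"]]
        param_names = ["op", "dtype"]

        def setup(self, op, dtype):
            if op == "mad" and dtype == "Int64":
                # asv reports this parameter combination as skipped, not failed
                raise NotImplementedError
            self.values = np.random.randn(1000, 4)

        def time_op(self, op, dtype):
            self.values.mean()  # placeholder body; a real benchmark dispatches on ``op``
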
doc/source/user_guide/computation.rst

Lines changed: 10 additions & 6 deletions
@@ -318,8 +318,8 @@ We provide a number of common statistical functions:
     :meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
     :meth:`~Rolling.quantile`, Sample quantile (value at %)
     :meth:`~Rolling.apply`, Generic apply
-    :meth:`~Rolling.cov`, Unbiased covariance (binary)
-    :meth:`~Rolling.corr`, Correlation (binary)
+    :meth:`~Rolling.cov`, Sample covariance (binary)
+    :meth:`~Rolling.corr`, Sample correlation (binary)
 
 .. _computation.window_variance.caveats:
 
@@ -341,6 +341,8 @@ We provide a number of common statistical functions:
    sample variance under the circumstances would result in a biased estimator
    of the variable we are trying to determine.
 
+   The same caveats apply to using any supported statistical sample methods.
+
 .. _stats.rolling_apply:
 
 Rolling apply
@@ -870,12 +872,12 @@ Method summary
     :meth:`~Expanding.max`, Maximum
     :meth:`~Expanding.std`, Sample standard deviation
     :meth:`~Expanding.var`, Sample variance
-    :meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
-    :meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
+    :meth:`~Expanding.skew`, Sample skewness (3rd moment)
+    :meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
     :meth:`~Expanding.quantile`, Sample quantile (value at %)
     :meth:`~Expanding.apply`, Generic apply
-    :meth:`~Expanding.cov`, Unbiased covariance (binary)
-    :meth:`~Expanding.corr`, Correlation (binary)
+    :meth:`~Expanding.cov`, Sample covariance (binary)
+    :meth:`~Expanding.corr`, Sample correlation (binary)
 
 .. note::
 
@@ -884,6 +886,8 @@ Method summary
    windows. See :ref:`this section <computation.window_variance.caveats>` for more
    information.
 
+   The same caveats apply to using any supported statistical sample methods.
+
 .. currentmodule:: pandas
 
 Aside from not having a ``window`` parameter, these functions have the same

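Note: the relabelled entries are the pairwise ("binary") statistics. A brief usage sketch with toy series, showing where the rolling and expanding sample covariance/correlation methods sit in the API:

    import numpy as np
    import pandas as pd

    s1 = pd.Series(np.random.randn(100))
    s2 = pd.Series(np.random.randn(100))

    rolling_corr = s1.rolling(window=20).corr(s2)        # sample correlation over each window
    rolling_cov = s1.rolling(window=20).cov(s2)          # sample covariance over each window
    expanding_corr = s1.expanding(min_periods=5).corr(s2)
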
doc/source/whatsnew/v1.1.0.rst

Lines changed: 37 additions & 2 deletions
@@ -175,8 +175,8 @@ Other API changes
 - Added :meth:`DataFrame.value_counts` (:issue:`5377`)
 - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
 - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
-- Using a :func:`pandas.api.indexers.BaseIndexer` with ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
-- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
+- Using a :func:`pandas.api.indexers.BaseIndexer` with ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
+- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
 - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
 -
 
@@ -191,6 +191,7 @@ Backwards incompatible API changes
   Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
 - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
 - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
+- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`)
 
 ``MultiIndex.get_indexer`` interprets `method` argument differently
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -324,6 +325,36 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
     ...
     KeyError: Timestamp('1970-01-01 00:00:00')
 
+.. _whatsnew_110.api_breaking.indexing_int_multiindex_raises_key_errors:
+
+Failed Integer Lookups on MultiIndex Raise KeyError
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Indexing with integers with a :class:`MultiIndex` that has a integer-dtype
+first level incorrectly failed to raise ``KeyError`` when one or more of
+those integer keys is not present in the first level of the index (:issue:`33539`)
+
+.. ipython:: python
+
+    idx = pd.Index(range(4))
+    dti = pd.date_range("2000-01-03", periods=3)
+    mi = pd.MultiIndex.from_product([idx, dti])
+    ser = pd.Series(range(len(mi)), index=mi)
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [5]: ser[[5]]
+    Out[5]: Series([], dtype: int64)
+
+*New behavior*:
+
+.. code-block:: ipython
+
+    In [5]: ser[[5]]
+    ...
+    KeyError: '[5] not in index'
+
 :meth:`DataFrame.merge` preserves right frame's row order
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
@@ -419,6 +450,7 @@ Performance improvements
 - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`)
 - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`)
 - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
+- Performance improvement in arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=1`` (:issue:`33600`)
 - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
   avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
   existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
@@ -462,6 +494,7 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.to_period` not infering the frequency when called with no arguments (:issue:`33358`)
 - Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original freq is no longer valid (:issue:`30511`)
 - Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`)
+- Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`)
 
 Timedelta
 ^^^^^^^^^
@@ -499,6 +532,7 @@ Strings
 ^^^^^^^
 
 - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`).
+- Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`)
 - Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`)
 
 
@@ -661,6 +695,7 @@ Other
 - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`)
 - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
 - Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`)
+- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`)
 
 .. ---------------------------------------------------------------------------
 

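Note: two of the entries above lend themselves to a quick illustration; a sketch assuming the 1.1.0 behavior described in the notes (GH 31746 and GH 33172):

    import pandas as pd

    # StringDtype reductions no longer raise (GH 31746)
    s = pd.Series(["apple", "banana", "cherry"], dtype="string")
    s.min()   # -> "apple"
    s.max()   # -> "cherry"

    # IntegerArray now implements sum (GH 33172); missing values are skipped
    arr = pd.array([1, 2, None], dtype="Int64")
    arr.sum()  # -> 3
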
pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna):
 {{if dtype == 'object'}}
 def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
 {{else}}
-def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'):
+def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'):
 {{endif}}
     cdef:
         int ret = 0

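Note: declaring the memoryview parameter const lets the generated duplicated_* helpers accept read-only buffers. A small sketch of the kind of input this enables, using a NumPy array whose write flag has been cleared (the data here is illustrative):

    import numpy as np
    import pandas as pd

    arr = np.array([1, 2, 2, 3, 1])
    arr.setflags(write=False)            # simulate a read-only buffer

    mask = pd.Series(arr).duplicated()   # routes through the templated duplicated_* helpers
    print(mask.tolist())                 # [False, False, True, False, True]
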
pandas/_libs/index.pyx

Lines changed: 5 additions & 34 deletions
@@ -21,14 +21,13 @@ cnp.import_array()
 
 cimport pandas._libs.util as util
 
-from pandas._libs.tslibs import Period
+from pandas._libs.tslibs import Period, Timedelta
 from pandas._libs.tslibs.nattype cimport c_NaT as NaT
 from pandas._libs.tslibs.c_timestamp cimport _Timestamp
 
 from pandas._libs.hashtable cimport HashTable
 
 from pandas._libs import algos, hashtable as _hash
-from pandas._libs.tslibs import Timedelta, period as periodlib
 from pandas._libs.missing import checknull
 
 
@@ -441,6 +440,10 @@ cdef class DatetimeEngine(Int64Engine):
         except KeyError:
             raise KeyError(val)
 
+    def get_indexer_non_unique(self, targets):
+        # we may get datetime64[ns] or timedelta64[ns], cast these to int64
+        return super().get_indexer_non_unique(targets.view("i8"))
+
     def get_indexer(self, values):
         self._ensure_mapping_populated()
         if values.dtype != self._get_box_dtype():
@@ -501,38 +504,6 @@ cdef class PeriodEngine(Int64Engine):
     cdef _call_monotonic(self, values):
         return algos.is_monotonic(values, timelike=True)
 
-    def get_indexer(self, values):
-        cdef:
-            ndarray[int64_t, ndim=1] ordinals
-
-        super(PeriodEngine, self)._ensure_mapping_populated()
-
-        freq = super(PeriodEngine, self).vgetter().freq
-        ordinals = periodlib.extract_ordinals(values, freq)
-
-        return self.mapping.lookup(ordinals)
-
-    def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
-        freq = super(PeriodEngine, self).vgetter().freq
-        ordinal = periodlib.extract_ordinals(other, freq)
-
-        return algos.pad(self._get_index_values(),
-                         np.asarray(ordinal), limit=limit)
-
-    def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
-        freq = super(PeriodEngine, self).vgetter().freq
-        ordinal = periodlib.extract_ordinals(other, freq)
-
-        return algos.backfill(self._get_index_values(),
-                              np.asarray(ordinal), limit=limit)
-
-    def get_indexer_non_unique(self, targets):
-        freq = super(PeriodEngine, self).vgetter().freq
-        ordinal = periodlib.extract_ordinals(targets, freq)
-        ordinal_array = np.asarray(ordinal)
-
-        return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
-
 
 
 cdef class BaseMultiIndexCodesEngine:
     """

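Note: the added DatetimeEngine.get_indexer_non_unique leans on the fact that datetime64[ns] and timedelta64[ns] data can be reinterpreted as int64 nanosecond counts with a zero-copy view; a small NumPy sketch of that cast:

    import numpy as np

    stamps = np.array(["2000-01-01", "2000-01-02"], dtype="datetime64[ns]")
    as_i8 = stamps.view("i8")  # same buffer, reinterpreted as nanoseconds since the Unix epoch
    print(as_i8)               # [946684800000000000 946771200000000000]
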
pandas/_libs/lib.pyx

Lines changed: 20 additions & 15 deletions
@@ -1173,15 +1173,15 @@ cdef class Seen:
                 or self.nat_)
 
 
-cdef object _try_infer_map(object v):
+cdef object _try_infer_map(object dtype):
     """
     If its in our map, just return the dtype.
     """
     cdef:
        object val
        str attr
-    for attr in ['name', 'kind', 'base']:
-        val = getattr(v.dtype, attr)
+    for attr in ["name", "kind", "base"]:
+        val = getattr(dtype, attr)
         if val in _TYPE_MAP:
             return _TYPE_MAP[val]
     return None
@@ -1294,44 +1294,49 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
 
     if util.is_array(value):
         values = value
-    elif hasattr(value, 'dtype'):
+    elif hasattr(value, "inferred_type") and skipna is False:
+        # Index, use the cached attribute if possible, populate the cache otherwise
+        return value.inferred_type
+    elif hasattr(value, "dtype"):
         # this will handle ndarray-like
         # e.g. categoricals
-        try:
-            values = getattr(value, '_values', getattr(value, 'values', value))
-        except TypeError:
-            # This gets hit if we have an EA, since cython expects `values`
-            # to be an ndarray
-            value = _try_infer_map(value)
+        dtype = value.dtype
+        if not isinstance(dtype, np.dtype):
+            value = _try_infer_map(value.dtype)
             if value is not None:
                 return value
 
-            # its ndarray like but we can't handle
+            # its ndarray-like but we can't handle
             raise ValueError(f"cannot infer type for {type(value)}")
 
+        # Unwrap Series/Index
+        values = np.asarray(value)
+
     else:
         if not isinstance(value, list):
             value = list(value)
+
         from pandas.core.dtypes.cast import (
             construct_1d_object_array_from_listlike)
         values = construct_1d_object_array_from_listlike(value)
 
     # make contiguous
-    values = values.ravel()
+    # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup
+    values = values.ravel(order="K")
 
-    val = _try_infer_map(values)
+    val = _try_infer_map(values.dtype)
     if val is not None:
         return val
 
     if values.dtype != np.object_:
-        values = values.astype('O')
+        values = values.astype("O")
 
     if skipna:
         values = values[~isnaobj(values)]
 
     n = len(values)
     if n == 0:
-        return 'empty'
+        return "empty"
 
     # try to use a valid value
     for i in range(n):

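Note: the function being reworked here is exposed publicly as pandas.api.types.infer_dtype; the diff adds a fast path that returns the cached Index.inferred_type and makes _try_infer_map operate on a dtype rather than on the object holding it. A short usage sketch of the public entry point:

    import numpy as np
    import pandas as pd
    from pandas.api.types import infer_dtype

    infer_dtype(["a", "b", "c"])                     # 'string'
    infer_dtype(pd.Index([1, 2, 3]), skipna=False)   # 'integer', served from the cached Index.inferred_type
    infer_dtype(np.asfortranarray(np.ones((4, 4))))  # 'floating'; 2-D input is ravelled before inference
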
pandas/compat/numpy/function.py

Lines changed: 8 additions & 2 deletions
@@ -218,7 +218,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
 LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False)
 validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs")
 
-MINMAX_DEFAULTS = dict(out=None, keepdims=False)
+MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False)
 validate_min = CompatValidator(
     MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1
 )
@@ -251,10 +251,16 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
 STAT_FUNC_DEFAULTS["dtype"] = None
 STAT_FUNC_DEFAULTS["out"] = None
 
-PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
+SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
+SUM_DEFAULTS["axis"] = None
 SUM_DEFAULTS["keepdims"] = False
 SUM_DEFAULTS["initial"] = None
 
+PROD_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
+PROD_DEFAULTS["axis"] = None
+PROD_DEFAULTS["keepdims"] = False
+PROD_DEFAULTS["initial"] = None
+
 MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
 MEDIAN_DEFAULTS["overwrite_input"] = False
 MEDIAN_DEFAULTS["keepdims"] = False

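Note: these *_DEFAULTS mappings back the CompatValidator objects that let NumPy functions delegate to pandas reductions: NumPy forwards numpy-style keywords (axis, out, keepdims, ...), and the validator raises only if a caller supplies a value pandas cannot honor, so axis=None has to be present among the accepted defaults. A small sketch of the delegation from the public API side:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3])

    # NumPy dispatches to the bound Series methods, forwarding kwargs such as
    # axis=None, which the validators above accept as defaults.
    np.sum(s)  # -> 6
    np.min(s)  # -> 1
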
pandas/core/algorithms.py

Lines changed: 5 additions & 1 deletion
@@ -49,6 +49,7 @@
     ABCExtensionArray,
     ABCIndex,
     ABCIndexClass,
+    ABCMultiIndex,
     ABCSeries,
 )
 from pandas.core.dtypes.missing import isna, na_value_for_dtype
@@ -89,6 +90,10 @@ def _ensure_data(values, dtype=None):
     values : ndarray
     pandas_dtype : str or dtype
     """
+    if not isinstance(values, ABCMultiIndex):
+        # extract_array would raise
+        values = extract_array(values, extract_numpy=True)
+
     # we check some simple dtypes first
     if is_object_dtype(dtype):
         return ensure_object(np.asarray(values)), "object"
@@ -151,7 +156,6 @@ def _ensure_data(values, dtype=None):
     elif is_categorical_dtype(values) and (
         is_categorical_dtype(dtype) or dtype is None
     ):
-        values = getattr(values, "values", values)
         values = values.codes
         dtype = "category"

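Note: extract_array (an internal helper in pandas.core.construction) unwraps Series and Index down to the backing ndarray or ExtensionArray, which is why the categorical branch no longer needs its own getattr unwrapping; MultiIndex is skipped because, per the added comment, extract_array would raise there. A sketch of what the helper returns, noting that this is internal API and subject to change:

    import pandas as pd
    from pandas.core.construction import extract_array

    extract_array(pd.Series([1, 2, 3]), extract_numpy=True)  # numpy array([1, 2, 3])
    extract_array(pd.Index(["a", "b"]), extract_numpy=True)  # object-dtype numpy array
    extract_array(pd.Series([1, 2, None], dtype="Int64"))    # IntegerArray, left as an extension array
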