
Commit 1c7e63c

Merge branch 'categorical-fillna' of https://github.com/MarcoGorelli/pandas into categorical-fillna
2 parents fee3444 + 90bfabc commit 1c7e63c

19 files changed (+111, -111 lines)

ci/deps/azure-37-locale.yaml (+1, -1)

@@ -17,7 +17,7 @@ dependencies:
   - openpyxl
   - pytables
   - python-dateutil
-  - python=3.7.3
+  - python=3.7.*
   - pytz
   - s3fs
   - scipy

ci/deps/azure-37-numpydev.yaml (+1, -1)

@@ -2,7 +2,7 @@ name: pandas-dev
 channels:
   - defaults
 dependencies:
-  - python=3.7.3
+  - python=3.7.*
   - pytz
   - Cython>=0.28.2
   # universal

ci/deps/travis-37.yaml (+1, -1)

@@ -4,7 +4,7 @@ channels:
   - conda-forge
   - c3i_test
 dependencies:
-  - python=3.7.3
+  - python=3.7.*
   - botocore>=1.11
   - cython>=0.28.2
   - numpy

doc/source/whatsnew/v0.25.1.rst (+3, -1)

@@ -21,6 +21,8 @@ Other enhancements
 Bug fixes
 ~~~~~~~~~
 
+- Bug in :meth:`Categorical.fillna` that would replace all values, not just those that are NaN (:issue:`26215`)
+
 
 Categorical
 ^^^^^^^^^^^
@@ -83,7 +85,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
-- Break reference cycle involving :class:`Index` to allow garbage collection of :class:`Index` objects without running the GC. (:issue:`27585`)
+- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
 - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
 -
 

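As a quick illustration of the whatsnew entry above (a user-level sketch, not part of the commit): after the fix, filling a categorical Series from another Series only touches the missing positions, and the fill values still have to be existing categories.

    import numpy as np
    import pandas as pd

    # Categorical data with one missing value at position 1; categories are {a, b}.
    s = pd.Series(["a", np.nan, "b"], dtype="category")

    # Fill from another Series (aligned on the index). Only the NaN slot should
    # change; positions 0 and 2 must keep their original values.
    filled = s.fillna(pd.Series(["b", "a", "b"]))
    print(filled.tolist())  # expected with the fix: ['a', 'a', 'b']

Before the fix, every position whose fill value was a valid category could be overwritten, which is the behaviour reported in :issue:`26215`.
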
doc/source/whatsnew/v1.0.0.rst (+6, -6)

@@ -21,27 +21,27 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_1000.enhancements.other:
-
 -
 -
 
+.. _whatsnew_1000.enhancements.other:
+
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
-.. _whatsnew_1000.api_breaking:
-
 -
 -
 
+.. _whatsnew_1000.api_breaking:
+
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. _whatsnew_1000.api.other:
-
 - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
 -
 
+.. _whatsnew_1000.api.other:
+
 Other API changes
 ^^^^^^^^^^^^^^^^^
 

environment.yml (+1, -1)

@@ -5,7 +5,7 @@ channels:
 dependencies:
   # required
   - numpy>=1.15
-  - python=3.7.3
+  - python=3
   - python-dateutil>=2.6.1
   - pytz
 

pandas/core/algorithms.py (+1, -11)

@@ -28,13 +28,11 @@
     is_complex_dtype,
     is_datetime64_any_dtype,
     is_datetime64_ns_dtype,
-    is_datetime64tz_dtype,
     is_datetimelike,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
-    is_interval_dtype,
     is_list_like,
     is_numeric_dtype,
     is_object_dtype,
@@ -183,8 +181,6 @@ def _reconstruct_data(values, dtype, original):
 
     if is_extension_array_dtype(dtype):
         values = dtype.construct_array_type()._from_sequence(values)
-    elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
-        values = Index(original)._shallow_copy(values, name=None)
     elif is_bool_dtype(dtype):
         values = values.astype(dtype)
 
@@ -1645,19 +1641,13 @@ def take_nd(
         May be the same type as the input, or cast to an ndarray.
     """
 
-    # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs
-    # dispatch to internal type takes
    if is_extension_array_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-    elif is_datetime64tz_dtype(arr):
-        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-    elif is_interval_dtype(arr):
-        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
 
     if is_sparse(arr):
         arr = arr.to_dense()
     elif isinstance(arr, (ABCIndexClass, ABCSeries)):
-        arr = arr.values
+        arr = arr._values
 
     arr = np.asarray(arr)
 

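The branches removed above relied on timezone-aware datetimes and intervals being special cases; both now register as extension-array dtypes, so the generic `is_extension_array_dtype` branch covers them. A quick check via the public API (illustrative only, not part of the commit):

    import pandas as pd
    from pandas.api.types import is_extension_array_dtype

    tz_ser = pd.Series(pd.date_range("2019-01-01", periods=3, tz="UTC"))
    iv_ser = pd.Series(pd.interval_range(0, 3))

    # Both dtypes report as extension-array dtypes, so the generic EA path in
    # take_nd / _reconstruct_data handles them without the removed elif branches.
    print(is_extension_array_dtype(tz_ser.dtype))  # True
    print(is_extension_array_dtype(iv_ser.dtype))  # True
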
pandas/core/arrays/categorical.py (+2, -4)

@@ -1824,7 +1824,6 @@ def fillna(self, value=None, method=None, limit=None):
 
         # pad / bfill
         if method is not None:
-
             values = self.to_dense().reshape(-1, len(self))
             values = interpolate_2d(values, method, 0, None, value).astype(
                 self.categories.dtype
@@ -1838,10 +1837,9 @@ def fillna(self, value=None, method=None, limit=None):
             if isinstance(value, ABCSeries):
                 if not value[~value.isin(self.categories)].isna().all():
                     raise ValueError("fill value must be in categories")
-
                 values_codes = _get_codes_for_values(value, self.categories)
-                indexer = np.where(values_codes != -1)
-                codes[indexer] = values_codes[values_codes != -1]
+                indexer = np.where(codes == -1)
+                codes[indexer] = values_codes[codes == -1]
 
         # If value is not a dict or Series it should be a scalar
         elif is_hashable(value):

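The heart of the fillna fix is which mask drives the assignment: missing entries in a Categorical are stored as code -1, so only those slots should receive the aligned fill codes. A standalone NumPy sketch of the before/after masking (variable names mirror the diff, but this is not pandas internals):

    import numpy as np

    codes = np.array([0, -1, 2, -1])         # -1 marks missing, like Categorical codes
    values_codes = np.array([1, 1, 1, 2])    # codes of the aligned fill values

    # Old (buggy) mask: built from the fill codes, so every position with a
    # valid fill value gets overwritten.
    buggy = codes.copy()
    buggy[values_codes != -1] = values_codes[values_codes != -1]

    # New mask: built from the existing codes, so only missing slots are filled.
    fixed = codes.copy()
    fixed[codes == -1] = values_codes[codes == -1]

    print(buggy.tolist())  # [1, 1, 1, 2]
    print(fixed.tolist())  # [0, 1, 2, 2]
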
pandas/core/frame.py (+5, -4)

@@ -775,7 +775,8 @@ def style(self):
         Iterates over the DataFrame columns, returning a tuple with
         the column name and the content as a Series.
 
-        %s
+        Yields
+        ------
         label : object
             The column names for the DataFrame being iterated over.
         content : Series
@@ -816,7 +817,7 @@ def style(self):
         Name: population, dtype: int64
         """
 
-    @Appender(_shared_docs["items"] % "Yields\n        ------")
+    @Appender(_shared_docs["items"])
     def items(self):
         if self.columns.is_unique and hasattr(self, "_item_cache"):
             for k in self.columns:
@@ -825,9 +826,9 @@ def items(self):
             for i, k in enumerate(self.columns):
                 yield k, self._ixs(i, axis=1)
 
-    @Appender(_shared_docs["items"] % "Returns\n        -------")
+    @Appender(_shared_docs["items"])
     def iteritems(self):
-        return self.items()
+        yield from self.items()
 
     def iterrows(self):
         """

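The `iteritems` change swaps `return self.items()` for `yield from self.items()`. Both produce the same (label, Series) pairs; the difference is that `yield from` makes the wrapper a generator function in its own right, so its body is not executed until iteration begins. A generic Python sketch of the distinction (not pandas code):

    import inspect

    def source():
        yield 1
        yield 2

    def returns_generator():
        # Ordinary function: runs immediately and hands back source()'s generator.
        return source()

    def delegates():
        # Generator function: calling it is lazy; items are forwarded one by one.
        yield from source()

    print(list(returns_generator()))  # [1, 2]
    print(list(delegates()))          # [1, 2]
    print(inspect.isgeneratorfunction(returns_generator))  # False
    print(inspect.isgeneratorfunction(delegates))          # True
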
pandas/core/indexes/base.py (+1, -12)

@@ -665,7 +665,7 @@ def _cleanup(self):
     def _engine(self):
         # property, for now, slow to look up
 
-        # to avoid a refernce cycle, bind `_ndarray_values` to a local variable, so
+        # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so
         # `self` is not passed into the lambda.
         _ndarray_values = self._ndarray_values
         return self._engine_type(lambda: _ndarray_values, len(self))
@@ -5341,16 +5341,6 @@ def _maybe_update_attributes(self, attrs):
         """
         return attrs
 
-    def _validate_for_numeric_unaryop(self, op, opstr):
-        """
-        Validate if we can perform a numeric unary operation.
-        """
-        if not self._is_numeric_dtype:
-            raise TypeError(
-                "cannot evaluate a numeric op "
-                "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__)
-            )
-
     @classmethod
     def _add_numeric_methods_binary(cls):
         """
@@ -5383,7 +5373,6 @@ def _add_numeric_methods_unary(cls):
         def _make_evaluate_unary(op, opstr):
             def _evaluate_numeric_unary(self):
 
-                self._validate_for_numeric_unaryop(op, opstr)
                 attrs = self._get_attributes_dict()
                 attrs = self._maybe_update_attributes(attrs)
                 return Index(op(self.values), **attrs)

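Both `_engine` rewrites in this commit follow the same pattern: a lambda that closes over `self` and is then cached on the instance forms a reference cycle, so the index can only be freed by the cyclic garbage collector; binding the needed attribute to a local first means the closure captures only that object. A minimal sketch of the effect, independent of pandas (class names are illustrative):

    import gc
    import weakref

    class Cyclic:
        def build_engine(self):
            # Closure captures `self`; storing it on the instance creates
            # self -> _engine (lambda) -> self.
            self._engine = lambda: self.data

    class Acyclic:
        def build_engine(self):
            # Bind to a local first, so the closure captures only `data`.
            data = self.data
            self._engine = lambda: data

    def freed_without_gc(cls):
        obj = cls()
        obj.data = [1, 2, 3]
        obj.build_engine()
        ref = weakref.ref(obj)
        del obj  # rely on reference counting only
        return ref() is None

    gc.disable()
    try:
        print(freed_without_gc(Cyclic))   # False: the cycle keeps it alive
        print(freed_without_gc(Acyclic))  # True: freed immediately
    finally:
        gc.enable()
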
pandas/core/indexes/category.py (+5, -3)

@@ -446,9 +446,11 @@ def argsort(self, *args, **kwargs):
 
     @cache_readonly
     def _engine(self):
-
-        # we are going to look things up with the codes themselves
-        return self._engine_type(lambda: self.codes, len(self))
+        # we are going to look things up with the codes themselves.
+        # To avoid a reference cycle, bind `codes` to a local variable, so
+        # `self` is not passed into the lambda.
+        codes = self.codes
+        return self._engine_type(lambda: codes, len(self))
 
     # introspection
     @cache_readonly

pandas/core/indexes/period.py (+4, -1)

@@ -1,5 +1,6 @@
 from datetime import datetime, timedelta
 import warnings
+import weakref
 
 import numpy as np
 
@@ -441,7 +442,9 @@ def _formatter_func(self):
 
     @cache_readonly
     def _engine(self):
-        return self._engine_type(lambda: self, len(self))
+        # To avoid a reference cycle, pass a weakref of self to _engine_type.
+        period = weakref.ref(self)
+        return self._engine_type(period, len(self))
 
     @Appender(_index_shared_docs["contains"])
     def __contains__(self, key):

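PeriodIndex takes a slightly different route: its engine needs the index object itself rather than a bare ndarray, so the commit hands it `weakref.ref(self)`, which the engine can dereference on demand without keeping the index alive. A small stand-in sketch of the pattern (FakeEngine and FakeIndex are made up for illustration, not the pandas engine API):

    import weakref

    class FakeEngine:
        """Stand-in for an _engine_type: stores a weakref, dereferences when used."""

        def __init__(self, index_ref, length):
            self._index_ref = index_ref  # weakref.ref, not a strong reference
            self._length = length

        def lookup(self, value):
            index = self._index_ref()
            if index is None:
                raise RuntimeError("index has been garbage collected")
            return index.data.index(value)

    class FakeIndex:
        def __init__(self, data):
            self.data = list(data)
            # engine -> index is a weak edge, so no reference cycle is formed
            self._engine = FakeEngine(weakref.ref(self), len(self.data))

    idx = FakeIndex(["2019Q1", "2019Q2", "2019Q3"])
    print(idx._engine.lookup("2019Q2"))  # 1
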
pandas/core/ops/__init__.py (+50, -53)

@@ -47,7 +47,7 @@
 
 import pandas as pd
 from pandas._typing import ArrayLike
-from pandas.core.construction import extract_array
+from pandas.core.construction import array, extract_array
 from pandas.core.ops import missing
 from pandas.core.ops.docstrings import (
     _arith_doc_FRAME,
@@ -460,6 +460,33 @@ def masked_arith_op(x, y, op):
 # Dispatch logic
 
 
+def should_extension_dispatch(left: ABCSeries, right: Any) -> bool:
+    """
+    Identify cases where Series operation should use dispatch_to_extension_op.
+
+    Parameters
+    ----------
+    left : Series
+    right : object
+
+    Returns
+    -------
+    bool
+    """
+    if (
+        is_extension_array_dtype(left.dtype)
+        or is_datetime64_dtype(left.dtype)
+        or is_timedelta64_dtype(left.dtype)
+    ):
+        return True
+
+    if is_extension_array_dtype(right) and not is_scalar(right):
+        # GH#22378 disallow scalar to exclude e.g. "category", "Int64"
+        return True
+
+    return False
+
+
 def should_series_dispatch(left, right, op):
     """
     Identify cases where a DataFrame operation should dispatch to its
@@ -564,19 +591,18 @@ def dispatch_to_extension_op(op, left, right):
     apply the operator defined by op.
     """
 
+    if left.dtype.kind in "mM":
+        # We need to cast datetime64 and timedelta64 ndarrays to
+        # DatetimeArray/TimedeltaArray. But we avoid wrapping others in
+        # PandasArray as that behaves poorly with e.g. IntegerArray.
+        left = array(left)
+
     # The op calls will raise TypeError if the op is not defined
     # on the ExtensionArray
 
     # unbox Series and Index to arrays
-    if isinstance(left, (ABCSeries, ABCIndexClass)):
-        new_left = left._values
-    else:
-        new_left = left
-
-    if isinstance(right, (ABCSeries, ABCIndexClass)):
-        new_right = right._values
-    else:
-        new_right = right
+    new_left = extract_array(left, extract_numpy=True)
+    new_right = extract_array(right, extract_numpy=True)
 
     try:
         res_values = op(new_left, new_right)
@@ -684,56 +710,27 @@ def wrapper(left, right):
         res_name = get_op_result_name(left, right)
         right = maybe_upcast_for_op(right, left.shape)
 
-        if is_categorical_dtype(left):
-            raise TypeError(
-                "{typ} cannot perform the operation "
-                "{op}".format(typ=type(left).__name__, op=str_rep)
-            )
-
-        elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left):
-            from pandas.core.arrays import DatetimeArray
-
-            result = dispatch_to_extension_op(op, DatetimeArray(left), right)
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif is_extension_array_dtype(left) or (
-            is_extension_array_dtype(right) and not is_scalar(right)
-        ):
-            # GH#22378 disallow scalar to exclude e.g. "category", "Int64"
+        if should_extension_dispatch(left, right):
             result = dispatch_to_extension_op(op, left, right)
-            return construct_result(left, result, index=left.index, name=res_name)
 
-        elif is_timedelta64_dtype(left):
-            from pandas.core.arrays import TimedeltaArray
-
-            result = dispatch_to_extension_op(op, TimedeltaArray(left), right)
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif is_timedelta64_dtype(right):
-            # We should only get here with non-scalar values for right
-            # upcast by maybe_upcast_for_op
+        elif is_timedelta64_dtype(right) or isinstance(
+            right, (ABCDatetimeArray, ABCDatetimeIndex)
+        ):
+            # We should only get here with td64 right with non-scalar values
+            # for right upcast by maybe_upcast_for_op
             assert not isinstance(right, (np.timedelta64, np.ndarray))
-
             result = op(left._values, right)
 
-            # We do not pass dtype to ensure that the Series constructor
-            # does inference in the case where `result` has object-dtype.
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)):
-            result = op(left._values, right)
-            return construct_result(left, result, index=left.index, name=res_name)
+        else:
+            lvalues = extract_array(left, extract_numpy=True)
+            rvalues = extract_array(right, extract_numpy=True)
 
-        lvalues = left.values
-        rvalues = right
-        if isinstance(rvalues, (ABCSeries, ABCIndexClass)):
-            rvalues = rvalues._values
+            with np.errstate(all="ignore"):
+                result = na_op(lvalues, rvalues)
 
-        with np.errstate(all="ignore"):
-            result = na_op(lvalues, rvalues)
-        return construct_result(
-            left, result, index=left.index, name=res_name, dtype=None
-        )
+        # We do not pass dtype to ensure that the Series constructor
+        # does inference in the case where `result` has object-dtype.
+        return construct_result(left, result, index=left.index, name=res_name)
 
     wrapper.__name__ = op_name
     return wrapper

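The refactor concentrates the arithmetic dispatch decision in `should_extension_dispatch`: extension, datetime64 and timedelta64 Series go down the extension-array path, a non-scalar extension-dtype right operand also forces it, and everything else falls back to the plain `na_op` on extracted NumPy values. The dtype predicates it uses have public counterparts, so the rule can be mimicked outside pandas internals; `should_dispatch` below is a hypothetical stand-in written against `pandas.api.types`, not the private function from the diff:

    import pandas as pd
    from pandas.api.types import (
        is_datetime64_dtype,
        is_extension_array_dtype,
        is_scalar,
        is_timedelta64_dtype,
    )

    def should_dispatch(left: pd.Series, right) -> bool:
        """Dispatch when `left` is extension/datetime64/timedelta64-dtyped,
        or when `right` is a non-scalar extension-dtype object."""
        if (
            is_extension_array_dtype(left.dtype)
            or is_datetime64_dtype(left.dtype)
            or is_timedelta64_dtype(left.dtype)
        ):
            return True
        if is_extension_array_dtype(right) and not is_scalar(right):
            return True
        return False

    print(should_dispatch(pd.Series(pd.date_range("2019", periods=2)), 1))      # True
    print(should_dispatch(pd.Series([1, 2]), pd.array([1, 2], dtype="Int64")))  # True
    print(should_dispatch(pd.Series([1.0, 2.0]), 3))                            # False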