Skip to content

Commit c91a875

Browse files
Merge remote-tracking branch 'upstream/master' into GH28501
2 parents 8275c06 + a022d7b commit c91a875

22 files changed

+413
-280
lines changed

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import string
2+
import sys
13
import warnings
24

35
import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
6769
pd.Categorical(self.series)
6870

6971

72+
class AsType:
73+
def setup(self):
74+
N = 10 ** 5
75+
76+
random_pick = np.random.default_rng().choice
77+
78+
categories = {
79+
"str": list(string.ascii_letters),
80+
"int": np.random.randint(2 ** 16, size=154),
81+
"float": sys.maxsize * np.random.random((38,)),
82+
"timestamp": [
83+
pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
84+
],
85+
}
86+
87+
self.df = pd.DataFrame(
88+
{col: random_pick(cats, N) for col, cats in categories.items()}
89+
)
90+
91+
for col in ("int", "float", "timestamp"):
92+
self.df[col + "_as_str"] = self.df[col].astype(str)
93+
94+
for col in self.df.columns:
95+
self.df[col] = self.df[col].astype("category")
96+
97+
def astype_str(self):
98+
[self.df[col].astype("str") for col in "int float timestamp".split()]
99+
100+
def astype_int(self):
101+
[self.df[col].astype("int") for col in "int_as_str timestamp".split()]
102+
103+
def astype_float(self):
104+
[
105+
self.df[col].astype("float")
106+
for col in "float_as_str int int_as_str timestamp".split()
107+
]
108+
109+
def astype_datetime(self):
110+
self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
111+
112+
70113
class Concat:
71114
def setup(self):
72115
N = 10 ** 5

doc/source/user_guide/indexing.rst

+34
Original file line numberDiff line numberDiff line change
@@ -1158,6 +1158,40 @@ Mask
11581158
s.mask(s >= 0)
11591159
df.mask(df >= 0)
11601160
1161+
.. _indexing.np_where:
1162+
1163+
Setting with enlargement conditionally using :mod:`numpy`
1164+
----------------------------------------------------------
1165+
1166+
An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
1167+
Combined with setting a new column, you can use it to enlarge a dataframe where the
1168+
values are determined conditionally.
1169+
1170+
Suppose you have two choices to choose from in the following dataframe, and you want to
1171+
set a new column ``color`` to 'green' when the second column has 'Z' and to 'red' otherwise. You can do the
1172+
following:
1173+
1174+
.. ipython:: python
1175+
1176+
df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
1177+
df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
1178+
df
1179+
1180+
If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Suppose
1181+
there are three conditions, each with a corresponding choice of color, and a fourth color
1182+
as a fallback. You can do the following:
1183+
1184+
.. ipython:: python
1185+
1186+
conditions = [
1187+
(df['col2'] == 'Z') & (df['col1'] == 'A'),
1188+
(df['col2'] == 'Z') & (df['col1'] == 'B'),
1189+
(df['col1'] == 'B')
1190+
]
1191+
choices = ['yellow', 'blue', 'purple']
1192+
df['color'] = np.select(conditions, choices, default='black')
1193+
df
1194+
11611195
.. _indexing.query:
11621196

11631197
The :meth:`~pandas.DataFrame.query` Method

doc/source/whatsnew/v1.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ Performance improvements
499499
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
500500
- Faster ``dir`` calls when there are many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
501501
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
502+
- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
502503
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
503504
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
504505

@@ -624,6 +625,7 @@ MultiIndex
624625
- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
625626
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
626627
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
628+
- Bug in :meth:`MultiIndex.drop` where ``NaN`` values were dropped when a non-existent key was given as input (:issue:`18853`)
627629

628630
I/O
629631
^^^

pandas/core/arrays/categorical.py

+31-9
Original file line numberDiff line numberDiff line change
@@ -403,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
403403
If copy is set to False and dtype is categorical, the original
404404
object is returned.
405405
"""
406-
if is_categorical_dtype(dtype):
406+
if self.dtype is dtype:
407+
result = self.copy() if copy else self
408+
409+
elif is_categorical_dtype(dtype):
407410
dtype = cast(Union[str, CategoricalDtype], dtype)
408411

409412
# GH 10696/18593/18630
410413
dtype = self.dtype.update_dtype(dtype)
411-
result = self.copy() if copy else self
412-
if dtype == self.dtype:
413-
return result
414-
return result._set_dtype(dtype)
415-
if is_extension_array_dtype(dtype):
416-
return array(self, dtype=dtype, copy=copy)
417-
if is_integer_dtype(dtype) and self.isna().any():
414+
self = self.copy() if copy else self
415+
result = self._set_dtype(dtype)
416+
417+
# TODO: consolidate with ndarray case?
418+
elif is_extension_array_dtype(dtype):
419+
result = array(self, dtype=dtype, copy=copy)
420+
421+
elif is_integer_dtype(dtype) and self.isna().any():
418422
raise ValueError("Cannot convert float NaN to integer")
419-
return np.array(self, dtype=dtype, copy=copy)
423+
424+
elif len(self.codes) == 0 or len(self.categories) == 0:
425+
result = np.array(self, dtype=dtype, copy=copy)
426+
427+
else:
428+
# GH8628 (PERF): astype category codes instead of astyping array
429+
try:
430+
astyped_cats = self.categories.astype(dtype=dtype, copy=copy)
431+
except (
432+
TypeError, # downstream error msg for CategoricalIndex is misleading
433+
ValueError,
434+
):
435+
msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
436+
raise ValueError(msg)
437+
438+
astyped_cats = extract_array(astyped_cats, extract_numpy=True)
439+
result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes))
440+
441+
return result
420442

421443
@cache_readonly
422444
def itemsize(self) -> int:

pandas/core/indexes/multi.py

+4
Original file line numberDiff line numberDiff line change
@@ -2156,6 +2156,10 @@ def _drop_from_level(self, codes, level, errors="raise"):
21562156
i = self._get_level_number(level)
21572157
index = self.levels[i]
21582158
values = index.get_indexer(codes)
2159+
# If nan should be dropped it will equal -1 here. We have to check which values
2160+
# are not nan and equal -1, this means they are missing in the index
2161+
nan_codes = isna(codes)
2162+
values[(np.equal(nan_codes, False)) & (values == -1)] = -2
21592163

21602164
mask = ~algos.isin(self.codes[i], values)
21612165
if mask.all() and errors != "ignore":

pandas/core/indexes/period.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pandas._libs import index as libindex
77
from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick
88
from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
9-
from pandas._typing import DtypeObj, Label
9+
from pandas._typing import DtypeObj
1010
from pandas.errors import InvalidIndexError
1111
from pandas.util._decorators import Appender, cache_readonly, doc
1212

pandas/tests/arithmetic/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def zero(request):
8181
8282
Examples
8383
--------
84-
>>> arr = pd.RangeIndex(5)
84+
>>> arr = RangeIndex(5)
8585
>>> arr / zeros
8686
Float64Index([nan, inf, inf, inf, inf], dtype='float64')
8787
"""

pandas/tests/arithmetic/test_timedelta64.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ def test_addition_ops(self):
465465
tdi + pd.Int64Index([1, 2, 3])
466466

467467
# this is a union!
468-
# pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi)
468+
# pytest.raises(TypeError, lambda : pd.Int64Index([1,2,3]) + tdi)
469469

470470
result = tdi + dti # name will be reset
471471
expected = DatetimeIndex(["20130102", pd.NaT, "20130105"])

pandas/tests/arrays/categorical/test_constructors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,7 @@ def test_interval(self):
677677
tm.assert_index_equal(cat.categories, idx)
678678

679679
# overlapping
680-
idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)])
680+
idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
681681
cat = Categorical(idx, categories=idx)
682682
expected_codes = np.array([0, 1], dtype="int8")
683683
tm.assert_numpy_array_equal(cat.codes, expected_codes)

pandas/tests/arrays/categorical/test_dtypes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def test_astype(self, ordered):
127127
expected = np.array(cat)
128128
tm.assert_numpy_array_equal(result, expected)
129129

130-
msg = "could not convert string to float"
130+
msg = r"Cannot cast object dtype to <class 'float'>"
131131
with pytest.raises(ValueError, match=msg):
132132
cat.astype(float)
133133

@@ -138,7 +138,7 @@ def test_astype(self, ordered):
138138
tm.assert_numpy_array_equal(result, expected)
139139

140140
result = cat.astype(int)
141-
expected = np.array(cat, dtype=int)
141+
expected = np.array(cat, dtype="int64")
142142
tm.assert_numpy_array_equal(result, expected)
143143

144144
result = cat.astype(float)

pandas/tests/frame/test_constructors.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -720,8 +720,8 @@ def test_constructor_period_dict(self):
720720
@pytest.mark.parametrize(
721721
"data,dtype",
722722
[
723-
(pd.Period("2012-01", freq="M"), "period[M]"),
724-
(pd.Period("2012-02-01", freq="D"), "period[D]"),
723+
(Period("2012-01", freq="M"), "period[M]"),
724+
(Period("2012-02-01", freq="D"), "period[D]"),
725725
(Interval(left=0, right=5), IntervalDtype("int64")),
726726
(Interval(left=0.1, right=0.5), IntervalDtype("float64")),
727727
],
@@ -2577,7 +2577,7 @@ def test_from_records_series_list_dict(self):
25772577
def test_from_records_series_categorical_index(self):
25782578
# GH 32805
25792579
index = CategoricalIndex(
2580-
[pd.Interval(-20, -10), pd.Interval(-10, 0), pd.Interval(0, 10)]
2580+
[Interval(-20, -10), Interval(-10, 0), Interval(0, 10)]
25812581
)
25822582
series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index)
25832583
frame = DataFrame.from_records(series_of_dicts, index=index)
@@ -2628,7 +2628,7 @@ class List(list):
26282628
[
26292629
Categorical(list("aabbc")),
26302630
SparseArray([1, np.nan, np.nan, np.nan]),
2631-
IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
2631+
IntervalArray([Interval(0, 1), Interval(1, 5)]),
26322632
PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")),
26332633
],
26342634
)
@@ -2648,12 +2648,10 @@ def test_datetime_date_tuple_columns_from_dict(self):
26482648

26492649
def test_construct_with_two_categoricalindex_series(self):
26502650
# GH 14600
2651-
s1 = Series(
2652-
[39, 6, 4], index=pd.CategoricalIndex(["female", "male", "unknown"])
2653-
)
2651+
s1 = Series([39, 6, 4], index=CategoricalIndex(["female", "male", "unknown"]))
26542652
s2 = Series(
26552653
[2, 152, 2, 242, 150],
2656-
index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
2654+
index=CategoricalIndex(["f", "female", "m", "male", "unknown"]),
26572655
)
26582656
result = DataFrame([s1, s2])
26592657
expected = DataFrame(
@@ -2717,7 +2715,7 @@ def test_dataframe_constructor_infer_multiindex(self):
27172715
(["1", "2"]),
27182716
(list(date_range("1/1/2011", periods=2, freq="H"))),
27192717
(list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
2720-
([pd.Interval(left=0, right=5)]),
2718+
([Interval(left=0, right=5)]),
27212719
],
27222720
)
27232721
def test_constructor_list_str(self, input_vals, string_dtype):

pandas/tests/indexes/interval/test_interval.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def test_is_unique_interval(self, closed):
228228
assert idx.is_unique is True
229229

230230
# unique overlapping - shared endpoints
231-
idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
231+
idx = IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
232232
assert idx.is_unique is True
233233

234234
# unique nested
@@ -279,14 +279,14 @@ def test_monotonic(self, closed):
279279
assert idx._is_strictly_monotonic_decreasing is False
280280

281281
# increasing overlapping shared endpoints
282-
idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
282+
idx = IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
283283
assert idx.is_monotonic is True
284284
assert idx._is_strictly_monotonic_increasing is True
285285
assert idx.is_monotonic_decreasing is False
286286
assert idx._is_strictly_monotonic_decreasing is False
287287

288288
# decreasing overlapping shared endpoints
289-
idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
289+
idx = IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
290290
assert idx.is_monotonic is False
291291
assert idx._is_strictly_monotonic_increasing is False
292292
assert idx.is_monotonic_decreasing is True
@@ -872,7 +872,7 @@ def test_is_all_dates(self):
872872
year_2017 = Interval(
873873
Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")
874874
)
875-
year_2017_index = pd.IntervalIndex([year_2017])
875+
year_2017_index = IntervalIndex([year_2017])
876876
assert not year_2017_index._is_all_dates
877877

878878
@pytest.mark.parametrize("key", [[5], (2, 3)])

pandas/tests/indexes/multi/test_drop.py

+8
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,11 @@ def test_drop_not_lexsorted():
139139
tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi)
140140
with tm.assert_produces_warning(PerformanceWarning):
141141
tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a"))
142+
143+
144+
def test_drop_with_nan_in_index(nulls_fixture):
145+
# GH#18853
146+
mi = MultiIndex.from_tuples([("blah", nulls_fixture)], names=["name", "date"])
147+
msg = r"labels \[Timestamp\('2001-01-01 00:00:00'\)\] not found in level"
148+
with pytest.raises(KeyError, match=msg):
149+
mi.drop(pd.Timestamp("2001"), level="date")

pandas/tests/indexes/test_numeric.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
522522

523523
def test_constructor_unwraps_index(self):
524524
idx = Index([1, 2])
525-
result = pd.Int64Index(idx)
525+
result = Int64Index(idx)
526526
expected = np.array([1, 2], dtype="int64")
527527
tm.assert_numpy_array_equal(result._data, expected)
528528

@@ -614,8 +614,8 @@ def test_int_float_union_dtype(dtype):
614614
# https://github.com/pandas-dev/pandas/issues/26778
615615
# [u]int | float -> float
616616
index = Index([0, 2, 3], dtype=dtype)
617-
other = pd.Float64Index([0.5, 1.5])
618-
expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])
617+
other = Float64Index([0.5, 1.5])
618+
expected = Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])
619619
result = index.union(other)
620620
tm.assert_index_equal(result, expected)
621621

@@ -626,9 +626,9 @@ def test_int_float_union_dtype(dtype):
626626
def test_range_float_union_dtype():
627627
# https://github.com/pandas-dev/pandas/issues/26778
628628
index = pd.RangeIndex(start=0, stop=3)
629-
other = pd.Float64Index([0.5, 1.5])
629+
other = Float64Index([0.5, 1.5])
630630
result = index.union(other)
631-
expected = pd.Float64Index([0.0, 0.5, 1, 1.5, 2.0])
631+
expected = Float64Index([0.0, 0.5, 1, 1.5, 2.0])
632632
tm.assert_index_equal(result, expected)
633633

634634
result = other.union(index)

pandas/tests/series/test_constructors.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1040,7 +1040,7 @@ def test_construction_consistency(self):
10401040
"data_constructor", [list, np.array], ids=["list", "ndarray[object]"]
10411041
)
10421042
def test_constructor_infer_period(self, data_constructor):
1043-
data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
1043+
data = [Period("2000", "D"), Period("2001", "D"), None]
10441044
result = Series(data_constructor(data))
10451045
expected = Series(period_array(data))
10461046
tm.assert_series_equal(result, expected)
@@ -1057,7 +1057,7 @@ def test_construct_from_ints_including_iNaT_scalar_period_dtype(self):
10571057
assert isna(series[2])
10581058

10591059
def test_constructor_period_incompatible_frequency(self):
1060-
data = [pd.Period("2000", "D"), pd.Period("2001", "A")]
1060+
data = [Period("2000", "D"), Period("2001", "A")]
10611061
result = Series(data)
10621062
assert result.dtype == object
10631063
assert result.tolist() == data
@@ -1539,7 +1539,7 @@ def test_constructor_list_of_periods_infers_period_dtype(self):
15391539
assert series.dtype == "Period[D]"
15401540

15411541
series = Series(
1542-
[pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")]
1542+
[Period("2011-01-01", freq="D"), Period("2011-02-01", freq="D")]
15431543
)
15441544
assert series.dtype == "Period[D]"
15451545

0 commit comments

Comments
 (0)