Merge remote-tracking branch 'upstream/master' into GH28501

avinashpancham · avinashpancham · commit c91a8751ec31 · 2020-11-18T22:45:09.000+01:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -1,3 +1,5 @@
+import string
+import sys
 import warnings
 
 import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
         pd.Categorical(self.series)
 
 
+class AsType:
+    def setup(self):
+        N = 10 ** 5
+
+        random_pick = np.random.default_rng().choice
+
+        categories = {
+            "str": list(string.ascii_letters),
+            "int": np.random.randint(2 ** 16, size=154),
+            "float": sys.maxsize * np.random.random((38,)),
+            "timestamp": [
+                pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
+            ],
+        }
+
+        self.df = pd.DataFrame(
+            {col: random_pick(cats, N) for col, cats in categories.items()}
+        )
+
+        for col in ("int", "float", "timestamp"):
+            self.df[col + "_as_str"] = self.df[col].astype(str)
+
+        for col in self.df.columns:
+            self.df[col] = self.df[col].astype("category")
+
+    def astype_str(self):
+        [self.df[col].astype("str") for col in "int float timestamp".split()]
+
+    def astype_int(self):
+        [self.df[col].astype("int") for col in "int_as_str timestamp".split()]
+
+    def astype_float(self):
+        [
+            self.df[col].astype("float")
+            for col in "float_as_str int int_as_str timestamp".split()
+        ]
+
+    def astype_datetime(self):
+        self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
+
+
 class Concat:
     def setup(self):
         N = 10 ** 5
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1158,6 +1158,40 @@ Mask
    s.mask(s >= 0)
    df.mask(df >= 0)
 
+.. _indexing.np_where:
+
+Setting with enlargement conditionally using :mod:`numpy`
+---------------------------------------------------------
+
+An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
+Combined with setting a new column, you can use it to enlarge a DataFrame with a
+column whose values are determined conditionally.
+
+Suppose you have two choices to pick from in the following DataFrame, and you want to
+set a new column ``color`` to 'green' when the second column has 'Z' and to 'red'
+otherwise. You can do the following:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
+   df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
+   df
+
+If you have multiple conditions, you can use :func:`numpy.select` instead. Say there
+are three conditions, each with a corresponding choice of color, and a fourth color
+as a fallback. You can do the following:
+
+.. ipython:: python
+
+   conditions = [
+       (df['col2'] == 'Z') & (df['col1'] == 'A'),
+       (df['col2'] == 'Z') & (df['col1'] == 'B'),
+       (df['col1'] == 'B')
+   ]
+   choices = ['yellow', 'blue', 'purple']
+   df['color'] = np.select(conditions, choices, default='black')
+   df
+
 .. _indexing.query:
 
 The :meth:`~pandas.DataFrame.query` Method
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -499,6 +499,7 @@ Performance improvements
 - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
+- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
 - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
 - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
 
@@ -624,6 +625,7 @@ MultiIndex
 - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
 - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
 - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
+- Bug in :meth:`MultiIndex.drop` where ``NaN`` values were dropped when a non-existing key was given as input (:issue:`18853`)
 
 I/O
 ^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -403,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
             If copy is set to False and dtype is categorical, the original
             object is returned.
         """
-        if is_categorical_dtype(dtype):
+        if self.dtype is dtype:
+            result = self.copy() if copy else self
+
+        elif is_categorical_dtype(dtype):
             dtype = cast(Union[str, CategoricalDtype], dtype)
 
             # GH 10696/18593/18630
             dtype = self.dtype.update_dtype(dtype)
-            result = self.copy() if copy else self
-            if dtype == self.dtype:
-                return result
-            return result._set_dtype(dtype)
-        if is_extension_array_dtype(dtype):
-            return array(self, dtype=dtype, copy=copy)
-        if is_integer_dtype(dtype) and self.isna().any():
+            self = self.copy() if copy else self
+            result = self._set_dtype(dtype)
+
+        # TODO: consolidate with ndarray case?
+        elif is_extension_array_dtype(dtype):
+            result = array(self, dtype=dtype, copy=copy)
+
+        elif is_integer_dtype(dtype) and self.isna().any():
             raise ValueError("Cannot convert float NaN to integer")
-        return np.array(self, dtype=dtype, copy=copy)
+
+        elif len(self.codes) == 0 or len(self.categories) == 0:
+            result = np.array(self, dtype=dtype, copy=copy)
+
+        else:
+            # GH8628 (PERF): astype category codes instead of astyping array
+            try:
+                astyped_cats = self.categories.astype(dtype=dtype, copy=copy)
+            except (
+                TypeError,  # downstream error msg for CategoricalIndex is misleading
+                ValueError,
+            ):
+                msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
+                raise ValueError(msg)
+
+            astyped_cats = extract_array(astyped_cats, extract_numpy=True)
+            result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes))
+
+        return result
 
     @cache_readonly
     def itemsize(self) -> int:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2156,6 +2156,10 @@ def _drop_from_level(self, codes, level, errors="raise"):
         i = self._get_level_number(level)
         index = self.levels[i]
         values = index.get_indexer(codes)
+        # If nan should be dropped, it will equal -1 here. Values that are not nan
+        # but still equal -1 are missing in the index, so mark them separately.
+        nan_codes = isna(codes)
+        values[(np.equal(nan_codes, False)) & (values == -1)] = -2
 
         mask = ~algos.isin(self.codes[i], values)
         if mask.all() and errors != "ignore":
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -6,7 +6,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick
 from pandas._libs.tslibs.parsing import DateParseError, parse_time_string
-from pandas._typing import DtypeObj, Label
+from pandas._typing import DtypeObj
 from pandas.errors import InvalidIndexError
 from pandas.util._decorators import Appender, cache_readonly, doc
 
diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py
@@ -81,7 +81,7 @@ def zero(request):
 
     Examples
     --------
-    >>> arr = pd.RangeIndex(5)
+    >>> arr = RangeIndex(5)
     >>> arr / zeros
     Float64Index([nan, inf, inf, inf, inf], dtype='float64')
     """
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
@@ -465,7 +465,7 @@ def test_addition_ops(self):
             tdi + pd.Int64Index([1, 2, 3])
 
         # this is a union!
-        # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi)
+        # pytest.raises(TypeError, lambda : pd.Int64Index([1,2,3]) + tdi)
 
         result = tdi + dti  # name will be reset
         expected = DatetimeIndex(["20130102", pd.NaT, "20130105"])
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -677,7 +677,7 @@ def test_interval(self):
         tm.assert_index_equal(cat.categories, idx)
 
         # overlapping
-        idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)])
+        idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
         cat = Categorical(idx, categories=idx)
         expected_codes = np.array([0, 1], dtype="int8")
         tm.assert_numpy_array_equal(cat.codes, expected_codes)
diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py
@@ -127,7 +127,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
 
-        msg = "could not convert string to float"
+        msg = r"Cannot cast object dtype to <class 'float'>"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
 
@@ -138,7 +138,7 @@ def test_astype(self, ordered):
         tm.assert_numpy_array_equal(result, expected)
 
         result = cat.astype(int)
-        expected = np.array(cat, dtype=int)
+        expected = np.array(cat, dtype="int64")
         tm.assert_numpy_array_equal(result, expected)
 
         result = cat.astype(float)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -720,8 +720,8 @@ def test_constructor_period_dict(self):
     @pytest.mark.parametrize(
         "data,dtype",
         [
-            (pd.Period("2012-01", freq="M"), "period[M]"),
-            (pd.Period("2012-02-01", freq="D"), "period[D]"),
+            (Period("2012-01", freq="M"), "period[M]"),
+            (Period("2012-02-01", freq="D"), "period[D]"),
             (Interval(left=0, right=5), IntervalDtype("int64")),
             (Interval(left=0.1, right=0.5), IntervalDtype("float64")),
         ],
@@ -2577,7 +2577,7 @@ def test_from_records_series_list_dict(self):
     def test_from_records_series_categorical_index(self):
         # GH 32805
         index = CategoricalIndex(
-            [pd.Interval(-20, -10), pd.Interval(-10, 0), pd.Interval(0, 10)]
+            [Interval(-20, -10), Interval(-10, 0), Interval(0, 10)]
         )
         series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index)
         frame = DataFrame.from_records(series_of_dicts, index=index)
@@ -2628,7 +2628,7 @@ class List(list):
         [
             Categorical(list("aabbc")),
             SparseArray([1, np.nan, np.nan, np.nan]),
-            IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
+            IntervalArray([Interval(0, 1), Interval(1, 5)]),
             PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")),
         ],
     )
@@ -2648,12 +2648,10 @@ def test_datetime_date_tuple_columns_from_dict(self):
 
     def test_construct_with_two_categoricalindex_series(self):
         # GH 14600
-        s1 = Series(
-            [39, 6, 4], index=pd.CategoricalIndex(["female", "male", "unknown"])
-        )
+        s1 = Series([39, 6, 4], index=CategoricalIndex(["female", "male", "unknown"]))
         s2 = Series(
             [2, 152, 2, 242, 150],
-            index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
+            index=CategoricalIndex(["f", "female", "m", "male", "unknown"]),
         )
         result = DataFrame([s1, s2])
         expected = DataFrame(
@@ -2717,7 +2715,7 @@ def test_dataframe_constructor_infer_multiindex(self):
             (["1", "2"]),
             (list(date_range("1/1/2011", periods=2, freq="H"))),
             (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
-            ([pd.Interval(left=0, right=5)]),
+            ([Interval(left=0, right=5)]),
         ],
     )
     def test_constructor_list_str(self, input_vals, string_dtype):
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
@@ -228,7 +228,7 @@ def test_is_unique_interval(self, closed):
         assert idx.is_unique is True
 
         # unique overlapping - shared endpoints
-        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
+        idx = IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
         assert idx.is_unique is True
 
         # unique nested
@@ -279,14 +279,14 @@ def test_monotonic(self, closed):
         assert idx._is_strictly_monotonic_decreasing is False
 
         # increasing overlapping shared endpoints
-        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
+        idx = IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
         assert idx.is_monotonic is True
         assert idx._is_strictly_monotonic_increasing is True
         assert idx.is_monotonic_decreasing is False
         assert idx._is_strictly_monotonic_decreasing is False
 
         # decreasing overlapping shared endpoints
-        idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
+        idx = IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
         assert idx.is_monotonic is False
         assert idx._is_strictly_monotonic_increasing is False
         assert idx.is_monotonic_decreasing is True
@@ -872,7 +872,7 @@ def test_is_all_dates(self):
         year_2017 = Interval(
             Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")
         )
-        year_2017_index = pd.IntervalIndex([year_2017])
+        year_2017_index = IntervalIndex([year_2017])
         assert not year_2017_index._is_all_dates
 
     @pytest.mark.parametrize("key", [[5], (2, 3)])
diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py
@@ -139,3 +139,11 @@ def test_drop_not_lexsorted():
     tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi)
     with tm.assert_produces_warning(PerformanceWarning):
         tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a"))
+
+
+def test_drop_with_nan_in_index(nulls_fixture):
+    # GH#18853
+    mi = MultiIndex.from_tuples([("blah", nulls_fixture)], names=["name", "date"])
+    msg = r"labels \[Timestamp\('2001-01-01 00:00:00'\)\] not found in level"
+    with pytest.raises(KeyError, match=msg):
+        mi.drop(pd.Timestamp("2001"), level="date")
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -522,7 +522,7 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
 
     def test_constructor_unwraps_index(self):
         idx = Index([1, 2])
-        result = pd.Int64Index(idx)
+        result = Int64Index(idx)
         expected = np.array([1, 2], dtype="int64")
         tm.assert_numpy_array_equal(result._data, expected)
 
@@ -614,8 +614,8 @@ def test_int_float_union_dtype(dtype):
     # https://github.com/pandas-dev/pandas/issues/26778
     # [u]int | float -> float
     index = Index([0, 2, 3], dtype=dtype)
-    other = pd.Float64Index([0.5, 1.5])
-    expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])
+    other = Float64Index([0.5, 1.5])
+    expected = Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])
     result = index.union(other)
     tm.assert_index_equal(result, expected)
 
@@ -626,9 +626,9 @@ def test_int_float_union_dtype(dtype):
 def test_range_float_union_dtype():
     # https://github.com/pandas-dev/pandas/issues/26778
     index = pd.RangeIndex(start=0, stop=3)
-    other = pd.Float64Index([0.5, 1.5])
+    other = Float64Index([0.5, 1.5])
     result = index.union(other)
-    expected = pd.Float64Index([0.0, 0.5, 1, 1.5, 2.0])
+    expected = Float64Index([0.0, 0.5, 1, 1.5, 2.0])
     tm.assert_index_equal(result, expected)
 
     result = other.union(index)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -1040,7 +1040,7 @@ def test_construction_consistency(self):
         "data_constructor", [list, np.array], ids=["list", "ndarray[object]"]
     )
     def test_constructor_infer_period(self, data_constructor):
-        data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
+        data = [Period("2000", "D"), Period("2001", "D"), None]
         result = Series(data_constructor(data))
         expected = Series(period_array(data))
         tm.assert_series_equal(result, expected)
@@ -1057,7 +1057,7 @@ def test_construct_from_ints_including_iNaT_scalar_period_dtype(self):
         assert isna(series[2])
 
     def test_constructor_period_incompatible_frequency(self):
-        data = [pd.Period("2000", "D"), pd.Period("2001", "A")]
+        data = [Period("2000", "D"), Period("2001", "A")]
         result = Series(data)
         assert result.dtype == object
         assert result.tolist() == data
@@ -1539,7 +1539,7 @@ def test_constructor_list_of_periods_infers_period_dtype(self):
         assert series.dtype == "Period[D]"
 
         series = Series(
-            [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")]
+            [Period("2011-01-01", freq="D"), Period("2011-02-01", freq="D")]
         )
         assert series.dtype == "Period[D]"
 
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py
diff --git a/pandas/tests/window/moments/test_moments_consistency_expanding.py b/pandas/tests/window/moments/test_moments_consistency_expanding.py
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py
diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py