Merge branch 'master' into duplicate-cut

debnathshoham · web-flow · commit 934a05276090 · 2021-07-25T20:09:07.000+05:30
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
@@ -30,7 +30,10 @@ fi
 echo $PYTEST_CMD
 sh -c "$PYTEST_CMD"
 
-PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS  --dist=loadfile $TEST_ARGS $COVERAGE pandas"
+if [[ "$PANDAS_DATA_MANAGER" != "array" ]]; then
+    # Run the ArrayManager tests separately only when PANDAS_DATA_MANAGER is not "array"; otherwise PYTEST_CMD above should already have run them
+    PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS  --dist=loadfile $TEST_ARGS $COVERAGE pandas"
 
-echo $PYTEST_AM_CMD
-sh -c "$PYTEST_AM_CMD"
+    echo $PYTEST_AM_CMD
+    sh -c "$PYTEST_AM_CMD"
+fi
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
@@ -24,6 +24,7 @@ Version 1.3
 .. toctree::
    :maxdepth: 2
 
+   v1.3.2
    v1.3.1
    v1.3.0
 
diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst
@@ -48,4 +48,4 @@ Bug fixes
 Contributors
 ~~~~~~~~~~~~
 
-.. contributors:: v1.3.0..v1.3.1|HEAD
+.. contributors:: v1.3.0..v1.3.1
diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
@@ -0,0 +1,45 @@
+.. _whatsnew_132:
+
+What's new in 1.3.2 (August ??, 2021)
+-------------------------------------
+
+These are the changes in pandas 1.3.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_132.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_132.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_132.other:
+
+Other
+~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_132.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.3.1..v1.3.2|HEAD
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -29,6 +29,7 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`)
 - Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
 -  Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
@@ -235,6 +236,7 @@ MultiIndex
 ^^^^^^^^^^
 - Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
 - Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
+- Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`)
 -
 
 I/O
@@ -262,8 +264,8 @@ Groupby/resample/rolling
 
 Reshaping
 ^^^^^^^^^
+- Bug in :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
 - Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
--
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1033,7 +1033,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
             self._insert_inaxis_grouper_inplace(result)
             result.index = Index(range(len(result)))
 
-        return result._convert(datetime=True)
+        return result
 
     agg = aggregate
 
@@ -1684,6 +1684,8 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
         if self.axis == 1:
             result = result.T
 
+        # Note: we only need to pass datetime=True in order to get numeric
+        #  values converted
         return self._reindex_output(result)._convert(datetime=True)
 
     def _iterate_column_groupbys(self, obj: FrameOrSeries):
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -619,11 +619,20 @@ def group_arraylike(self) -> ArrayLike:
         Analogous to result_index, but holding an ArrayLike to ensure
         we can can retain ExtensionDtypes.
         """
+        if self._group_index is not None:
+            # _group_index is set in __init__ for MultiIndex cases
+            return self._group_index._values
+
+        elif self._all_grouper is not None:
+            # retain dtype for categories, including unobserved ones
+            return self.result_index._values
+
         return self._codes_and_uniques[1]
 
     @cache_readonly
     def result_index(self) -> Index:
-        # TODO: what's the difference between result_index vs group_index?
+        # result_index retains dtype for categories, including unobserved ones,
+        #  which group_index does not
         if self._all_grouper is not None:
             group_idx = self.group_index
             assert isinstance(group_idx, CategoricalIndex)
@@ -635,7 +644,8 @@ def group_index(self) -> Index:
         if self._group_index is not None:
             # _group_index is set in __init__ for MultiIndex cases
             return self._group_index
-        uniques = self.group_arraylike
+
+        uniques = self._codes_and_uniques[1]
         return Index(uniques, name=self.name)
 
     @cache_readonly
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -885,6 +885,7 @@ def result_arraylike(self) -> ArrayLike:
         if len(self.groupings) == 1:
             return self.groupings[0].group_arraylike
 
+        # result_index is MultiIndex
         return self.result_index._values
 
     @cache_readonly
@@ -903,12 +904,12 @@ def get_group_levels(self) -> list[ArrayLike]:
         # Note: only called from _insert_inaxis_grouper_inplace, which
         #  is only called for BaseGrouper, never for BinGrouper
         if len(self.groupings) == 1:
-            return [self.groupings[0].result_index]
+            return [self.groupings[0].group_arraylike]
 
         name_list = []
         for ping, codes in zip(self.groupings, self.reconstructed_codes):
             codes = ensure_platform_int(codes)
-            levels = ping.result_index.take(codes)
+            levels = ping.group_arraylike.take(codes)
 
             name_list.append(levels)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -5514,16 +5514,6 @@ def _get_indexer_non_comparable(
         """
         if method is not None:
             other = unpack_nested_dtype(target)
-            if self._is_multi ^ other._is_multi:
-                kind = other.dtype.type if self._is_multi else self.dtype.type
-                raise TypeError(
-                    f"'<' not supported between instances of {kind} and 'tuple'"
-                )
-            elif self._is_multi and other._is_multi:
-                assert self.nlevels != other.nlevels
-                # Python allows comparison between tuples of different lengths,
-                #  but for our purposes such a comparison is not meaningful.
-                raise TypeError("'<' not supported between tuples of different lengths")
             raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
 
         no_matches = -1 * np.ones(target.shape, dtype=np.intp)
@@ -5653,14 +5643,6 @@ def _should_compare(self, other: Index) -> bool:
 
         other = unpack_nested_dtype(other)
         dtype = other.dtype
-        if other._is_multi:
-            if not self._is_multi:
-                # other contains only tuples so unless we are object-dtype,
-                #  there can never be any matches
-                return self._is_comparable_dtype(dtype)
-            return self.nlevels == other.nlevels
-            # TODO: we can get more specific requiring levels are comparable?
-
         return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
 
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2846,9 +2846,17 @@ def _maybe_to_slice(loc):
         # needs linear search within the slice
         i = self._lexsort_depth
         lead_key, follow_key = key[:i], key[i:]
-        start, stop = (
-            self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self))
-        )
+
+        if not lead_key:
+            start = 0
+            stop = len(self)
+        else:
+            try:
+                start, stop = self.slice_locs(lead_key, lead_key)
+            except TypeError as err:
+                # e.g. test_groupby_example key = ((0, 0, 1, 2), "new_col")
+                #  when self has 5 integer levels
+                raise KeyError(key) from err
 
         if start == stop:
             raise KeyError(key)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -646,8 +646,8 @@ def _get_setitem_indexer(self, key):
 
         ax = self.obj._get_axis(0)
 
-        if isinstance(ax, MultiIndex) and self.name != "iloc":
-            with suppress(TypeError, KeyError, InvalidIndexError):
+        if isinstance(ax, MultiIndex) and self.name != "iloc" and is_hashable(key):
+            with suppress(KeyError, InvalidIndexError):
                 # TypeError e.g. passed a bool
                 return ax.get_loc(key)
 
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -717,8 +717,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
         new_levels.extend(new_index.levels)
         new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
     else:
-        new_levels.append(new_index)
-        new_codes.append(np.tile(np.arange(n), kpieces))
+        new_levels.append(new_index.unique())
+        single_codes = new_index.unique().get_indexer(new_index)
+        new_codes.append(np.tile(single_codes, kpieces))
 
     if len(new_names) < len(new_levels):
         new_names.extend(new_index.names)
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
@@ -22,14 +22,14 @@ def test_grouping_grouper(self, data_for_grouping):
     def test_groupby_extension_agg(self, as_index, data_for_grouping):
         df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
         result = df.groupby("B", as_index=as_index).A.mean()
-        _, index = pd.factorize(data_for_grouping, sort=True)
+        _, uniques = pd.factorize(data_for_grouping, sort=True)
 
-        index = pd.Index(index, name="B")
-        expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
         if as_index:
+            index = pd.Index(uniques, name="B")
+            expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
             self.assert_series_equal(result, expected)
         else:
-            expected = expected.reset_index()
+            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
             self.assert_frame_equal(result, expected)
 
     def test_groupby_agg_extension(self, data_for_grouping):
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -312,10 +312,6 @@ def test_groupby_extension_apply(self):
         we'll be able to dispatch unique.
         """
 
-    @pytest.mark.parametrize("as_index", [True, False])
-    def test_groupby_extension_agg(self, as_index, data_for_grouping):
-        super().test_groupby_extension_agg(as_index, data_for_grouping)
-
     @pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
     def test_groupby_agg_extension(self, data_for_grouping):
         super().test_groupby_agg_extension(data_for_grouping)
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -269,14 +269,14 @@ def test_grouping_grouper(self, data_for_grouping):
     def test_groupby_extension_agg(self, as_index, data_for_grouping):
         df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
         result = df.groupby("B", as_index=as_index).A.mean()
-        _, index = pd.factorize(data_for_grouping, sort=True)
+        _, uniques = pd.factorize(data_for_grouping, sort=True)
 
-        index = pd.Index(index, name="B")
-        expected = pd.Series([3.0, 1.0], index=index, name="A")
         if as_index:
+            index = pd.Index(uniques, name="B")
+            expected = pd.Series([3.0, 1.0], index=index, name="A")
             self.assert_series_equal(result, expected)
         else:
-            expected = expected.reset_index()
+            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]})
             self.assert_frame_equal(result, expected)
 
     def test_groupby_agg_extension(self, data_for_grouping):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -717,6 +717,10 @@ def test_ops_not_as_index(reduction_func):
         expected = expected.rename("size")
     expected = expected.reset_index()
 
+    if reduction_func != "size":
+        # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
+        expected["a"] = expected["a"].astype(df["a"].dtype)
+
     g = df.groupby("a", as_index=False)
 
     result = getattr(g, reduction_func)()
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -1,4 +1,5 @@
 from datetime import timedelta
+import re
 
 import numpy as np
 import pytest
@@ -457,15 +458,6 @@ def test_get_indexer_kwarg_validation(self):
         with pytest.raises(ValueError, match=msg):
             mi.get_indexer(mi[:-1], tolerance="piano")
 
-    def test_get_indexer_mismatched_nlevels(self):
-        mi = MultiIndex.from_product([range(3), ["A", "B"]])
-
-        other = MultiIndex.from_product([range(3), ["A", "B"], range(2)])
-
-        msg = "tuples of different lengths"
-        with pytest.raises(TypeError, match=msg):
-            mi.get_indexer(other, method="pad")
-
 
 def test_getitem(idx):
     # scalar
@@ -698,6 +690,14 @@ def test_multiindex_get_loc_list_raises(self):
         with pytest.raises(TypeError, match=msg):
             idx.get_loc([])
 
+    def test_get_loc_nested_tuple_raises_keyerror(self):
+        # GH#42440 nested tuple key should raise KeyError, not TypeError
+        mi = MultiIndex.from_product([range(3), range(4), range(5), range(6)])
+        key = ((2, 3, 4), "foo")
+
+        with pytest.raises(KeyError, match=re.escape(str(key))):
+            mi.get_loc(key)
+
 
 class TestWhere:
     def test_where(self):
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
@@ -180,3 +180,15 @@ def test_concat_bool_with_int(self):
         result = concat([df1, df2])
         expected = concat([df1.astype("int64"), df2])
         tm.assert_frame_equal(result, expected)
+
+    def test_concat_duplicates_in_index_with_keys(self):
+        # GH#42651
+        index = [1, 1, 3]
+        data = [1, 2, 3]
+
+        df = DataFrame(data=data, index=index)
+        result = concat([df], keys=["A"], names=["ID", "date"])
+        mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
+        expected = DataFrame(data=data, index=mi)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -2530,3 +2530,15 @@ def test_empty_string_datetime_coerce__unit():
     # verify that no exception is raised even when errors='raise' is set
     result = to_datetime([1, ""], unit="s", errors="raise")
     tm.assert_index_equal(expected, result)
+
+
+@pytest.mark.parametrize("cache", [True, False])
+def test_to_datetime_monotonic_increasing_index(cache):
+    # GH28238
+    cstart = start_caching_at
+    times = date_range(Timestamp("1980"), periods=cstart, freq="YS")
+    times = times.to_frame(index=False, name="DT").sample(n=cstart, random_state=1)
+    times.index = times.index.to_series().astype(float) / 1000
+    result = to_datetime(times.iloc[:, 0], cache=cache)
+    expected = times.iloc[:, 0]
+    tm.assert_series_equal(result, expected)