Merge remote-tracking branch 'upstream/master' into 32bit-ci

fangchenli · fangchenli · commit 134b3359f072 · 2020-08-26T09:05:14.000-05:00
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -26,7 +26,7 @@
     // The Pythons you'd like to test against.  If not provided, defaults
     // to the current version of Python used to run `asv`.
     // "pythons": ["2.7", "3.4"],
-    "pythons": ["3.6"],
+    "pythons": ["3.8"],
 
     // The matrix of dependencies to test.  Each key is the name of a
     // package (in PyPI) and the values are version numbers.  An empty
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -540,7 +540,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397
 
 .. ipython:: python
 
-    df.describe()
+   df.describe()
 
 ``__str__`` methods now call ``__repr__`` rather than vice versa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst
@@ -15,8 +15,9 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`)
+- Performance regression for :meth:`RangeIndex.format` (:issue:`35712`)
 -
--
+
 
 .. ---------------------------------------------------------------------------
 
@@ -26,7 +27,7 @@ Bug fixes
 ~~~~~~~~~
 - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`)
 - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`)
--
+- Bug in :meth:`DatetimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -254,6 +254,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
 - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`)
 - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
+- Bug in :meth:`DataFrameGroupBy.apply` that would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1197,57 +1197,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
             return self.obj._constructor(index=keys)
 
-        key_names = self.grouper.names
-
         # GH12824
         first_not_none = next(com.not_none(*values), None)
 
         if first_not_none is None:
-            # GH9684. If all values are None, then this will throw an error.
-            # We'd prefer it return an empty dataframe.
+            # GH9684 - All values are None, return an empty frame.
             return self.obj._constructor()
         elif isinstance(first_not_none, DataFrame):
             return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
         else:
-            if len(self.grouper.groupings) > 1:
-                key_index = self.grouper.result_index
-
-            else:
-                ping = self.grouper.groupings[0]
-                if len(keys) == ping.ngroups:
-                    key_index = ping.group_index
-                    key_index.name = key_names[0]
-
-                    key_lookup = Index(keys)
-                    indexer = key_lookup.get_indexer(key_index)
-
-                    # reorder the values
-                    values = [values[i] for i in indexer]
-
-                    # update due to the potential reorder
-                    first_not_none = next(com.not_none(*values), None)
-                else:
-
-                    key_index = Index(keys, name=key_names[0])
-
-                # don't use the key indexer
-                if not self.as_index:
-                    key_index = None
+            key_index = self.grouper.result_index if self.as_index else None
 
-            # make Nones an empty object
-            if first_not_none is None:
-                return self.obj._constructor()
-            elif isinstance(first_not_none, NDFrame):
+            if isinstance(first_not_none, Series):
 
                 # this is to silence a DeprecationWarning
                 # TODO: Remove when default dtype of empty Series is object
                 kwargs = first_not_none._construct_axes_dict()
-                if isinstance(first_not_none, Series):
-                    backup = create_series_with_explicit_dtype(
-                        **kwargs, dtype_if_empty=object
-                    )
-                else:
-                    backup = first_not_none._constructor(**kwargs)
+                backup = create_series_with_explicit_dtype(
+                    **kwargs, dtype_if_empty=object
+                )
 
                 values = [x if (x is not None) else backup for x in values]
 
@@ -1256,7 +1224,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
             if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
                 if isinstance(v, Series):
                     applied_index = self._selected_obj._get_axis(self.axis)
-                    all_indexed_same = all_indexes_same([x.index for x in values])
+                    all_indexed_same = all_indexes_same((x.index for x in values))
                     singular_series = len(values) == 1 and applied_index.nlevels == 1
 
                     # GH3596
@@ -1288,7 +1256,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                         # GH 8467
                         return self._concat_objects(keys, values, not_indexed_same=True)
 
-                if self.axis == 0 and isinstance(v, ABCSeries):
                     # GH6124 if the list of Series have a consistent name,
                     # then propagate that name to the result.
                     index = v.index.copy()
@@ -1301,34 +1268,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                         if len(names) == 1:
                             index.name = list(names)[0]
 
-                    # normally use vstack as its faster than concat
-                    # and if we have mi-columns
-                    if (
-                        isinstance(v.index, MultiIndex)
-                        or key_index is None
-                        or isinstance(key_index, MultiIndex)
-                    ):
-                        stacked_values = np.vstack([np.asarray(v) for v in values])
-                        result = self.obj._constructor(
-                            stacked_values, index=key_index, columns=index
-                        )
-                    else:
-                        # GH5788 instead of stacking; concat gets the
-                        # dtypes correct
-                        from pandas.core.reshape.concat import concat
-
-                        result = concat(
-                            values,
-                            keys=key_index,
-                            names=key_index.names,
-                            axis=self.axis,
-                        ).unstack()
-                        result.columns = index
-                elif isinstance(v, ABCSeries):
+                    # Combine values
+                    # vstack+constructor is faster than concat and handles MI-columns
                     stacked_values = np.vstack([np.asarray(v) for v in values])
+
+                    if self.axis == 0:
+                        index = key_index
+                        columns = v.index.copy()
+                        if columns.name is None:
+                            # GH6124 - propagate name of Series when it's consistent
+                            names = {v.name for v in values}
+                            if len(names) == 1:
+                                columns.name = list(names)[0]
+                    else:
+                        index = v.index
+                        columns = key_index
+                        stacked_values = stacked_values.T
+
                     result = self.obj._constructor(
-                        stacked_values.T, index=v.index, columns=key_index
+                        stacked_values, index=index, columns=columns
                     )
+
                 elif not self.as_index:
                     # We add grouping column below, so create a frame here
                     result = DataFrame(
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -297,15 +297,16 @@ def all_indexes_same(indexes):
 
     Parameters
     ----------
-    indexes : list of Index objects
+    indexes : iterable of Index objects
 
     Returns
     -------
     bool
         True if all indexes contain the same elements, False otherwise.
     """
-    first = indexes[0]
-    for index in indexes[1:]:
+    itr = iter(indexes)
+    first = next(itr)
+    for index in itr:
         if not first.equals(index):
             return False
     return True
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -933,7 +933,9 @@ def format(
 
         return self._format_with_header(header, na_rep=na_rep)
 
-    def _format_with_header(self, header, na_rep="NaN") -> List[str_t]:
+    def _format_with_header(
+        self, header: List[str_t], na_rep: str_t = "NaN"
+    ) -> List[str_t]:
         from pandas.io.formats.format import format_array
 
         values = self._values
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -347,7 +347,7 @@ def _format_attrs(self):
             attrs.append(("length", len(self)))
         return attrs
 
-    def _format_with_header(self, header, na_rep="NaN") -> List[str]:
+    def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]:
         from pandas.io.formats.printing import pprint_thing
 
         result = [
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -354,15 +354,20 @@ def format(
         """
         header = []
         if name:
-            fmt_name = ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
-            header.append(fmt_name)
+            header.append(
+                ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
+                if self.name is not None
+                else ""
+            )
 
         if formatter is not None:
             return header + list(self.map(formatter))
 
         return self._format_with_header(header, na_rep=na_rep, date_format=date_format)
 
-    def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]:
+    def _format_with_header(
+        self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None
+    ) -> List[str]:
         return header + list(
             self._format_native_types(na_rep=na_rep, date_format=date_format)
         )
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -948,7 +948,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
     # Rendering Methods
     # __repr__ associated methods are based on MultiIndex
 
-    def _format_with_header(self, header, na_rep="NaN") -> List[str]:
+    def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]:
         return header + list(self._format_native_types(na_rep=na_rep))
 
     def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs):
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
@@ -1,7 +1,7 @@
 from datetime import timedelta
 import operator
 from sys import getsizeof
-from typing import Any
+from typing import Any, List
 import warnings
 
 import numpy as np
@@ -187,6 +187,15 @@ def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
+    def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]:
+        if not len(self._range):
+            return header
+        first_val_str = str(self._range[0])
+        last_val_str = str(self._range[-1])
+        max_length = max(len(first_val_str), len(last_val_str))
+
+        return header + [f"{x:<{max_length}}" for x in self._range]
+
     # --------------------------------------------------------------------
     _deprecation_message = (
         "RangeIndex.{} is deprecated and will be "
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -861,13 +861,14 @@ def test_apply_multi_level_name(category):
     b = [1, 2] * 5
     if category:
         b = pd.Categorical(b, categories=[1, 2, 3])
+        expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
+    else:
+        expected_index = pd.Index([1, 2], name="B")
     df = pd.DataFrame(
         {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
     ).set_index(["A", "B"])
     result = df.groupby("B").apply(lambda x: x.sum())
-    expected = pd.DataFrame(
-        {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B")
-    )
+    expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]
 
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
@@ -1,5 +1,5 @@
 import gc
-from typing import Optional, Type
+from typing import Type
 
 import numpy as np
 import pytest
@@ -33,7 +33,7 @@
 class Base:
     """ base class for index sub-class tests """
 
-    _holder: Optional[Type[Index]] = None
+    _holder: Type[Index]
     _compat_props = ["shape", "ndim", "size", "nbytes"]
 
     def create_index(self) -> Index:
@@ -686,6 +686,12 @@ def test_format(self):
         expected = [str(x) for x in idx]
         assert idx.format() == expected
 
+    def test_format_empty(self):
+        # GH35712
+        empty_idx = self._holder([])
+        assert empty_idx.format() == []
+        assert empty_idx.format(name=True) == [""]
+
     def test_hasnans_isnans(self, index):
         # GH 11343, added tests for hasnans / isnans
         if isinstance(index, MultiIndex):
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
@@ -536,6 +536,12 @@ def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key):
         with pytest.raises(KeyError, match=msg):
             df.loc[key]
 
+    def test_format_empty(self):
+        # GH35712
+        empty_idx = self._holder([], freq="A")
+        assert empty_idx.format() == []
+        assert empty_idx.format(name=True) == [""]
+
 
 def test_maybe_convert_timedelta():
     pi = PeriodIndex(["2000", "2001"], freq="D")
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
@@ -171,8 +171,14 @@ def test_cache(self):
             pass
         assert idx._cache == {}
 
+        idx.format()
+        assert idx._cache == {}
+
         df = pd.DataFrame({"a": range(10)}, index=idx)
 
+        str(df)
+        assert idx._cache == {}
+
         df.loc[50]
         assert idx._cache == {}
 
@@ -515,3 +521,9 @@ def test_engineless_lookup(self):
             idx.get_loc("a")
 
         assert "_engine" not in idx._cache
+
+    def test_format_empty(self):
+        # GH35712
+        empty_idx = self._holder(0)
+        assert empty_idx.format() == []
+        assert empty_idx.format(name=True) == [""]
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py