Merge pull request #88 from pandas-dev/master

sthagen · web-flow · commit 5fba421b9181 · 2020-03-12T19:01:34.000+01:00
Sync Fork from Upstream Repo
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -75,7 +75,7 @@ Filtering in SQL is done via a WHERE clause.
     LIMIT 5;
 
 DataFrames can be filtered in multiple ways; the most intuitive of which is using
-`boolean indexing <https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing>`_.
+:ref:`boolean indexing <indexing.boolean>`.
 
 .. ipython:: python
 
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
@@ -794,8 +794,7 @@ The :ref:`Resample <timeseries.resampling>` docs.
 `Time grouping with some missing values
 <https://stackoverflow.com/questions/33637312/pandas-grouper-by-frequency-with-completeness-requirement>`__
 
-`Valid frequency arguments to Grouper
-<https://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__
+Valid frequency arguments to Grouper :ref:`Timeseries <timeseries.offset_aliases>`
 
 `Grouping using a MultiIndex
 <https://stackoverflow.com/questions/41483763/pandas-timegrouper-on-multiindex>`__
diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
@@ -1,6 +1,6 @@
 .. _whatsnew_102:
 
-What's new in 1.0.2 (March 11, 2020)
+What's new in 1.0.2 (March 12, 2020)
 ------------------------------------
 
 These are the changes in pandas 1.0.2. See :ref:`release` for a full changelog
@@ -15,22 +15,34 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 
-- Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`)
-- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
+**Groupby**
+
 - Fixed regression in :meth:`groupby(..).agg() <pandas.core.groupby.GroupBy.agg>` which was failing on frames with MultiIndex columns and a custom function (:issue:`31777`)
 - Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`)
 - Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.rolling.Rolling.corr>` when using a time offset (:issue:`31789`)
 - Fixed regression in :meth:`groupby(..).nunique() <pandas.core.groupby.DataFrameGroupBy.nunique>` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
 - Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`)
-- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
-- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
 - Fixed regression in :meth:`groupby(..).agg() <pandas.core.groupby.GroupBy.agg>` calling a user-provided function an extra time on an empty input (:issue:`31760`)
-- Fixed regression in joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` to preserve ``freq`` in simple cases (:issue:`32166`)
-- Fixed regression in the repr of an object-dtype :class:`Index` with bools and missing values (:issue:`32146`)
+
+**I/O**
+
 - Fixed regression in :meth:`read_csv` in which the ``encoding`` option was not recognized with certain file-like objects (:issue:`31819`)
+- Fixed regression in :meth:`DataFrame.to_excel` when the ``columns`` keyword argument is passed (:issue:`31677`)
+- Fixed regression in :class:`ExcelFile` where the stream passed into the function was closed by the destructor (:issue:`31467`)
+- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`)
+
+**Reindexing/alignment**
+
+- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
 - Fixed regression in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with (tz-aware) index and ``method=nearest`` (:issue:`26683`)
 - Fixed regression in :meth:`DataFrame.reindex_like` on a :class:`DataFrame` subclass raised an  ``AssertionError`` (:issue:`31925`)
+- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
+
+**Other**
+
+- Fixed regression in joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` to preserve ``freq`` in simple cases (:issue:`32166`)
 - Fixed regression in :meth:`Series.shift` with ``datetime64`` dtype when passing an integer ``fill_value`` (:issue:`32591`)
+- Fixed regression in the repr of an object-dtype :class:`Index` with bools and missing values (:issue:`32146`)
 
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -89,9 +89,9 @@ Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 - :meth:`DataFrame.swaplevels` now raises a  ``TypeError`` if the axis is not a :class:`MultiIndex`.
   Previously a ``AttributeError`` was raised (:issue:`31126`)
-- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std`` and :meth:`~DataFrameGroupby.var``)
+- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`)
   now raise a  ``TypeError`` if a not-accepted keyword argument is passed into it.
-  Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median``) (:issue:`31485`)
+  Previously an ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
 - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
 - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
 -
@@ -188,9 +188,9 @@ Performance improvements
 - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`)
 - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`)
 - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
-- The internal :meth:`Index._shallow_copy` now copies cached attributes over to the new index,
-  avoiding creating these again on the new index. This can speed up many operations
-  that depend on creating copies of existing indexes (:issue:`28584`)
+- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
+  avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
+  existing indexes (:issue:`28584`, :issue:`32640`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
@@ -122,14 +122,15 @@ def build_string(revision_range, heading="Contributors"):
     components["uline"] = "=" * len(components["heading"])
     components["authors"] = "* " + "\n* ".join(components["authors"])
 
+    # Don't make this an f-string: multi-line values must be filled in after dedent.
     tpl = textwrap.dedent(
-        f"""\
-    {components['heading']}
-    {components['uline']}
+        """\
+    {heading}
+    {uline}
 
-    {components['author_message']}
-    {components['authors']}"""
-    )
+    {author_message}
+    {authors}"""
+    ).format(**components)
     return tpl
 
 
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
@@ -1,12 +1,15 @@
 from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE
 
-from cpython.datetime cimport (datetime, date,
-                               PyDateTime_IMPORT,
-                               PyDateTime_GET_YEAR, PyDateTime_GET_MONTH,
-                               PyDateTime_GET_DAY, PyDateTime_DATE_GET_HOUR,
-                               PyDateTime_DATE_GET_MINUTE,
-                               PyDateTime_DATE_GET_SECOND,
-                               PyDateTime_DATE_GET_MICROSECOND)
+from cpython.datetime cimport (
+    PyDateTime_DATE_GET_HOUR,
+    PyDateTime_DATE_GET_MICROSECOND,
+    PyDateTime_DATE_GET_MINUTE,
+    PyDateTime_DATE_GET_SECOND,
+    PyDateTime_GET_DAY,
+    PyDateTime_GET_MONTH,
+    PyDateTime_GET_YEAR,
+    PyDateTime_IMPORT,
+)
 PyDateTime_IMPORT
 
 from numpy cimport int64_t
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -233,6 +233,7 @@ def _simple_new(cls, values: Categorical, name: Label = None):
 
         result._data = values
         result.name = name
+        result._cache = {}
 
         result._reset_identity()
         result._no_setting_name = False
@@ -242,14 +243,9 @@ def _simple_new(cls, values: Categorical, name: Label = None):
 
     @Appender(Index._shallow_copy.__doc__)
     def _shallow_copy(self, values=None, name: Label = no_default):
-        name = self.name if name is no_default else name
-
-        if values is None:
-            values = self.values
-
-        cat = Categorical(values, dtype=self.dtype)
-
-        return type(self)._simple_new(cat, name=name)
+        if values is not None:
+            values = Categorical(values, dtype=self.dtype)
+        return super()._shallow_copy(values=values, name=name)
 
     def _is_dtype_compat(self, other) -> bool:
         """
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -617,6 +617,7 @@ def _set_freq(self, freq):
 
     def _shallow_copy(self, values=None, name: Label = lib.no_default):
         name = self.name if name is lib.no_default else name
+        cache = self._cache.copy() if values is None else {}
 
         if values is None:
             values = self._data
@@ -635,7 +636,9 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default):
                     del attributes["freq"]
 
         attributes["name"] = name
-        return type(self)._simple_new(values, **attributes)
+        result = self._simple_new(values, **attributes)
+        result._cache = cache
+        return result
 
     # --------------------------------------------------------------------
     # Set Operation Methods
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -268,6 +268,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
         result = object.__new__(cls)
         result._data = dtarr
         result.name = name
+        result._cache = {}
         result._no_setting_name = False
         # For groupby perf. See note in indexes/base about _index_data
         result._index_data = dtarr._data
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -243,6 +243,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None):
         result = IntervalMixin.__new__(cls)
         result._data = array
         result.name = name
+        result._cache = {}
         result._no_setting_name = False
         result._reset_identity()
         return result
@@ -332,12 +333,15 @@ def from_tuples(
     # --------------------------------------------------------------------
 
     @Appender(Index._shallow_copy.__doc__)
-    def _shallow_copy(self, values=None, **kwargs):
+    def _shallow_copy(self, values=None, name: Label = lib.no_default):
+        name = self.name if name is lib.no_default else name
+        cache = self._cache.copy() if values is None else {}
         if values is None:
             values = self._data
-        attributes = self._get_attributes_dict()
-        attributes.update(kwargs)
-        return self._simple_new(values, **attributes)
+
+        result = self._simple_new(values, name=name)
+        result._cache = cache
+        return result
 
     @cache_readonly
     def _isnan(self):
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -233,6 +233,7 @@ def _simple_new(cls, values: PeriodArray, name: Label = None):
         # For groupby perf. See note in indexes/base about _index_data
         result._index_data = values._data
         result.name = name
+        result._cache = {}
         result._reset_identity()
         return result
 
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -180,6 +180,7 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
         result = object.__new__(cls)
         result._data = values
         result._name = name
+        result._cache = {}
         # For groupby perf. See note in indexes/base about _index_data
         result._index_data = values._data
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -366,6 +366,9 @@ def _workbook_class(self):
     def load_workbook(self, filepath_or_buffer):
         pass
 
+    def close(self):
+        pass
+
     @property
     @abc.abstractmethod
     def sheet_names(self):
@@ -895,14 +898,7 @@ def sheet_names(self):
 
     def close(self):
         """close io if necessary"""
-        if self.engine == "openpyxl":
-            # https://stackoverflow.com/questions/31416842/
-            #  openpyxl-does-not-close-excel-workbook-in-read-only-mode
-            wb = self.book
-            wb._archive.close()
-
-        if hasattr(self.io, "close"):
-            self.io.close()
+        self._reader.close()
 
     def __enter__(self):
         return self
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -492,6 +492,11 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
             filepath_or_buffer, read_only=True, data_only=True, keep_links=False
         )
 
+    def close(self):
+        # https://stackoverflow.com/questions/31416842/
+        #  openpyxl-does-not-close-excel-workbook-in-read-only-mode
+        self.book.close()
+
     @property
     def sheet_names(self) -> List[str]:
         return self.book.sheetnames
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -629,6 +629,17 @@ def test_read_from_py_localpath(self, read_ext):
 
         tm.assert_frame_equal(expected, actual)
 
+    @td.check_file_leaks
+    def test_close_from_py_localpath(self, read_ext):
+
+        # GH31467
+        str_path = os.path.join("test1" + read_ext)
+        with open(str_path, "rb") as f:
+            x = pd.read_excel(f, "Sheet1", index_col=0)
+            del x
+            # should not raise: read_excel must not close the file object it was given
+            f.read()
+
     def test_reader_seconds(self, read_ext):
         if pd.read_excel.keywords["engine"] == "pyxlsb":
             pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
@@ -1020,10 +1031,10 @@ def test_excel_read_buffer(self, engine, read_ext):
         tm.assert_frame_equal(expected, actual)
 
     def test_reader_closes_file(self, engine, read_ext):
-        f = open("test1" + read_ext, "rb")
-        with pd.ExcelFile(f) as xlsx:
-            # parses okay
-            pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine)
+        with open("test1" + read_ext, "rb") as f:
+            with pd.ExcelFile(f) as xlsx:
+                # parses okay
+                pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine)
 
         assert f.closed