
Commit cd8728f

Merge branch 'master' of https://github.com/pandas-dev/pandas into ref-dtypes
2 parents 11ed746 + 0d9b57f commit cd8728f

File tree: 21 files changed (+211 -41 lines)

.travis.yml (+2 -2)

@@ -69,9 +69,9 @@ matrix:
       env:
         - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)"
     - dist: bionic
-      python: 3.9-dev
       env:
-        - JOB="3.9-dev" PATTERN="(not slow and not network)"
+        - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
+      python: 3.9-dev
 
 before_install:
   - echo "before_install"

doc/source/user_guide/cookbook.rst (+19)

@@ -1166,6 +1166,25 @@ Storing Attributes to a group node
     store.close()
     os.remove('test.h5')
 
+You can create or load an HDFStore in-memory by passing the ``driver``
+parameter to PyTables. Changes are only written to disk when the HDFStore
+is closed.
+
+.. ipython:: python
+
+    store = pd.HDFStore('test.h5', 'w', driver='H5FD_CORE')
+
+    df = pd.DataFrame(np.random.randn(8, 3))
+    store['test'] = df
+
+    # only after closing the store, data is written to disk:
+    store.close()
+
+.. ipython:: python
+    :suppress:
+
+    os.remove('test.h5')
+
 .. _cookbook.binary:
 
 Binary files

doc/source/user_guide/enhancingperf.rst (+8)

@@ -13,6 +13,14 @@ when we use Cython and Numba on a test function operating row-wise on the
 ``DataFrame``. Using :func:`pandas.eval` we will speed up a sum by an order of
 ~2.
 
+.. note::
+
+    In addition to following the steps in this tutorial, users interested in enhancing
+    performance are highly encouraged to install the
+    :ref:`recommended dependencies<install.recommended_dependencies>` for pandas.
+    These dependencies are often not installed by default, but will offer speed
+    improvements if present.
+
 .. _enhancingperf.cython:
 
 Cython (writing C extensions for pandas)
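The note added in this hunk points readers at pandas' recommended dependencies. As a quick, hedged sketch (assuming the usual package names ``numexpr`` and ``bottleneck``, which the pandas install guide lists as its recommended dependencies), you can check whether they are importable before expecting the speed-ups:

```python
import importlib.util


def optional_deps_installed(names=("numexpr", "bottleneck")):
    """Report which optional speed-up packages are importable, without importing them."""
    return {name: importlib.util.find_spec(name) is not None for name in names}


print(optional_deps_installed())
```

`find_spec` only probes the import machinery, so the check is cheap and has no side effects even when a package is missing.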

doc/source/user_guide/timeseries.rst (+8)

@@ -235,6 +235,8 @@ inferred frequency upon creation:
 
     pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer')
 
+.. _timeseries.converting.format:
+
 Providing a format argument
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -319,6 +321,12 @@ which can be specified. These are computed from the starting point specified by
     pd.to_datetime([1349720105100, 1349720105200, 1349720105300,
                     1349720105400, 1349720105500], unit='ms')
 
+.. note::
+
+    The ``unit`` parameter does not use the same strings as the ``format`` parameter
+    that was discussed :ref:`above <timeseries.converting.format>`. The
+    available units are listed on the documentation for :func:`pandas.to_datetime`.
+
 Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp
 with the ``tz`` argument specified will currently localize the epoch timestamps to UTC
 first then convert the result to the specified time zone. However, this behavior
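The note in this hunk distinguishes ``unit`` (fixed steps from the epoch: ``'D'``, ``'s'``, ``'ms'``, ``'us'``, ``'ns'``) from ``format`` (strftime-style codes). A minimal standalone illustration in plain pandas:

```python
import pandas as pd

# `unit` counts from the Unix epoch in fixed units...
from_epoch = pd.to_datetime(1349720105100, unit="ms")

# ...while `format` parses strings with strftime-style codes.
from_string = pd.to_datetime("2012-10-08 18:15:05.1", format="%Y-%m-%d %H:%M:%S.%f")

# Both spellings land on the same instant.
assert from_epoch == from_string
```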

doc/source/user_guide/visualization.rst (+8 -8)

@@ -443,9 +443,8 @@ Faceting, created by ``DataFrame.boxplot`` with the ``by``
 keyword, will affect the output type as well:
 
 ================ ======= ==========================
-``return_type=`` Faceted Output type
----------------- ------- --------------------------
-
+``return_type``  Faceted Output type
+================ ======= ==========================
 ``None``         No      axes
 ``None``         Yes     2-D ndarray of axes
 ``'axes'``       No      axes

@@ -1424,7 +1423,7 @@ Here is an example of one way to easily plot group means with standard deviation
     # Plot
     fig, ax = plt.subplots()
     @savefig errorbar_example.png
-    means.plot.bar(yerr=errors, ax=ax, capsize=4)
+    means.plot.bar(yerr=errors, ax=ax, capsize=4, rot=0)
 
 .. ipython:: python
     :suppress:

@@ -1445,9 +1444,9 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :
 
 .. ipython:: python
 
-    fig, ax = plt.subplots(1, 1)
+    fig, ax = plt.subplots(1, 1, figsize=(7, 6.5))
     df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c'])
-    ax.get_xaxis().set_visible(False)   # Hide Ticks
+    ax.xaxis.tick_top()  # Display x-axis ticks on top.
 
     @savefig line_plot_table_true.png
     df.plot(table=True, ax=ax)

@@ -1464,8 +1463,9 @@ as seen in the example below.
 
 .. ipython:: python
 
-    fig, ax = plt.subplots(1, 1)
-    ax.get_xaxis().set_visible(False)   # Hide Ticks
+    fig, ax = plt.subplots(1, 1, figsize=(7, 6.75))
+    ax.xaxis.tick_top()  # Display x-axis ticks on top.
+
     @savefig line_plot_table_data.png
     df.plot(table=np.round(df.T, 2), ax=ax)
 

doc/source/whatsnew/v1.1.0.rst (+2)

@@ -956,6 +956,7 @@ MultiIndex
     df.loc[(['b', 'a'], [2, 1]), :]
 
 - Bug in :meth:`MultiIndex.intersection` was not guaranteed to preserve order when ``sort=False``. (:issue:`31325`)
+- Bug in :meth:`DataFrame.truncate` was dropping :class:`MultiIndex` names. (:issue:`34564`)
 
 .. ipython:: python

@@ -1058,6 +1059,7 @@ Reshaping
 - Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`)
 - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`)
 - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`)
+- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements with ``S`` dtype (:issue:`34529`)
 
 Sparse
 ^^^^^^

pandas/core/dtypes/cast.py (+1 -1)

@@ -1608,7 +1608,7 @@ def construct_1d_ndarray_preserving_na(
     """
     subarr = np.array(values, dtype=dtype, copy=copy)
 
-    if dtype is not None and dtype.kind in ("U", "S"):
+    if dtype is not None and dtype.kind == "U":
         # GH-21083
         # We can't just return np.array(subarr, dtype='str') since
         # NumPy will convert the non-string objects into strings
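The one-line change above narrows pandas' special-casing from both NumPy string kinds to unicode only, so byte strings are no longer funneled through the string-conversion workaround. A small refresher on the ``dtype.kind`` codes involved (plain NumPy, not the pandas internal function):

```python
import numpy as np

# "U" marks fixed-width unicode strings, "S" fixed-width byte strings.
assert np.dtype("U4").kind == "U"
assert np.dtype("S4").kind == "S"

# An array built from Python bytes gets an "S" dtype, which the changed
# branch now leaves alone instead of treating it like unicode strings.
arr = np.array([b"abcd", b"efgh"])
assert arr.dtype.kind == "S"
```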

pandas/core/indexes/base.py (+6 -1)

@@ -518,7 +518,12 @@ def is_(self, other) -> bool:
 
         Returns
         -------
-        True if both have same underlying data, False otherwise : bool
+        bool
+            True if both have same underlying data, False otherwise.
+
+        See Also
+        --------
+        Index.identical : Works like ``Index.is_`` but also checks metadata.
         """
         # use something other than None to be clearer
         return self._id is getattr(other, "_id", Ellipsis) and self._id is not None
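The docstring change above contrasts ``Index.is_`` (object identity of the underlying data) with ``Index.identical`` (value and metadata equality). The difference in one runnable sketch:

```python
import pandas as pd

i1 = pd.Index([1, 2, 3], name="n")
i2 = i1                              # same underlying object
i3 = pd.Index([1, 2, 3], name="n")   # equal values and metadata, new object

assert i1.is_(i2)        # shares the underlying data
assert not i1.is_(i3)    # a fresh index is a different object...
assert i1.identical(i3)  # ...yet identical: equal values and metadata
```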

pandas/core/indexes/multi.py (+6 -1)

@@ -3193,7 +3193,12 @@ def truncate(self, before=None, after=None):
         new_codes = [level_codes[left:right] for level_codes in self.codes]
         new_codes[0] = new_codes[0] - i
 
-        return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False)
+        return MultiIndex(
+            levels=new_levels,
+            codes=new_codes,
+            names=self._names,
+            verify_integrity=False,
+        )
 
     def equals(self, other) -> bool:
         """

pandas/io/pytables.py (+27 -12)

@@ -447,8 +447,8 @@ class HDFStore:
 
     Parameters
     ----------
-    path : string
-        File path to HDF5 file
+    path : str
+        File path to HDF5 file.
     mode : {'a', 'w', 'r', 'r+'}, default 'a'
 
         ``'r'``

@@ -462,18 +462,20 @@ class HDFStore:
         ``'r+'``
             It is similar to ``'a'``, but the file must already exist.
     complevel : int, 0-9, default None
-        Specifies a compression level for data.
-        A value of 0 or None disables compression.
+        Specifies a compression level for data.
+        A value of 0 or None disables compression.
     complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
-        Specifies the compression library to be used.
-        As of v0.20.2 these additional compressors for Blosc are supported
-        (default if no compressor specified: 'blosc:blosclz'):
-        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
-        'blosc:zlib', 'blosc:zstd'}.
-        Specifying a compression library which is not available issues
-        a ValueError.
+        Specifies the compression library to be used.
+        As of v0.20.2 these additional compressors for Blosc are supported
+        (default if no compressor specified: 'blosc:blosclz'):
+        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+        'blosc:zlib', 'blosc:zstd'}.
+        Specifying a compression library which is not available issues
+        a ValueError.
     fletcher32 : bool, default False
-        If applying compression use the fletcher32 checksum
+        If applying compression use the fletcher32 checksum.
+    **kwargs
+        These parameters will be passed to the PyTables open_file method.

@@ -482,6 +484,17 @@ class HDFStore:
     >>> store['foo'] = bar   # write to HDF5
     >>> bar = store['foo']   # retrieve
     >>> store.close()
+
+    **Create or load HDF5 file in-memory**
+
+    When passing the `driver` option to the PyTables open_file method through
+    **kwargs, the HDF5 file is loaded or created in-memory and will only be
+    written when closed:
+
+    >>> bar = pd.DataFrame(np.random.randn(10, 4))
+    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
+    >>> store['foo'] = bar
+    >>> store.close()   # only now, data is written to disk
     """
 
     _handle: Optional["File"]

@@ -634,6 +647,8 @@ def open(self, mode: str = "a", **kwargs):
         ----------
         mode : {'a', 'w', 'r', 'r+'}, default 'a'
             See HDFStore docstring or tables.open_file for info about modes
+        **kwargs
+            These parameters will be passed to the PyTables open_file method.
         """
         tables = _tables()
pandas/tests/arrays/sparse/test_array.py (+12)

@@ -1295,3 +1295,15 @@ def test_map_missing():
 
     result = arr.map({0: 10, 1: 11})
     tm.assert_sp_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("fill_value", [np.nan, 1])
+def test_dropna(fill_value):
+    # GH-28287
+    arr = SparseArray([np.nan, 1], fill_value=fill_value)
+    exp = SparseArray([1.0], fill_value=fill_value)
+    tm.assert_sp_array_equal(arr.dropna(), exp)
+
+    df = pd.DataFrame({"a": [0, 1], "b": arr})
+    expected_df = pd.DataFrame({"a": [1], "b": exp}, index=pd.Int64Index([1]))
+    tm.assert_equal(df.dropna(), expected_df)

pandas/tests/extension/json/array.py (+2 -4)

@@ -179,13 +179,11 @@ def astype(self, dtype, copy=True):
     def unique(self):
         # Parent method doesn't work since np.array will try to infer
         # a 2-dim object.
-        return type(self)(
-            [dict(x) for x in list({tuple(d.items()) for d in self.data})]
-        )
+        return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])
 
     @classmethod
     def _concat_same_type(cls, to_concat):
-        data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
+        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
         return cls(data)
 
     def _values_for_factorize(self):
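Both simplifications above drop a redundant intermediate list: a set can be iterated directly, and ``chain.from_iterable`` accepts a generator. The underlying idioms, outside the extension-array class:

```python
import itertools

# Dicts are unhashable, so uniqueness is computed over hashable
# snapshots of their items; no list() around the set is needed.
data = [{"a": 1}, {"b": 2}, {"a": 1}]
unique = [dict(x) for x in {tuple(d.items()) for d in data}]

# chain.from_iterable consumes a generator expression directly,
# avoiding the throwaway inner list.
concatenated = list(itertools.chain.from_iterable(chunk for chunk in ([1, 2], [3])))
```

Note that the set-based deduplication does not preserve the original order of the dicts, which is acceptable for the ``unique`` use case shown in the diff.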

pandas/tests/frame/methods/test_diff.py (+45)

@@ -169,3 +169,48 @@ def test_diff_sparse(self):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "axis,expected",
+        [
+            (
+                0,
+                pd.DataFrame(
+                    {
+                        "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
+                        "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
+                        "c": np.repeat(np.nan, 8),
+                        "d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
+                    },
+                    dtype="Int64",
+                ),
+            ),
+            (
+                1,
+                pd.DataFrame(
+                    {
+                        "a": np.repeat(np.nan, 8),
+                        "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
+                        "c": np.repeat(np.nan, 8),
+                        "d": np.repeat(np.nan, 8),
+                    },
+                    dtype="Int64",
+                ),
+            ),
+        ],
+    )
+    def test_diff_integer_na(self, axis, expected):
+        # GH#24171 IntegerNA Support for DataFrame.diff()
+        df = pd.DataFrame(
+            {
+                "a": np.repeat([0, 1, np.nan, 2], 2),
+                "b": np.tile([0, 1, np.nan, 2], 2),
+                "c": np.repeat(np.nan, 8),
+                "d": np.arange(1, 9) ** 2,
+            },
+            dtype="Int64",
+        )
+
+        # Test case for default behaviour of diff
+        result = df.diff(axis=axis)
+        tm.assert_frame_equal(result, expected)
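The parametrized test above exercises ``diff`` on the nullable ``Int64`` dtype. The essential behavior in a few standalone lines (assumes a pandas release with this support):

```python
import pandas as pd

s = pd.Series([0, 1, pd.NA, 2], dtype="Int64")
d = s.diff()

# diff propagates <NA> around missing values and keeps the nullable dtype.
assert str(d.dtype) == "Int64"
assert d.isna().tolist() == [True, False, True, True]
assert d.iloc[1] == 1
```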

pandas/tests/frame/methods/test_truncate.py (+13)

@@ -104,3 +104,16 @@ def test_truncate_decreasing_index(self, before, after, indices, klass):
         result = values.truncate(before=before, after=after)
         expected = values.loc[indices]
         tm.assert_frame_equal(result, expected)
+
+    def test_truncate_multiindex(self):
+        # GH 34564
+        mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"])
+        s1 = pd.DataFrame(range(mi.shape[0]), index=mi, columns=["col"])
+        result = s1.truncate(before=2, after=3)
+
+        df = pd.DataFrame.from_dict(
+            {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
+        )
+        expected = df.set_index(["L1", "L2"])
+
+        tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_apply.py (+11)

@@ -785,6 +785,17 @@ def non_reducing_function(val):
         df.applymap(func)
         assert values == df.a.to_list()
 
+    def test_apply_with_byte_string(self):
+        # GH 34529
+        df = pd.DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"])
+        expected = pd.DataFrame(
+            np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object
+        )
+        # After the apply we expect a DataFrame just
+        # like the original, but with the object dtype
+        result = df.apply(lambda x: x.astype("object"))
+        tm.assert_frame_equal(result, expected)
+
 
 class TestInferOutputShape:
     # the user has supplied an opaque UDF where
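The regression test above can be condensed into a standalone check with plain pandas (no test harness assumed):

```python
import numpy as np
import pandas as pd

# A column built from byte strings survives a dtype-only apply unchanged.
df = pd.DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"])
result = df.apply(lambda x: x.astype("object"))

assert result["col"].tolist() == [b"abcd", b"efgh"]
assert result["col"].dtype == object
```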

pandas/tests/indexes/multi/test_analytics.py (+8 -2)

@@ -30,27 +30,33 @@ def test_groupby(idx):
     tm.assert_dict_equal(groups, exp)
 
 
-def test_truncate():
+def test_truncate_multiindex():
+    # GH 34564 for MultiIndex level names check
     major_axis = Index(list(range(4)))
     minor_axis = Index(list(range(2)))
 
     major_codes = np.array([0, 0, 1, 2, 3, 3])
     minor_codes = np.array([0, 1, 0, 1, 0, 1])
 
     index = MultiIndex(
-        levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
+        levels=[major_axis, minor_axis],
+        codes=[major_codes, minor_codes],
+        names=["L1", "L2"],
     )
 
     result = index.truncate(before=1)
     assert "foo" not in result.levels[0]
     assert 1 in result.levels[0]
+    assert index.names == result.names
 
     result = index.truncate(after=1)
    assert 2 not in result.levels[0]
     assert 1 in result.levels[0]
+    assert index.names == result.names
 
     result = index.truncate(before=1, after=2)
     assert len(result.levels[0]) == 2
+    assert index.names == result.names
 
     msg = "after < before"
     with pytest.raises(ValueError, match=msg):
