Merge branch 'master' of https://github.com/pandas-dev/pandas into fallback-first

jbrockmendel · jbrockmendel · commit 81ace33e7da7 · 2020-04-10T18:35:43.000-07:00
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
 Quoting, compression, and file format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
   For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
   bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
   '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
   the ZIP file must contain only one data file to be read in.
-  Set to ``None`` for no decompression.
+  Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
+  set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
+  compression settings. As an example, the following could be passed for
+  faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
 
   .. versionchanged:: 0.24.0 'infer' option added and set to default.
+  .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
 thousands : str, default ``None``
   Thousands separator.
 decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
 If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
 ``'.xz'``, respectively.
 
+The compression parameter can also be a ``dict`` in order to pass options to the
+compression protocol. It must have a ``'method'`` key set to the name
+of the compression protocol, which must be one of
+{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
+the underlying compression library.
+
 .. ipython:: python
 
    df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
    rt = pd.read_pickle("s1.pkl.bz2")
    rt
 
+Passing options to the compression protocol in order to speed up compression:
+
+.. ipython:: python
+
+   df.to_pickle(
+       "data.pkl.gz",
+       compression={"method": "gzip", 'compresslevel': 1}
+   )
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -91,6 +91,12 @@ Other enhancements
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).
+- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
+  and :meth:`DataFrame.to_json` now support passing a dict of
+  compression arguments when using the ``gzip`` and ``bz2`` protocols.
+  This can be used to set a custom compression level, e.g.,
+  ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1}``
+  (:issue:`33196`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3096,7 +3096,8 @@ def to_csv(
             compression mode is 'infer' and `path_or_buf` is path-like, then
             detect compression mode from the following extensions: '.gz',
             '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
-            and mode is 'zip' or inferred as 'zip', other entries passed as
+            and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
+            one of the above, other entries passed as
             additional compression options.
 
             .. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
                and other entries as additional compression options if
                compression mode is 'zip'.
 
+            .. versionchanged:: 1.1.0
+
+               Passing compression options as keys in dict is
+               supported for compression modes 'gzip' and 'bz2'
+               as well as 'zip'.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
             then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1197,20 +1197,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
         key_names = self.grouper.names
 
-        # GH12824.
-        def first_not_none(values):
-            try:
-                return next(com.not_none(*values))
-            except StopIteration:
-                return None
-
-        v = first_not_none(values)
+        # GH12824
+        first_not_none = next(com.not_none(*values), None)
 
-        if v is None:
+        if first_not_none is None:
             # GH9684. If all values are None, then this will throw an error.
             # We'd prefer it return an empty dataframe.
             return DataFrame()
-        elif isinstance(v, DataFrame):
+        elif isinstance(first_not_none, DataFrame):
             return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
         elif self.grouper.groupings is not None:
             if len(self.grouper.groupings) > 1:
@@ -1227,6 +1221,9 @@ def first_not_none(values):
 
                     # reorder the values
                     values = [values[i] for i in indexer]
+
+                    # update due to the potential reorder
+                    first_not_none = next(com.not_none(*values), None)
                 else:
 
                     key_index = Index(keys, name=key_names[0])
@@ -1236,20 +1233,19 @@ def first_not_none(values):
                     key_index = None
 
             # make Nones an empty object
-            v = first_not_none(values)
-            if v is None:
+            if first_not_none is None:
                 return DataFrame()
-            elif isinstance(v, NDFrame):
+            elif isinstance(first_not_none, NDFrame):
 
                 # this is to silence a DeprecationWarning
                 # TODO: Remove when default dtype of empty Series is object
-                kwargs = v._construct_axes_dict()
-                if v._constructor is Series:
+                kwargs = first_not_none._construct_axes_dict()
+                if first_not_none._constructor is Series:
                     backup = create_series_with_explicit_dtype(
                         **kwargs, dtype_if_empty=object
                     )
                 else:
-                    backup = v._constructor(**kwargs)
+                    backup = first_not_none._constructor(**kwargs)
 
                 values = [x if (x is not None) else backup for x in values]
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -351,15 +351,21 @@ def get_handle(
         'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
         and `filepath_or_buffer` is path-like, then detect compression from
         the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
-        no compression). If dict and compression mode is 'zip' or inferred as
-        'zip', other entries passed as additional compression options.
+        no compression). If dict and compression mode is one of
+        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
+        other entries passed as additional compression options.
 
         .. versionchanged:: 1.0.0
 
            May now be a dict with key 'method' as compression mode
            and other keys as compression options if compression
            mode is 'zip'.
 
+        .. versionchanged:: 1.1.0
+
+           Passing compression options as keys in dict is now
+           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -394,19 +400,28 @@ def get_handle(
 
     if compression:
 
+        # GH33398 the type ignores here seem related to mypy issue #5382;
+        # it may be possible to remove them once that is resolved.
+
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode)
+                f = gzip.open(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf)
+                f = gzip.GzipFile(
+                    fileobj=path_or_buf, **compression_args  # type: ignore
+                )
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode)
+                f = bz2.BZ2File(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = bz2.BZ2File(path_or_buf)
+                f = bz2.BZ2File(path_or_buf, **compression_args)  # type: ignore
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py
@@ -0,0 +1,70 @@
+import numpy as np
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_mutate_groups():
+
+    # GH3380
+
+    df = pd.DataFrame(
+        {
+            "cat1": ["a"] * 8 + ["b"] * 6,
+            "cat2": ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2
+            + ["f"] * 2
+            + ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2,
+            "cat3": [f"g{x}" for x in range(1, 15)],
+            "val": np.random.randint(100, size=14),
+        }
+    )
+
+    def f_copy(x):
+        x = x.copy()
+        x["rank"] = x.val.rank(method="min")
+        return x.groupby("cat2")["rank"].min()
+
+    def f_no_copy(x):
+        x["rank"] = x.val.rank(method="min")
+        return x.groupby("cat2")["rank"].min()
+
+    grpby_copy = df.groupby("cat1").apply(f_copy)
+    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
+    tm.assert_series_equal(grpby_copy, grpby_no_copy)
+
+
+def test_no_mutate_but_looks_like():
+
+    # GH 8467
+    # first show's mutation indicator
+    # second does not, but should yield the same results
+    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
+
+    result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
+    result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
+    tm.assert_series_equal(result1, result2)
+
+
+def test_apply_function_with_indexing():
+    # GH: 33058
+    df = pd.DataFrame(
+        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
+    )
+
+    def fn(x):
+        x.col2[x.index[-1]] = 0
+        return x.col2
+
+    result = df.groupby(["col1"], as_index=False).apply(fn)
+    expected = pd.Series(
+        [1, 2, 0, 4, 5, 0],
+        index=pd.MultiIndex.from_tuples(
+            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
+        ),
+        name="col2",
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -921,51 +921,6 @@ def test_groupby_complex():
     tm.assert_series_equal(result, expected)
 
 
-def test_mutate_groups():
-
-    # GH3380
-
-    df = DataFrame(
-        {
-            "cat1": ["a"] * 8 + ["b"] * 6,
-            "cat2": ["c"] * 2
-            + ["d"] * 2
-            + ["e"] * 2
-            + ["f"] * 2
-            + ["c"] * 2
-            + ["d"] * 2
-            + ["e"] * 2,
-            "cat3": [f"g{x}" for x in range(1, 15)],
-            "val": np.random.randint(100, size=14),
-        }
-    )
-
-    def f_copy(x):
-        x = x.copy()
-        x["rank"] = x.val.rank(method="min")
-        return x.groupby("cat2")["rank"].min()
-
-    def f_no_copy(x):
-        x["rank"] = x.val.rank(method="min")
-        return x.groupby("cat2")["rank"].min()
-
-    grpby_copy = df.groupby("cat1").apply(f_copy)
-    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
-    tm.assert_series_equal(grpby_copy, grpby_no_copy)
-
-
-def test_no_mutate_but_looks_like():
-
-    # GH 8467
-    # first show's mutation indicator
-    # second does not, but should yield the same results
-    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
-
-    result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
-    result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
-    tm.assert_series_equal(result1, result2)
-
-
 def test_groupby_series_indexed_differently():
     s1 = Series(
         [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
@@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime():
         """
     )
     subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_gzip_compression_level(obj, method):
+    # GH33196
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression="gzip")
+        compressed_size_default = os.path.getsize(path)
+        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+        compressed_size_fast = os.path.getsize(path)
+        assert compressed_size_default < compressed_size_fast
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_bzip_compression_level(obj, method):
+    """GH33196 bzip needs file size > 100k to show a size difference between
+    compression levels, so here we just check if the call works when
+    compression is passed as a dict.
+    """
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
diff --git a/setup.py b/setup.py