Skip to content

Commit 81ace33

Browse files
committed
Merge branch 'master' of https://github.com/pandas-dev/pandas into fallback-first
2 parents a4586d1 + a6a1ab2 commit 81ace33

File tree

10 files changed

+193
-74
lines changed

10 files changed

+193
-74
lines changed

doc/source/user_guide/io.rst

+21-2
Original file line numberDiff line numberDiff line change
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
285285
Quoting, compression, and file format
286286
+++++++++++++++++++++++++++++++++++++
287287

288-
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
288+
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
289289
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
290290
bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
291291
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
292292
the ZIP file must contain only one data file to be read in.
293-
Set to ``None`` for no decompression.
293+
Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
294+
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
295+
compression settings. As an example, the following could be passed for
296+
faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
294297

295298
.. versionchanged:: 0.24.0 'infer' option added and set to default.
299+
.. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
296300
thousands : str, default ``None``
297301
Thousands separator.
298302
decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
33473351
If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
33483352
``'.xz'``, respectively.
33493353

3354+
The compression parameter can also be a ``dict`` in order to pass options to the
3355+
compression protocol. It must have a ``'method'`` key set to the name
3356+
of the compression protocol, which must be one of
3357+
{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
3358+
the underlying compression library.
3359+
33503360
.. ipython:: python
33513361
33523362
df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
33833393
rt = pd.read_pickle("s1.pkl.bz2")
33843394
rt
33853395
3396+
Passing options to the compression protocol in order to speed up compression:
3397+
3398+
.. ipython:: python
3399+
3400+
df.to_pickle(
3401+
"data.pkl.gz",
3402+
compression={"method": "gzip", 'compresslevel': 1}
3403+
)
3404+
33863405
.. ipython:: python
33873406
:suppress:
33883407

doc/source/whatsnew/v1.1.0.rst

+6
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,12 @@ Other enhancements
9191
- The :meth:`DataFrame.to_feather` method now supports additional keyword
9292
arguments (e.g. to set the compression) that are added in pyarrow 0.17
9393
(:issue:`33422`).
94+
- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
95+
and :meth:`DataFrame.to_json` now support passing a dict of
96+
compression arguments when using the ``gzip`` and ``bz2`` protocols.
97+
This can be used to set a custom compression level, e.g.,
98+
``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
99+
(:issue:`33196`)
94100

95101
.. ---------------------------------------------------------------------------
96102

pandas/core/generic.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -3096,7 +3096,8 @@ def to_csv(
30963096
compression mode is 'infer' and `path_or_buf` is path-like, then
30973097
detect compression mode from the following extensions: '.gz',
30983098
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
3099-
and mode is 'zip' or inferred as 'zip', other entries passed as
3099+
and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
3100+
one of the above, other entries passed as
31003101
additional compression options.
31013102
31023103
.. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
31053106
and other entries as additional compression options if
31063107
compression mode is 'zip'.
31073108
3109+
.. versionchanged:: 1.1.0
3110+
3111+
Passing compression options as keys in dict is
3112+
supported for compression modes 'gzip' and 'bz2'
3113+
as well as 'zip'.
3114+
31083115
quoting : optional constant from csv module
31093116
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
31103117
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

pandas/core/groupby/generic.py

+12-16
Original file line numberDiff line numberDiff line change
@@ -1197,20 +1197,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
11971197

11981198
key_names = self.grouper.names
11991199

1200-
# GH12824.
1201-
def first_not_none(values):
1202-
try:
1203-
return next(com.not_none(*values))
1204-
except StopIteration:
1205-
return None
1206-
1207-
v = first_not_none(values)
1200+
# GH12824
1201+
first_not_none = next(com.not_none(*values), None)
12081202

1209-
if v is None:
1203+
if first_not_none is None:
12101204
# GH9684. If all values are None, then this will throw an error.
12111205
# We'd prefer it return an empty dataframe.
12121206
return DataFrame()
1213-
elif isinstance(v, DataFrame):
1207+
elif isinstance(first_not_none, DataFrame):
12141208
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
12151209
elif self.grouper.groupings is not None:
12161210
if len(self.grouper.groupings) > 1:
@@ -1227,6 +1221,9 @@ def first_not_none(values):
12271221

12281222
# reorder the values
12291223
values = [values[i] for i in indexer]
1224+
1225+
# update due to the potential reorder
1226+
first_not_none = next(com.not_none(*values), None)
12301227
else:
12311228

12321229
key_index = Index(keys, name=key_names[0])
@@ -1236,20 +1233,19 @@ def first_not_none(values):
12361233
key_index = None
12371234

12381235
# make Nones an empty object
1239-
v = first_not_none(values)
1240-
if v is None:
1236+
if first_not_none is None:
12411237
return DataFrame()
1242-
elif isinstance(v, NDFrame):
1238+
elif isinstance(first_not_none, NDFrame):
12431239

12441240
# this is to silence a DeprecationWarning
12451241
# TODO: Remove when default dtype of empty Series is object
1246-
kwargs = v._construct_axes_dict()
1247-
if v._constructor is Series:
1242+
kwargs = first_not_none._construct_axes_dict()
1243+
if first_not_none._constructor is Series:
12481244
backup = create_series_with_explicit_dtype(
12491245
**kwargs, dtype_if_empty=object
12501246
)
12511247
else:
1252-
backup = v._constructor(**kwargs)
1248+
backup = first_not_none._constructor(**kwargs)
12531249

12541250
values = [x if (x is not None) else backup for x in values]
12551251

pandas/io/common.py

+21-6
Original file line numberDiff line numberDiff line change
@@ -351,15 +351,21 @@ def get_handle(
351351
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
352352
and `filepath_or_buffer` is path-like, then detect compression from
353353
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
354-
no compression). If dict and compression mode is 'zip' or inferred as
355-
'zip', other entries passed as additional compression options.
354+
no compression). If dict and compression mode is one of
355+
{'zip', 'gzip', 'bz2'}, or inferred as one of the above,
356+
other entries passed as additional compression options.
356357
357358
.. versionchanged:: 1.0.0
358359
359360
May now be a dict with key 'method' as compression mode
360361
and other keys as compression options if compression
361362
mode is 'zip'.
362363
364+
.. versionchanged:: 1.1.0
365+
366+
Passing compression options as keys in dict is now
367+
supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
368+
363369
memory_map : boolean, default False
364370
See parsers._parser_params for more information.
365371
is_text : boolean, default True
@@ -394,19 +400,28 @@ def get_handle(
394400

395401
if compression:
396402

403+
# GH33398 the type ignores here seem related to mypy issue #5382;
404+
# it may be possible to remove them once that is resolved.
405+
397406
# GZ Compression
398407
if compression == "gzip":
399408
if is_path:
400-
f = gzip.open(path_or_buf, mode)
409+
f = gzip.open(
410+
path_or_buf, mode, **compression_args # type: ignore
411+
)
401412
else:
402-
f = gzip.GzipFile(fileobj=path_or_buf)
413+
f = gzip.GzipFile(
414+
fileobj=path_or_buf, **compression_args # type: ignore
415+
)
403416

404417
# BZ Compression
405418
elif compression == "bz2":
406419
if is_path:
407-
f = bz2.BZ2File(path_or_buf, mode)
420+
f = bz2.BZ2File(
421+
path_or_buf, mode, **compression_args # type: ignore
422+
)
408423
else:
409-
f = bz2.BZ2File(path_or_buf)
424+
f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore
410425

411426
# ZIP Compression
412427
elif compression == "zip":
+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
import pandas._testing as tm
5+
6+
7+
def test_mutate_groups():
8+
9+
# GH3380
10+
11+
df = pd.DataFrame(
12+
{
13+
"cat1": ["a"] * 8 + ["b"] * 6,
14+
"cat2": ["c"] * 2
15+
+ ["d"] * 2
16+
+ ["e"] * 2
17+
+ ["f"] * 2
18+
+ ["c"] * 2
19+
+ ["d"] * 2
20+
+ ["e"] * 2,
21+
"cat3": [f"g{x}" for x in range(1, 15)],
22+
"val": np.random.randint(100, size=14),
23+
}
24+
)
25+
26+
def f_copy(x):
27+
x = x.copy()
28+
x["rank"] = x.val.rank(method="min")
29+
return x.groupby("cat2")["rank"].min()
30+
31+
def f_no_copy(x):
32+
x["rank"] = x.val.rank(method="min")
33+
return x.groupby("cat2")["rank"].min()
34+
35+
grpby_copy = df.groupby("cat1").apply(f_copy)
36+
grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
37+
tm.assert_series_equal(grpby_copy, grpby_no_copy)
38+
39+
40+
def test_no_mutate_but_looks_like():
41+
42+
# GH 8467
43+
# first shows mutation indicator
44+
# second does not, but should yield the same results
45+
df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
46+
47+
result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
48+
result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
49+
tm.assert_series_equal(result1, result2)
50+
51+
52+
def test_apply_function_with_indexing():
53+
# GH: 33058
54+
df = pd.DataFrame(
55+
{"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
56+
)
57+
58+
def fn(x):
59+
x.col2[x.index[-1]] = 0
60+
return x.col2
61+
62+
result = df.groupby(["col1"], as_index=False).apply(fn)
63+
expected = pd.Series(
64+
[1, 2, 0, 4, 5, 0],
65+
index=pd.MultiIndex.from_tuples(
66+
[(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
67+
),
68+
name="col2",
69+
)
70+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_groupby.py

-45
Original file line numberDiff line numberDiff line change
@@ -921,51 +921,6 @@ def test_groupby_complex():
921921
tm.assert_series_equal(result, expected)
922922

923923

924-
def test_mutate_groups():
925-
926-
# GH3380
927-
928-
df = DataFrame(
929-
{
930-
"cat1": ["a"] * 8 + ["b"] * 6,
931-
"cat2": ["c"] * 2
932-
+ ["d"] * 2
933-
+ ["e"] * 2
934-
+ ["f"] * 2
935-
+ ["c"] * 2
936-
+ ["d"] * 2
937-
+ ["e"] * 2,
938-
"cat3": [f"g{x}" for x in range(1, 15)],
939-
"val": np.random.randint(100, size=14),
940-
}
941-
)
942-
943-
def f_copy(x):
944-
x = x.copy()
945-
x["rank"] = x.val.rank(method="min")
946-
return x.groupby("cat2")["rank"].min()
947-
948-
def f_no_copy(x):
949-
x["rank"] = x.val.rank(method="min")
950-
return x.groupby("cat2")["rank"].min()
951-
952-
grpby_copy = df.groupby("cat1").apply(f_copy)
953-
grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
954-
tm.assert_series_equal(grpby_copy, grpby_no_copy)
955-
956-
957-
def test_no_mutate_but_looks_like():
958-
959-
# GH 8467
960-
# first show's mutation indicator
961-
# second does not, but should yield the same results
962-
df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
963-
964-
result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
965-
result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
966-
tm.assert_series_equal(result1, result2)
967-
968-
969924
def test_groupby_series_indexed_differently():
970925
s1 = Series(
971926
[5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],

pandas/tests/io/test_compression.py

+41
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime():
143143
"""
144144
)
145145
subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
146+
147+
148+
@pytest.mark.parametrize(
149+
"obj",
150+
[
151+
pd.DataFrame(
152+
100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
153+
columns=["X", "Y", "Z"],
154+
),
155+
pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
156+
],
157+
)
158+
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
159+
def test_gzip_compression_level(obj, method):
160+
# GH33196
161+
with tm.ensure_clean() as path:
162+
getattr(obj, method)(path, compression="gzip")
163+
compressed_size_default = os.path.getsize(path)
164+
getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
165+
compressed_size_fast = os.path.getsize(path)
166+
assert compressed_size_default < compressed_size_fast
167+
168+
169+
@pytest.mark.parametrize(
170+
"obj",
171+
[
172+
pd.DataFrame(
173+
100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
174+
columns=["X", "Y", "Z"],
175+
),
176+
pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
177+
],
178+
)
179+
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
180+
def test_bzip_compression_level(obj, method):
181+
"""GH33196 bzip needs file size > 100k to show a size difference between
182+
compression levels, so here we just check if the call works when
183+
compression is passed as a dict.
184+
"""
185+
with tm.ensure_clean() as path:
186+
getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})

0 commit comments

Comments
 (0)