Commit 1c3d64b

suzutomato authored and jreback committed
BUG: func 'to_pickle' and 'read_pickle' were not accepting URL GH#30163 (#30301)
1 parent a474a01 commit 1c3d64b

File tree: 4 files changed (+166, -27 lines)

doc/source/whatsnew/v1.0.0.rst (+2)

@@ -220,6 +220,8 @@ Other enhancements
 - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
 - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
 - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)
+- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`)
+
 
 Build Changes
 ^^^^^^^^^^^^^
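For orientation, the entry above means a pickle round trip can now target a remote URL directly rather than only a local path. A minimal sketch of that usage (the S3 location is a hypothetical example and assumes s3fs is installed; only to_pickle/read_pickle themselves come from this change):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# Write to and read back from a URL instead of a local file path.
# "s3://my-bucket/frame.pkl" is an illustrative location, not a real fixture.
df.to_pickle("s3://my-bucket/frame.pkl")
restored = pd.read_pickle("s3://my-bucket/frame.pkl")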

pandas/_testing.py (+13, -10)

@@ -8,7 +8,7 @@
 from shutil import rmtree
 import string
 import tempfile
-from typing import List, Optional, Union, cast
+from typing import Any, List, Optional, Union, cast
 import warnings
 import zipfile
 
@@ -22,7 +22,7 @@
 )
 
 import pandas._libs.testing as _testing
-from pandas._typing import FrameOrSeries
+from pandas._typing import FilePathOrBuffer, FrameOrSeries
 from pandas.compat import _get_lzma_file, _import_lzma
 
 from pandas.core.dtypes.common import (
@@ -101,27 +101,30 @@ def reset_display_options():
     pd.reset_option("^display.", silent=True)
 
 
-def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOrSeries:
+def round_trip_pickle(
+    obj: Any, path: Optional[FilePathOrBuffer] = None
+) -> FrameOrSeries:
     """
     Pickle an object and then read it again.
 
     Parameters
     ----------
-    obj : pandas object
+    obj : any object
         The object to pickle and then re-read.
-    path : str, default None
+    path : str, path object or file-like object, default None
         The path where the pickled object is written and then read.
 
     Returns
     -------
     pandas object
         The original object that was pickled and then re-read.
     """
-    if path is None:
-        path = f"__{rands(10)}__.pickle"
-    with ensure_clean(path) as path:
-        pd.to_pickle(obj, path)
-        return pd.read_pickle(path)
+    _path = path
+    if _path is None:
+        _path = f"__{rands(10)}__.pickle"
+    with ensure_clean(_path) as path:
+        pd.to_pickle(obj, _path)
+        return pd.read_pickle(_path)
 
 
 def round_trip_pathlib(writer, reader, path: Optional[str] = None):
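The relaxed signature above lets round_trip_pickle take any picklable object and, optionally, a path or buffer. A minimal usage sketch (the Series fixture is chosen here purely for illustration):

import pandas._testing as tm

# Pickle to a temporary file (a random name is generated when no path is
# given), read the pickle straight back, and compare the two objects.
ser = tm.makeTimeSeries()
result = tm.round_trip_pickle(ser)
tm.assert_series_equal(result, ser)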

pandas/io/pickle.py (+54, -17)

@@ -1,25 +1,38 @@
 """ pickle compat """
 import pickle
+from typing import Any, Optional
 import warnings
 
+from pandas._typing import FilePathOrBuffer
 from pandas.compat import pickle_compat as pc
 
-from pandas.io.common import get_handle, stringify_path
+from pandas.io.common import get_filepath_or_buffer, get_handle
 
 
-def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
+def to_pickle(
+    obj: Any,
+    filepath_or_buffer: FilePathOrBuffer,
+    compression: Optional[str] = "infer",
+    protocol: int = pickle.HIGHEST_PROTOCOL,
+):
     """
     Pickle (serialize) object to file.
 
     Parameters
     ----------
     obj : any object
         Any python object.
-    path : str
-        File path where the pickled object will be stored.
+    filepath_or_buffer : str, path object or file-like object
+        File path, URL, or buffer where the pickled object will be stored.
+
+        .. versionchanged:: 1.0.0
+           Accept URL. URL has to be of S3 or GCS.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
-        A string representing the compression to use in the output file. By
-        default, infers from the file extension in specified path.
+        If 'infer' and 'path_or_url' is path-like, then detect compression from
+        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        compression) If 'infer' and 'path_or_url' is not path-like, then use
+        None (= no decompression).
     protocol : int
         Int which indicates which protocol should be used by the pickler,
         default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
@@ -63,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
     >>> import os
     >>> os.remove("./dummy.pkl")
     """
-    path = stringify_path(path)
-    f, fh = get_handle(path, "wb", compression=compression, is_text=False)
+    fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
+        filepath_or_buffer, compression=compression, mode="wb"
+    )
+    if not isinstance(fp_or_buf, str) and compression == "infer":
+        compression = None
+    f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False)
     if protocol < 0:
         protocol = pickle.HIGHEST_PROTOCOL
     try:
@@ -73,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
         f.close()
         for _f in fh:
             _f.close()
+    if should_close:
+        try:
+            fp_or_buf.close()
+        except ValueError:
+            pass
 
 
-def read_pickle(path, compression="infer"):
+def read_pickle(
+    filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer"
+):
     """
     Load pickled pandas object (or any object) from file.
 
@@ -86,13 +110,17 @@ def read_pickle(path, compression="infer"):
 
     Parameters
     ----------
-    path : str
-        File path where the pickled object will be loaded.
+    filepath_or_buffer : str, path object or file-like object
+        File path, URL, or buffer where the pickled object will be loaded from.
+
+        .. versionchanged:: 1.0.0
+           Accept URL. URL is not limited to S3 and GCS.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
-        For on-the-fly decompression of on-disk data. If 'infer', then use
-        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
-        or '.zip' respectively, and no decompression otherwise.
-        Set to None for no decompression.
+        If 'infer' and 'path_or_url' is path-like, then detect compression from
+        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        compression) If 'infer' and 'path_or_url' is not path-like, then use
+        None (= no decompression).
 
     Returns
     -------
@@ -134,8 +162,12 @@ def read_pickle(path, compression="infer"):
     >>> import os
     >>> os.remove("./dummy.pkl")
     """
-    path = stringify_path(path)
-    f, fh = get_handle(path, "rb", compression=compression, is_text=False)
+    fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
+        filepath_or_buffer, compression=compression
+    )
+    if not isinstance(fp_or_buf, str) and compression == "infer":
+        compression = None
+    f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False)
 
     # 1) try standard library Pickle
     # 2) try pickle_compat (older pandas version) to handle subclass changes
@@ -159,3 +191,8 @@ def read_pickle(path, compression="infer"):
         f.close()
         for _f in fh:
             _f.close()
+    if should_close:
+        try:
+            fp_or_buf.close()
+        except ValueError:
+            pass
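The compression == "infer" guard added to both functions exists because a buffer has no filename extension to infer from, so 'infer' quietly falls back to no compression. A minimal sketch of the buffer path (the temporary file location is illustrative; note that to_pickle closes the handle it is given, which is why the file is reopened for reading):

import os
import tempfile

import pandas as pd

df = pd.DataFrame({"x": range(3)})

with tempfile.TemporaryDirectory() as tmp:
    target = os.path.join(tmp, "frame.pkl")

    # Handing over an open handle: there is no extension to inspect,
    # so compression='infer' becomes no compression.
    with open(target, "wb") as fh:
        df.to_pickle(fh)

    with open(target, "rb") as fh:
        result = pd.read_pickle(fh)

print(result.equals(df))  # True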

pandas/tests/io/test_pickle.py (+97)

@@ -22,6 +22,7 @@
 import pytest
 
 from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian
+import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import Index
@@ -390,3 +391,99 @@ def test_unicode_decode_error(datapath):
     # just test the columns are correct since the values are random
     excols = pd.Index(["a", "b", "c"])
     tm.assert_index_equal(df.columns, excols)
+
+
+# ---------------------
+# tests for buffer I/O
+# ---------------------
+
+
+def test_pickle_buffer_roundtrip():
+    with tm.ensure_clean() as path:
+        df = tm.makeDataFrame()
+        with open(path, "wb") as fh:
+            df.to_pickle(fh)
+        with open(path, "rb") as fh:
+            result = pd.read_pickle(fh)
+        tm.assert_frame_equal(df, result)
+
+
+# ---------------------
+# tests for URL I/O
+# ---------------------
+
+
+@pytest.mark.parametrize(
+    "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"]
+)
+def test_pickle_generalurl_read(monkeypatch, mockurl):
+    def python_pickler(obj, path):
+        with open(path, "wb") as fh:
+            pickle.dump(obj, fh, protocol=-1)
+
+    class MockReadResponse:
+        def __init__(self, path):
+            self.file = open(path, "rb")
+            if "gzip" in path:
+                self.headers = {"Content-Encoding": "gzip"}
+            else:
+                self.headers = {"Content-Encoding": None}
+
+        def read(self):
+            return self.file.read()
+
+        def close(self):
+            return self.file.close()
+
+    with tm.ensure_clean() as path:
+
+        def mock_urlopen_read(*args, **kwargs):
+            return MockReadResponse(path)
+
+        df = tm.makeDataFrame()
+        python_pickler(df, path)
+        monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read)
+        result = pd.read_pickle(mockurl)
+        tm.assert_frame_equal(df, result)
+
+
+@td.skip_if_no("gcsfs")
+@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"])
+def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl):
+    with tm.ensure_clean() as path:
+
+        class MockGCSFileSystem:
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def open(self, *args):
+                mode = args[1] or None
+                f = open(path, mode)
+                return f
+
+        monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
+        df = tm.makeDataFrame()
+        df.to_pickle(mockurl)
+        result = pd.read_pickle(mockurl)
+        tm.assert_frame_equal(df, result)
+
+
+@td.skip_if_no("s3fs")
+@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"])
+def test_pickle_s3url_roundtrip(monkeypatch, mockurl):
+    with tm.ensure_clean() as path:
+
+        class MockS3FileSystem:
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def open(self, *args):
+                mode = args[1] or None
+                f = open(path, mode)
+                return f
+
+        monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem)
+        df = tm.makeDataFrame()
+        df.to_pickle(mockurl)
+        result = pd.read_pickle(mockurl)
+        tm.assert_frame_equal(df, result)
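The HTTP/FTP test above passes because, for such URLs, read_pickle fetches the resource through urllib.request.urlopen and reads the response into an in-memory buffer, so patching urlopen short-circuits the network. A rough standalone equivalent, assuming only the response interface the test's MockReadResponse already relies on (read(), close(), and a headers mapping); the URL below is an illustrative stand-in:

import io
import pickle
from unittest import mock

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

# Stand-in for an HTTP response object; BytesIO already provides read()/close().
response = io.BytesIO(pickle.dumps(df))
response.headers = {"Content-Encoding": None}

with mock.patch("urllib.request.urlopen", return_value=response):
    result = pd.read_pickle("http://example.com/frame.pkl")

pd.testing.assert_frame_equal(result, df)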
