
ENH: meth 'DataFrame.to_pickle' and func 'read_pickle' to accept URL GH#30163 #30301

Merged · 17 commits · Jan 9, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
@@ -227,7 +227,7 @@ Other enhancements
- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)

- :func:`to_pickle` and :func:`read_pickle` now accept URLs (:issue:`30163`)


Build Changes
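As a hedged illustration of this whatsnew entry (the bucket and key names below are hypothetical, and s3fs must be installed for S3 URLs):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

# Write a pickle directly to a remote URL (writing supports S3/GCS).
df.to_pickle("s3://my-bucket/frames/df.pkl")

# Read it back; reading also works for generic HTTP/FTP URLs.
result = pd.read_pickle("s3://my-bucket/frames/df.pkl")
```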
23 changes: 13 additions & 10 deletions pandas/_testing.py
@@ -8,7 +8,7 @@
from shutil import rmtree
import string
import tempfile
from typing import List, Optional, Union, cast
from typing import Any, List, Optional, Union, cast
import warnings
import zipfile

@@ -22,7 +22,7 @@
)

import pandas._libs.testing as _testing
from pandas._typing import FrameOrSeries
from pandas._typing import FilePathOrBuffer, FrameOrSeries
from pandas.compat import _get_lzma_file, _import_lzma

from pandas.core.dtypes.common import (
@@ -101,27 +101,30 @@ def reset_display_options():
pd.reset_option("^display.", silent=True)


def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOrSeries:
def round_trip_pickle(
obj: Any, path: Optional[FilePathOrBuffer] = None
) -> FrameOrSeries:
"""
Pickle an object and then read it again.

Parameters
----------
obj : pandas object
obj : any object
The object to pickle and then re-read.
path : str, default None
path : str, path object or file-like object, default None
The path where the pickled object is written and then read.

Returns
-------
pandas object
The original object that was pickled and then re-read.
"""
if path is None:
path = f"__{rands(10)}__.pickle"
with ensure_clean(path) as path:
pd.to_pickle(obj, path)
return pd.read_pickle(path)
_path = path
if _path is None:
_path = f"__{rands(10)}__.pickle"
with ensure_clean(_path) as path:
pd.to_pickle(obj, _path)
return pd.read_pickle(_path)


def round_trip_pathlib(writer, reader, path: Optional[str] = None):
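A minimal sketch of how the broadened helper is used, mirroring existing call sites in the test suite (the DataFrame here is arbitrary):

```python
import pandas as pd
import pandas._testing as tm

df = pd.DataFrame({"x": range(5)})
# With path=None, the helper pickles to an auto-generated temp file
# and reads it back, cleaning up afterwards.
result = tm.round_trip_pickle(df)
tm.assert_frame_equal(df, result)
```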
71 changes: 54 additions & 17 deletions pandas/io/pickle.py
@@ -1,25 +1,38 @@
""" pickle compat """
import pickle
from typing import Any, Optional
import warnings

from pandas._typing import FilePathOrBuffer
from pandas.compat import pickle_compat as pc

from pandas.io.common import get_handle, stringify_path
from pandas.io.common import get_filepath_or_buffer, get_handle


def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
def to_pickle(
obj: Any,
filepath_or_buffer: FilePathOrBuffer,
compression: Optional[str] = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
):
"""
Pickle (serialize) object to file.

Parameters
----------
obj : any object
Any python object.
path : str
File path where the pickled object will be stored.
filepath_or_buffer : str, path object or file-like object
File path, URL, or buffer where the pickled object will be stored.

.. versionchanged:: 1.0.0
Accept URL. The URL must be an S3 or GCS path.

compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
If 'infer' and 'filepath_or_buffer' is path-like, then detect compression
from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
no compression). If 'infer' and 'filepath_or_buffer' is not path-like,
then no compression is used.
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
@@ -63,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = stringify_path(path)
f, fh = get_handle(path, "wb", compression=compression, is_text=False)
fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
filepath_or_buffer, compression=compression, mode="wb"
)
if not isinstance(fp_or_buf, str) and compression == "infer":
compression = None
f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False)
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
try:
Expand All @@ -73,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
f.close()
for _f in fh:
_f.close()
if should_close:
try:
fp_or_buf.close()
except ValueError:
pass
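Because get_filepath_or_buffer returns non-path inputs unchanged, the 'infer' default degrades gracefully for buffers. A hedged sketch of that path, assuming io.BytesIO behaves like the open file handles exercised in the tests below:

```python
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
buf = io.BytesIO()
# A buffer has no file extension, so compression='infer' falls back
# to None instead of raising.
df.to_pickle(buf)
buf.seek(0)
result = pd.read_pickle(buf)
```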


def read_pickle(path, compression="infer"):
def read_pickle(
filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer"
):
"""
Load pickled pandas object (or any object) from file.

@@ -86,13 +110,17 @@ def read_pickle(path, compression="infer"):

Parameters
----------
path : str
File path where the pickled object will be loaded.
filepath_or_buffer : str, path object or file-like object
File path, URL, or buffer where the pickled object will be loaded from.

.. versionchanged:: 1.0.0
Accept URL. The URL is not limited to S3 and GCS.

compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.
If 'infer' and 'filepath_or_buffer' is path-like, then detect compression
from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
no compression). If 'infer' and 'filepath_or_buffer' is not path-like,
then no decompression is used.

Returns
-------
@@ -134,8 +162,12 @@ def read_pickle(path, compression="infer"):
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = stringify_path(path)
f, fh = get_handle(path, "rb", compression=compression, is_text=False)
fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
filepath_or_buffer, compression=compression
)
if not isinstance(fp_or_buf, str) and compression == "infer":
compression = None
f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False)

# 1) try standard library Pickle
# 2) try pickle_compat (older pandas version) to handle subclass changes
@@ -159,3 +191,8 @@ def read_pickle(path, compression="infer"):
f.close()
for _f in fh:
_f.close()
if should_close:
try:
fp_or_buf.close()
except ValueError:
pass
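The read side in one line; the URL below is hypothetical:

```python
import pandas as pd

# Unlike to_pickle, read_pickle accepts generic URLs (http, ftp, ...),
# not only S3 and GCS.
df = pd.read_pickle("https://example.com/data/df.pkl")
```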
97 changes: 97 additions & 0 deletions pandas/tests/io/test_pickle.py
@@ -22,6 +22,7 @@
import pytest

from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian
import pandas.util._test_decorators as td

import pandas as pd
from pandas import Index
@@ -390,3 +391,99 @@ def test_unicode_decode_error():
# just test the columns are correct since the values are random
excols = pd.Index(["a", "b", "c"])
tm.assert_index_equal(df.columns, excols)


# ---------------------
# tests for buffer I/O
# ---------------------


def test_pickle_buffer_roundtrip():
with tm.ensure_clean() as path:
df = tm.makeDataFrame()
with open(path, "wb") as fh:
df.to_pickle(fh)
with open(path, "rb") as fh:
result = pd.read_pickle(fh)
tm.assert_frame_equal(df, result)


# ---------------------
# tests for URL I/O
# ---------------------


@pytest.mark.parametrize(
"mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"]
)
def test_pickle_generalurl_read(monkeypatch, mockurl):
def python_pickler(obj, path):
with open(path, "wb") as fh:
pickle.dump(obj, fh, protocol=-1)

class MockReadResponse:
def __init__(self, path):
self.file = open(path, "rb")
if "gzip" in path:
self.headers = {"Content-Encoding": "gzip"}
else:
self.headers = {"Content-Encoding": None}

def read(self):
return self.file.read()

def close(self):
return self.file.close()

with tm.ensure_clean() as path:

def mock_urlopen_read(*args, **kwargs):
return MockReadResponse(path)

df = tm.makeDataFrame()
python_pickler(df, path)
monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read)
result = pd.read_pickle(mockurl)
tm.assert_frame_equal(df, result)


@td.skip_if_no("gcsfs")
@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"])
def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl):
with tm.ensure_clean() as path:

class MockGCSFileSystem:
def __init__(self, *args, **kwargs):
pass

def open(self, *args):
mode = args[1] or None
f = open(path, mode)
return f

monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
df = tm.makeDataFrame()
df.to_pickle(mockurl)
result = pd.read_pickle(mockurl)
tm.assert_frame_equal(df, result)


@td.skip_if_no("s3fs")
@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"])
def test_pickle_s3url_roundtrip(monkeypatch, mockurl):
with tm.ensure_clean() as path:

class MockS3FileSystem:
def __init__(self, *args, **kwargs):
pass

def open(self, *args):
mode = args[1] or None
f = open(path, mode)
return f

monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem)
df = tm.makeDataFrame()
df.to_pickle(mockurl)
result = pd.read_pickle(mockurl)
tm.assert_frame_equal(df, result)