diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5e39ca692746b..11a6f2628ac52 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -220,6 +220,8 @@ Other enhancements - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`) - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) +- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) + Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/_testing.py b/pandas/_testing.py index 2050a18cb48bf..0b81fb0f7a8d5 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -8,7 +8,7 @@ from shutil import rmtree import string import tempfile -from typing import List, Optional, Union, cast +from typing import Any, List, Optional, Union, cast import warnings import zipfile @@ -22,7 +22,7 @@ ) import pandas._libs.testing as _testing -from pandas._typing import FrameOrSeries +from pandas._typing import FilePathOrBuffer, FrameOrSeries from pandas.compat import _get_lzma_file, _import_lzma from pandas.core.dtypes.common import ( @@ -101,15 +101,17 @@ def reset_display_options(): pd.reset_option("^display.", silent=True) -def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOrSeries: +def round_trip_pickle( + obj: Any, path: Optional[FilePathOrBuffer] = None +) -> FrameOrSeries: """ Pickle an object and then read it again. Parameters ---------- - obj : pandas object + obj : any object The object to pickle and then re-read. - path : str, default None + path : str, path object or file-like object, default None The path where the pickled object is written and then read. 
Returns @@ -117,11 +119,12 @@ def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOr pandas object The original object that was pickled and then re-read. """ - if path is None: - path = f"__{rands(10)}__.pickle" - with ensure_clean(path) as path: - pd.to_pickle(obj, path) - return pd.read_pickle(path) + _path = path + if _path is None: + _path = f"__{rands(10)}__.pickle" + with ensure_clean(_path) as path: + pd.to_pickle(obj, _path) + return pd.read_pickle(_path) def round_trip_pathlib(writer, reader, path: Optional[str] = None): diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6ce52da21b4e8..e51f24b551f31 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,13 +1,20 @@ """ pickle compat """ import pickle +from typing import Any, Optional import warnings +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc -from pandas.io.common import get_handle, stringify_path +from pandas.io.common import get_filepath_or_buffer, get_handle -def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle( + obj: Any, + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, +): """ Pickle (serialize) object to file. @@ -15,11 +22,17 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): ---------- obj : any object Any python object. - path : str - File path where the pickled object will be stored. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be stored. + + .. versionchanged:: 1.0.0 + Accept URL. URL has to be of S3 or GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. 
+    If 'infer' and 'filepath_or_buffer' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression). If 'infer' and 'filepath_or_buffer' is not path-like, then use + None (= no decompression). protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible @@ -63,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "wb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression, mode="wb" + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -73,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass -def read_pickle(path, compression="infer"): +def read_pickle( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" +): """ Load pickled pandas object (or any object) from file. @@ -86,13 +110,17 @@ def read_pickle(path, compression="infer"): Parameters ---------- - path : str - File path where the pickled object will be loaded. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be loaded from. + + .. versionchanged:: 1.0.0 + Accept URL. URL is not limited to S3 and GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. 
If 'infer', then use - gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', - or '.zip' respectively, and no decompression otherwise. - Set to None for no decompression. + If 'infer' and 'filepath_or_buffer' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression). If 'infer' and 'filepath_or_buffer' is not path-like, then use + None (= no decompression). Returns ------- @@ -134,8 +162,12 @@ def read_pickle(path, compression="infer"): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "rb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -159,3 +191,8 @@ def read_pickle(path, compression="infer"): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index ccd77f47b5e5e..3d427dde573af 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -22,6 +22,7 @@ import pytest from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -390,3 +391,99 @@ def test_unicode_decode_error(datapath): # just test the columns are correct since the values are random excols = pd.Index(["a", "b", "c"]) tm.assert_index_equal(df.columns, excols) + + +# --------------------- +# tests for buffer I/O +# --------------------- + + +def test_pickle_buffer_roundtrip(): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + with 
open(path, "wb") as fh: + df.to_pickle(fh) + with open(path, "rb") as fh: + result = pd.read_pickle(fh) + tm.assert_frame_equal(df, result) + + +# --------------------- +# tests for URL I/O +# --------------------- + + +@pytest.mark.parametrize( + "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] +) +def test_pickle_generalurl_read(monkeypatch, mockurl): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + class MockReadResponse: + def __init__(self, path): + self.file = open(path, "rb") + if "gzip" in path: + self.headers = {"Content-Encoding": "gzip"} + else: + self.headers = {"Content-Encoding": None} + + def read(self): + return self.file.read() + + def close(self): + return self.file.close() + + with tm.ensure_clean() as path: + + def mock_urlopen_read(*args, **kwargs): + return MockReadResponse(path) + + df = tm.makeDataFrame() + python_pickler(df, path) + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) +def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockGCSFileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("s3fs") +@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) +def test_pickle_s3url_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockS3FileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + 
monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result)