diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 912b658cffdb6..4bd098897dda4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -717,6 +717,7 @@ I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) +- Bug in :func:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. 
(:issue:`58281`) diff --git a/pandas/io/common.py b/pandas/io/common.py index e0076eb486976..1a9e6b472463d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -71,7 +71,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") +_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -291,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and bool(_RFC_3986_PATTERN.match(url)) + and bool(_FSSPEC_URL_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 144b36166261b..e64fab21b85a5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1753,6 +1753,7 @@ def test_read_timezone_information(self): [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", + "filecache::s3://yet-another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e162815271ab3..99af421d5aa48 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -501,6 +501,18 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +def test_is_fsspec_url_chained(): + # GH#48978 Support chained fsspec URLs + # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. 
+ assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") + assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache::://pandas/test.csv") + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format):