Skip to content

Commit 9528057

Browse files
authored
BUG: Recognize chained fsspec URLs (#61041)
* BUG: Recognize chained fsspec URLs * Add whatsnew note * Rename regex variable appropriately and allow more complex chaining * Fix pre-commit
1 parent b8f6bac commit 9528057

File tree

4 files changed

+16
-2
lines changed

4 files changed

+16
-2
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,7 @@ I/O
718718
^^^
719719
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
720720
- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.columns` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
721+
- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
721722
- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
722723
- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
723724
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)

pandas/io/common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171

7272
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
7373
_VALID_URLS.discard("")
74-
_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
74+
_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://")
7575

7676
BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
7777

@@ -291,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
291291
"""
292292
return (
293293
isinstance(url, str)
294-
and bool(_RFC_3986_PATTERN.match(url))
294+
and bool(_FSSPEC_URL_PATTERN.match(url))
295295
and not url.startswith(("http://", "https://"))
296296
)
297297

pandas/tests/io/json/test_pandas.py

+1
Original file line numberDiff line numberDiff line change
@@ -1753,6 +1753,7 @@ def test_read_timezone_information(self):
17531753
[
17541754
"s3://example-fsspec/",
17551755
"gcs://another-fsspec/file.json",
1756+
"filecache::s3://yet-another-fsspec/file.json",
17561757
"https://example-site.com/data",
17571758
"some-protocol://data.txt",
17581759
],

pandas/tests/io/test_common.py

+12
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,18 @@ def test_is_fsspec_url():
501501
assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
502502

503503

504+
def test_is_fsspec_url_chained():
505+
# GH#48978 Support chained fsspec URLs
506+
# See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining.
507+
assert icom.is_fsspec_url("filecache::s3://pandas/test.csv")
508+
assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip")
509+
assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip")
510+
assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv")
511+
assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv")
512+
assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv")
513+
assert not icom.is_fsspec_url("filecache::://pandas/test.csv")
514+
515+
504516
@pytest.mark.parametrize("encoding", [None, "utf-8"])
505517
@pytest.mark.parametrize("format", ["csv", "json"])
506518
def test_codecs_encoding(encoding, format):

0 commit comments

Comments
 (0)