Skip to content

Commit eeff2b0

Browse files
authored
Fix issue #36271 - pd.read_json() fails for strings that look similar to fsspec_url (#44619)
1 parent aff0694 commit eeff2b0

File tree

4 files changed

+24
-1
lines changed

4 files changed

+24
-1
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,7 @@ I/O
785785
- Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`)
786786
- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
787787
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
788+
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse JSON strings containing ``"://"`` (:issue:`36271`)
788789
-
789790

790791
Period

pandas/io/common.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import mmap
1919
import os
2020
from pathlib import Path
21+
import re
2122
from typing import (
2223
IO,
2324
Any,
@@ -59,6 +60,7 @@
5960

6061
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
6162
_VALID_URLS.discard("")
63+
_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
6264

6365
BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
6466

@@ -244,7 +246,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
244246
"""
245247
return (
246248
isinstance(url, str)
247-
and "://" in url
249+
and bool(_RFC_3986_PATTERN.match(url))
248250
and not url.startswith(("http://", "https://"))
249251
)
250252

pandas/tests/io/json/test_pandas.py

+15
Original file line numberDiff line numberDiff line change
@@ -1527,6 +1527,21 @@ def test_read_timezone_information(self):
15271527
expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC"))
15281528
tm.assert_series_equal(result, expected)
15291529

1530+
@pytest.mark.parametrize(
1531+
"url",
1532+
[
1533+
"s3://example-fsspec/",
1534+
"gcs://another-fsspec/file.json",
1535+
"https://example-site.com/data",
1536+
"some-protocol://data.txt",
1537+
],
1538+
)
1539+
def test_read_json_with_url_value(self, url):
1540+
# GH 36271
1541+
result = read_json(f'{{"url":{{"0":"{url}"}}}}')
1542+
expected = DataFrame({"url": [url]})
1543+
tm.assert_frame_equal(result, expected)
1544+
15301545
@pytest.mark.parametrize(
15311546
"date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")]
15321547
)

pandas/tests/io/test_common.py

+5
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ def test_is_fsspec_url():
494494
assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
495495
assert not icom.is_fsspec_url("/local/path")
496496
assert not icom.is_fsspec_url("relative/local/path")
497+
# a string that merely contains an fsspec URL should not be recognized as one
498+
assert not icom.is_fsspec_url("this is not fsspec://url")
499+
assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}")
500+
# accept everything that conforms to the RFC 3986 scheme syntax
501+
assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
497502

498503

499504
@pytest.mark.parametrize("encoding", [None, "utf-8"])

0 commit comments

Comments
 (0)