From 8c40f23d5482f1554a064a15eb7ba46da3bba887 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 11:18:07 -0700 Subject: [PATCH 01/10] Fix issue #36271 to disambiguate json string pd.read_json() currently fails for strings that look like an fsspec URL because they contain "://". Add another condition to fix this, at least in most cases. --- pandas/io/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index f177e08ac0089..fe2f38c0b40b0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -161,6 +161,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: return ( isinstance(url, str) and "://" in url + and not " " in url and not url.startswith(("http://", "https://")) ) From e5777f4b44b019956eaf2db93f9e4145d70bc4c0 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 13:48:19 -0700 Subject: [PATCH 02/10] BUG: pd.read_json() fails for strings that look similar to fsspec_url #36271 --- pandas/tests/io/test_common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 85a12a13d19fb..79ebd441155e1 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -52,6 +52,12 @@ class TestCommonIOCapabilities: bar2,12,13,14,15 """ + def test_is_fsspec_url(self): + some_string = 'some :// string' + expected = False + + assert icom.is_fsspec_url(some_string)==expected + def test_expand_user(self): filename = "~/sometest" expanded_name = icom._expand_user(filename) From ed49bfecf05306149be833e85537366470590243 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 14:22:44 -0700 Subject: [PATCH 03/10] fixed duplicate test_is_fsspec_url() --- pandas/tests/io/test_common.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) 
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 79ebd441155e1..68e1e1265ec08 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -52,12 +52,6 @@ class TestCommonIOCapabilities: bar2,12,13,14,15 """ - def test_is_fsspec_url(self): - some_string = 'some :// string' - expected = False - - assert icom.is_fsspec_url(some_string)==expected - def test_expand_user(self): filename = "~/sometest" expanded_name = icom._expand_user(filename) @@ -423,3 +417,5 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + # there are no white spaces in a URL + assert not icom.is_fsspec_url("gs://pandas /somethingelse.com") From 0d7e0633955acc4f87ebcab984385f3ceaa59bf7 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 14:25:05 -0700 Subject: [PATCH 04/10] Update common.py --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index fe2f38c0b40b0..89d1ae2af16cd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -161,7 +161,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: return ( isinstance(url, str) and "://" in url - and not " " in url + and " " not in url and not url.startswith(("http://", "https://")) ) From 3706d18a4b1e659f6da1aa3729c7d4c08868f3c6 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 15:32:38 -0700 Subject: [PATCH 05/10] check if url is a json string --- pandas/io/common.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 89d1ae2af16cd..80761707efb11 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -45,6 +45,8 @@ from 
pandas.core.dtypes.common import is_file_like +from pandas._libs.json import loads as json_loads + lzma = import_lzma() @@ -153,6 +155,19 @@ def urlopen(*args, **kwargs): return urllib.request.urlopen(*args, **kwargs) +def is_json(url: FilePathOrBuffer) -> bool: + """ + Returns true if the given string looks like + something json.loads can handle + """ + try: + json_loads(url) + + return True + except: + return False + + def is_fsspec_url(url: FilePathOrBuffer) -> bool: """ Returns true if the given URL looks like @@ -161,7 +176,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: return ( isinstance(url, str) and "://" in url - and " " not in url + and not is_json(url) and not url.startswith(("http://", "https://")) ) From 210d659d6c063e823de15ce01486ff0b6ad5e79a Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 15:36:41 -0700 Subject: [PATCH 06/10] Test test_is_fsspec_url() interprets json as URL --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 68e1e1265ec08..dbe99f795356f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -418,4 +418,4 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") # there are no white spaces in a URL - assert not icom.is_fsspec_url("gs://pandas /somethingelse.com") + assert not icom.is_fsspec_url('{"json": "text ://"}') From 6b2c98cae133c9f013c8078a597c6c98757bcd34 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 15:37:37 -0700 Subject: [PATCH 07/10] Update test_common.py --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index dbe99f795356f..9d30a45fc893e 
100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -417,5 +417,5 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") - # there are no white spaces in a URL + # Ensure json string is not interpreted as URL assert not icom.is_fsspec_url('{"json": "text ://"}') From 6019978967c541a1757867eab18338da451a8cce Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 10 Sep 2020 15:52:30 -0700 Subject: [PATCH 08/10] Update common.py --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 80761707efb11..dac4418e3e13d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -45,7 +45,7 @@ from pandas.core.dtypes.common import is_file_like -from pandas._libs.json import loads as json_loads +from pandas._libs.json import loads lzma = import_lzma() @@ -161,7 +161,7 @@ def is_json(url: FilePathOrBuffer) -> bool: something json.loads can handle """ try: - json_loads(url) + loads(url) return True except: From def18815c462e487ee08d7a6c1b5f5e7ef243c52 Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Thu, 17 Sep 2020 11:47:11 -0700 Subject: [PATCH 09/10] use regex to check for json in url --- pandas/io/common.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index dac4418e3e13d..e44f9569e7689 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -7,6 +7,7 @@ import mmap import os import pathlib +import re from typing import ( IO, TYPE_CHECKING, @@ -45,8 +46,6 @@ from pandas.core.dtypes.common import is_file_like -from pandas._libs.json import loads - lzma = import_lzma() @@ -158,13 +157,12 @@ def urlopen(*args, **kwargs): def 
is_json(url: FilePathOrBuffer) -> bool: """ Returns true if the given string looks like - something json.loads can handle + json """ - try: - loads(url) - + json_pattern = re.compile(r"^\s*[\[{]") + if json_pattern.match(url): return True - except: + else: return False From 943f64431ca3402c48ad93d20f3cacba9395e5da Mon Sep 17 00:00:00 2001 From: Thomas Bachlechner <35582223+tbachlechner@users.noreply.github.com> Date: Wed, 11 Nov 2020 10:02:49 -0500 Subject: [PATCH 10/10] simplify is_json --- pandas/io/common.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e44f9569e7689..d5dac940a17ba 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -160,10 +160,8 @@ def is_json(url: FilePathOrBuffer) -> bool: json """ json_pattern = re.compile(r"^\s*[\[{]") - if json_pattern.match(url): - return True - else: - return False + return json_pattern.match(url) is not None + def is_fsspec_url(url: FilePathOrBuffer) -> bool: