From 795b260dd5a5623399847ac047cb1c549ce1321c Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Thu, 24 Oct 2024 21:40:06 +0800 Subject: [PATCH 01/11] add to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5376177d3381..359545d8726d8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -776,6 +776,7 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) From f05dec54e91924aa912d12ba480191c9a5ee4602 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Tue, 29 Oct 2024 21:25:53 +0800 Subject: [PATCH 02/11] extract the target file to access when chained URLs are used --- pandas/io/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index a76f0cf6dd34d..fa8d892605b6d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -584,6 +584,9 @@ def infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings + if "::" in filepath_or_buffer: + # chained URLs contain :: + filepath_or_buffer = filepath_or_buffer.split("::")[0] filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression From cb940606c74f2e8b0bffb7ad4fcd3cec02667b10 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Tue, 29 Oct 2024 22:31:32 +0800 Subject: [PATCH 03/11] add isinstance to filter on str inputs only --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index fa8d892605b6d..0691e6b2fa438 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -584,7 +584,7 @@ def infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings - if "::" in filepath_or_buffer: + if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer: # chained URLs contain :: filepath_or_buffer = filepath_or_buffer.split("::")[0] filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) From 9e1ba27f09a8583a290cb5028a1b03b9a1a70eb4 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Tue, 29 Oct 2024 22:31:49 +0800 Subject: [PATCH 04/11] add tests and test tar file --- pandas/tests/io/data/tar/test-csv.tar | Bin 0 -> 10240 bytes pandas/tests/io/test_common.py | 17 +++++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) create mode 100644 pandas/tests/io/data/tar/test-csv.tar diff --git a/pandas/tests/io/data/tar/test-csv.tar b/pandas/tests/io/data/tar/test-csv.tar new file mode 100644 index 0000000000000000000000000000000000000000..c3b3091348426791f9bb09e2cbd8196465074c49 GIT binary patch literal 10240 zcmeIy!3u&f9LMpUeTtqy_ur;VBIww$SCCo|(Ir>(_-P`9P?sQ8|uSTYwP;G_K4D=jTp6fjPf;uqPIF$*QWj8^<0)_xwypBC9K6; zZFE^mnfhkF%xy9kgE{|a40TNR^?gi(Hq?ddGVY7K%er~byjSA9Xepd|cWX`0%yGKpQeX`0g&0R#|0009ILKmY**5I_I{1Q0*~0R#|0009IL SKmY**5I_I{1Q0;rXMrbBPb;PX literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 10e3af601b7ef..7d86092778707 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -630,16 +630,13 @@ def test_fail_mmap(): icom.get_handle(buffer, "rb", memory_map=True) -def test_close_on_error(): - # GH 47136 - class TestError: - def close(self): - raise OSError("test") - - with pytest.raises(OSError, match="test"): - with BytesIO() as buffer: - with icom.get_handle(buffer, "rb") as handles: - handles.created_handles.append(TestError()) +def read_chained_urls_no_errors(): + tar_file = "pandas/tests/io/data/tar/test-csv.tar" + try: + pd.read_csv(f"tar://test.csv::file://{tar_file}", compression=None) + pd.read_csv(f"tar://test.csv::file://{tar_file}", compression="infer") + except Exception as e: + pytest.fail(e) @pytest.mark.parametrize( From 3aaad97728c2eec6963ec134bef744cdd2251a1d Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Tue, 29 Oct 2024 22:39:12 +0800 Subject: [PATCH 05/11] rename func to start with "test"; revert removed random test func --- pandas/tests/io/test_common.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 7d86092778707..0d97cccc6a804 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -630,7 +630,18 @@ def test_fail_mmap(): icom.get_handle(buffer, "rb", memory_map=True) -def read_chained_urls_no_errors(): +def test_close_on_error(): + # GH 47136 + class TestError: + def close(self): + raise OSError("test") + + with pytest.raises(OSError, match="test"): + with BytesIO() as buffer: + with icom.get_handle(buffer, "rb") as handles: + handles.created_handles.append(TestError()) + +def test_read_csv_chained_url_no_error(): tar_file = "pandas/tests/io/data/tar/test-csv.tar" try: pd.read_csv(f"tar://test.csv::file://{tar_file}", compression=None) @@ -638,7 +649,6 @@ def read_chained_urls_no_errors(): except Exception as e: pytest.fail(e) - @pytest.mark.parametrize( "reader", [ From 778e385e522d53ac5b4d70f85bfa3ce0c1056649 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Tue, 29 Oct 2024 22:45:25 +0800 Subject: [PATCH 06/11] formatting improvements by ruff --- pandas/tests/io/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 0d97cccc6a804..4bbe2d316c595 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -641,6 +641,7 @@ def close(self): with icom.get_handle(buffer, "rb") as handles: handles.created_handles.append(TestError()) + def test_read_csv_chained_url_no_error(): tar_file = "pandas/tests/io/data/tar/test-csv.tar" try: @@ -649,6 +650,7 @@ def test_read_csv_chained_url_no_error(): except Exception as e: pytest.fail(e) + @pytest.mark.parametrize( "reader", [ From 33b601d16c4b14d480822b8cfa5545aee5f55c6f Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Thu, 31 Oct 2024 06:26:55 +0800 Subject: [PATCH 07/11] add @td.skip_if_no("fsspec") on test func --- pandas/tests/io/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4bbe2d316c595..9770e5bfd743c 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -25,6 +25,7 @@ WASM, is_platform_windows, ) +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -642,6 +643,7 @@ def close(self): handles.created_handles.append(TestError()) +@td.skip_if_no("fsspec") def test_read_csv_chained_url_no_error(): tar_file = "pandas/tests/io/data/tar/test-csv.tar" try: From fc469c79e09558a5baae10afed2d6823bd093e8a Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Mon, 4 Nov 2024 21:02:22 +0800 Subject: [PATCH 08/11] improve test function for read_csv chained urls --- pandas/tests/io/test_common.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 9770e5bfd743c..9e63e893d6395 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -645,12 +645,17 @@ def close(self): @td.skip_if_no("fsspec") def test_read_csv_chained_url_no_error(): + # GH 60100 tar_file = "pandas/tests/io/data/tar/test-csv.tar" - try: - pd.read_csv(f"tar://test.csv::file://{tar_file}", compression=None) - pd.read_csv(f"tar://test.csv::file://{tar_file}", compression="infer") - except Exception as e: - pytest.fail(e) + + x = pd.read_csv(f"tar://test.csv::file://{tar_file}", compression=None) + y = pd.read_csv(f"tar://test.csv::file://{tar_file}", compression="infer") + + x_to_json_expected_output = '{"1;2":{"0":"3;4"}}' + y_to_json_expected_output = '{"1;2":{"0":"3;4"}}' + + assert x_to_json_expected_output == x.to_json() + assert y_to_json_expected_output == y.to_json() @pytest.mark.parametrize( From 32fef2921b3cbd1f05bb28a300495ae91eb507ec Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Wed, 6 Nov 2024 17:31:20 +0800 Subject: [PATCH 09/11] use tm.assert_frame_equal; add separator on read_csv; improve chained_file_url --- pandas/tests/io/test_common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 9e63e893d6395..67e2517479c21 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -646,16 +646,16 @@ def close(self): @td.skip_if_no("fsspec") def test_read_csv_chained_url_no_error(): # GH 60100 - tar_file = "pandas/tests/io/data/tar/test-csv.tar" + tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" + chained_file_url = f"tar://test.csv::file://{tar_file_path}" - x = pd.read_csv(f"tar://test.csv::file://{tar_file}", compression=None) - y = pd.read_csv(f"tar://test.csv::file://{tar_file}", compression="infer") + result_a = pd.read_csv(chained_file_url, compression=None, sep=";") + result_b = pd.read_csv(chained_file_url, compression="infer", sep=";") - x_to_json_expected_output = '{"1;2":{"0":"3;4"}}' - y_to_json_expected_output = '{"1;2":{"0":"3;4"}}' + expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}}) - assert x_to_json_expected_output == x.to_json() - assert y_to_json_expected_output == y.to_json() + tm.assert_frame_equal(expected, result_a) + tm.assert_frame_equal(expected, result_b) @pytest.mark.parametrize( From 0dc0444111a32d8775334683400cbac2d6c73081 Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Mon, 11 Nov 2024 20:04:12 +0800 Subject: [PATCH 10/11] add min_version to td.skip_if_no due to ffspec bug --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 67e2517479c21..fd7b40f3ac7ac 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -643,7 +643,7 @@ def close(self): handles.created_handles.append(TestError()) -@td.skip_if_no("fsspec") +@td.skip_if_no("fsspec", min_version="2023.1.0") def test_read_csv_chained_url_no_error(): # GH 60100 tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" From 04f92462947095a8ed442dd35de8491c56f5bc5d Mon Sep 17 00:00:00 2001 From: KevsterAmp Date: Mon, 11 Nov 2024 20:08:45 +0800 Subject: [PATCH 11/11] utilize pytest.mark.parametrize for testing --- pandas/tests/io/test_common.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index fd7b40f3ac7ac..4f3f613f71542 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -644,18 +644,16 @@ def close(self): @td.skip_if_no("fsspec", min_version="2023.1.0") -def test_read_csv_chained_url_no_error(): +@pytest.mark.parametrize("compression", [None, "infer"]) +def test_read_csv_chained_url_no_error(compression): # GH 60100 tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" chained_file_url = f"tar://test.csv::file://{tar_file_path}" - result_a = pd.read_csv(chained_file_url, compression=None, sep=";") - result_b = pd.read_csv(chained_file_url, compression="infer", sep=";") - + result = pd.read_csv(chained_file_url, compression=compression, sep=";") expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}}) - tm.assert_frame_equal(expected, result_a) - tm.assert_frame_equal(expected, result_b) + tm.assert_frame_equal(expected, result) @pytest.mark.parametrize(