From 0007503fb37374232b3ca3c2f7d6197edbc76079 Mon Sep 17 00:00:00 2001
From: Scott Talbert
Date: Tue, 9 Apr 2019 11:00:49 -0400
Subject: [PATCH] BUG: Fix loading files from S3 with # characters in URL
 (GH25945)

This fixes loading files with URLs such as s3://bucket/key#1.csv. The
part from the # on was being lost because it was considered to be a URL
fragment. The fix disables URL fragment parsing, as it doesn't make
sense for S3 URLs.
---
 doc/source/whatsnew/v0.25.0.rst        | 1 +
 pandas/io/s3.py                        | 2 +-
 pandas/tests/io/conftest.py            | 1 +
 pandas/tests/io/parser/test_network.py | 5 +++++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 8dabaeb6c7bfe..34af5d1e3dbe5 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -356,6 +356,7 @@ I/O
 - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
+- Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/s3.py b/pandas/io/s3.py
index f127bb4c8094c..607eae27021c3 100644
--- a/pandas/io/s3.py
+++ b/pandas/io/s3.py
@@ -10,7 +10,7 @@
 
 def _strip_schema(url):
     """Returns the url without the s3:// part"""
-    result = parse_url(url)
+    result = parse_url(url, allow_fragments=False)
     return result.netloc + result.path
 
 
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index af6f7ac4ef528..a4e778a68c728 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -59,6 +59,7 @@ def s3_resource(tips_file, jsonl_file):
     moto = pytest.importorskip('moto')
 
     test_s3_files = [
+        ('tips#1.csv', tips_file),
         ('tips.csv', tips_file),
         ('tips.csv.gz', tips_file + '.gz'),
         ('tips.csv.bz2', tips_file + '.bz2'),
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index b36508d89d770..c8cace6118ad8 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -198,3 +198,8 @@ def test_read_csv_chunked_download(self, s3_resource, caplog):
         read_csv("s3://pandas-test/large-file.csv", nrows=5)
         # log of fetch_range (start, stop)
         assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
+
+    def test_read_s3_with_hash_in_key(self, tips_df):
+        # GH 25945
+        result = read_csv('s3://pandas-test/tips#1.csv')
+        tm.assert_frame_equal(tips_df, result)
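
Illustrative note (not part of the patch): a minimal sketch of the URL-parsing
behaviour the one-line change in pandas/io/s3.py relies on, assuming parse_url
is urllib's urlparse as the signature suggests; the bucket and key names below
are hypothetical.

    from urllib.parse import urlparse

    url = 's3://pandas-test/tips#1.csv'

    # Default parsing treats everything after '#' as a URL fragment,
    # so the S3 key is silently truncated to '/tips'.
    default = urlparse(url)
    print(default.netloc + default.path)   # pandas-test/tips

    # Disabling fragment parsing keeps the full key intact, which is
    # what _strip_schema() needs for keys containing '#'.
    fixed = urlparse(url, allow_fragments=False)
    print(fixed.netloc + fixed.path)       # pandas-test/tips#1.csv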