
Commit f2b5529

BUG: Fix loading files from S3 with # characters in URL (GH25945)
This fixes loading files with URLs such as s3://bucket/key#1.csv. Everything from the # onward was being dropped because it was parsed as a URL fragment. The fix disables URL fragment parsing, which has no meaning for S3 URLs.
1 parent 70773d9 commit f2b5529
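
To make the failure mode concrete: with default URL parsing, everything after the first # is split off as the fragment, so rebuilding the location from netloc + path silently truncates the S3 key. A minimal sketch of the broken behaviour, assuming urllib.parse.urlparse matches the parse_url helper used in pandas/io/s3.py:

# Sketch of the bug: '#' in an S3 key is treated as a URL fragment delimiter.
from urllib.parse import urlparse

url = "s3://bucket/key#1.csv"
result = urlparse(url)

# The key is silently truncated; '1.csv' ends up in the fragment component.
print(result.netloc + result.path)  # -> bucket/key
print(result.fragment)              # -> 1.csv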

File tree

3 files changed: +8 -1 lines changed


doc/source/whatsnew/v0.25.0.rst (+1)
@@ -356,6 +356,7 @@ I/O
 - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
+- Fixed bug in loading objects from S3 that contain # characters in the URL (:issue:`25945`)

 Plotting
 ^^^^^^^^

pandas/io/s3.py (+1 -1)
@@ -10,7 +10,7 @@

 def _strip_schema(url):
     """Returns the url without the s3:// part"""
-    result = parse_url(url)
+    result = parse_url(url, allow_fragments=False)
     return result.netloc + result.path


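For quick reference, a standalone restatement of the patched helper. This is a sketch rather than the pandas source: urllib.parse.urlparse stands in for the parse_url alias imported in pandas/io/s3.py, which is assumed to be equivalent. Since S3 object keys may legitimately contain #, and fragments have no meaning for S3 URLs, disabling fragment splitting is safe here.

# Standalone sketch of the fixed helper (mirrors the diff above).
from urllib.parse import urlparse


def _strip_schema(url):
    """Returns the url without the s3:// part"""
    # allow_fragments=False keeps '#' and everything after it in the key
    result = urlparse(url, allow_fragments=False)
    return result.netloc + result.path


print(_strip_schema("s3://bucket/key#1.csv"))  # -> bucket/key#1.csv
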
pandas/tests/io/test_s3.py (+6)
@@ -5,6 +5,7 @@
 from pandas import read_csv

 from pandas.io.common import is_s3_url
+from pandas.io.s3 import _strip_schema


 class TestS3URL(object):
@@ -27,3 +28,8 @@ def test_streaming_s3_objects():
     for el in data:
         body = StreamingBody(BytesIO(el), content_length=len(el))
         read_csv(body)
+
+
+def test_parse_s3_url_with_pound_sign():
+    # GH25945
+    assert _strip_schema('s3://bucket/key#1.csv') == 'bucket/key#1.csv'
