Skip to content

Commit 2b847a5

Browse files
cdknoxluckyvs1
authored and committed
Read csv headers (pandas-dev#37966)
1 parent 780b769 commit 2b847a5

File tree

6 files changed

+391
-18
lines changed

6 files changed

+391
-18
lines changed

doc/source/user_guide/io.rst

+14
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,20 @@ functions - the following example shows reading a CSV file:
16271627
16281628
df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t")
16291629
1630+
.. versionadded:: 1.3.0
1631+
1632+
A custom header can be sent alongside HTTP(s) requests by passing a dictionary
1633+
of header key value mappings to the ``storage_options`` keyword argument as shown below:
1634+
1635+
.. code-block:: python
1636+
1637+
headers = {"User-Agent": "pandas"}
1638+
df = pd.read_csv(
1639+
"https://download.bls.gov/pub/time.series/cu/cu.item",
1640+
sep="\t",
1641+
storage_options=headers
1642+
)
1643+
16301644
All URLs which are not local files or HTTP(s) are handled by
16311645
`fsspec`_, if installed, and its various filesystem implementations
16321646
(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).

doc/source/whatsnew/v1.3.0.rst

+20
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_130.read_csv_json_http_headers:
17+
18+
Custom HTTP(s) headers when reading csv or json files
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
When reading from a remote URL that is not handled by fsspec (i.e. HTTP and
22+
HTTPS) the dictionary passed to ``storage_options`` will be used to create the
23+
headers included in the request. This can be used to control the User-Agent
24+
header or send other custom headers (:issue:`36688`).
25+
For example:
26+
27+
.. ipython:: python
28+
29+
headers = {"User-Agent": "pandas"}
30+
df = pd.read_csv(
31+
"https://download.bls.gov/pub/time.series/cu/cu.item",
32+
sep="\t",
33+
storage_options=headers
34+
)
35+
1636
1737
.. _whatsnew_130.enhancements.other:
1838

pandas/core/shared_docs.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,7 @@
383383
"storage_options"
384384
] = """storage_options : dict, optional
385385
Extra options that make sense for a particular storage connection, e.g.
386-
host, port, username, password, etc., if using a URL that will
387-
be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
388-
will be raised if providing this argument with a non-fsspec URL.
389-
See the fsspec and backend storage implementation docs for the set of
390-
allowed keys and values."""
386+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
387+
are forwarded to ``urllib`` as header options. For other URLs (e.g.
388+
starting with "s3://", and "gcs://") the key-value pairs are forwarded to
389+
``fsspec``. Please see ``fsspec`` and ``urllib`` for more details."""

pandas/io/common.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -280,12 +280,18 @@ def _get_filepath_or_buffer(
280280
fsspec_mode += "b"
281281

282282
if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
283-
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
284-
if storage_options:
285-
raise ValueError(
286-
"storage_options passed with file object or non-fsspec file path"
287-
)
288-
req = urlopen(filepath_or_buffer)
283+
# TODO: fsspec can also handle HTTP via requests, but leaving this
284+
# unchanged. using fsspec appears to break the ability to infer if the
285+
# server responded with gzipped data
286+
storage_options = storage_options or {}
287+
288+
# waiting until now for importing to match intended lazy logic of
289+
# urlopen function defined elsewhere in this module
290+
import urllib.request
291+
292+
# assuming storage_options is to be interpreted as headers
293+
req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
294+
req = urlopen(req_info)
289295
content_encoding = req.headers.get("Content-Encoding", None)
290296
if content_encoding == "gzip":
291297
# Override compression based on Content-Encoding header

pandas/io/parquet.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
from pandas import DataFrame, MultiIndex, get_option
1515
from pandas.core import generic
1616

17-
from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
17+
from pandas.io.common import (
18+
IOHandles,
19+
get_handle,
20+
is_fsspec_url,
21+
is_url,
22+
stringify_path,
23+
)
1824

1925

2026
def get_engine(engine: str) -> "BaseImpl":
@@ -66,8 +72,10 @@ def _get_path_or_handle(
6672
fs, path_or_handle = fsspec.core.url_to_fs(
6773
path_or_handle, **(storage_options or {})
6874
)
69-
elif storage_options:
70-
raise ValueError("storage_options passed with buffer or non-fsspec filepath")
75+
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
76+
# can't write to a remote url
77+
# without making use of fsspec at the moment
78+
raise ValueError("storage_options passed with buffer, or non-supported URL")
7179

7280
handles = None
7381
if (
@@ -79,7 +87,9 @@ def _get_path_or_handle(
7987
# use get_handle only when we are very certain that it is not a directory
8088
# fsspec resources can also point to directories
8189
# this branch is used for example when reading from non-fsspec URLs
82-
handles = get_handle(path_or_handle, mode, is_text=False)
90+
handles = get_handle(
91+
path_or_handle, mode, is_text=False, storage_options=storage_options
92+
)
8393
fs = None
8494
path_or_handle = handles.handle
8595
return path_or_handle, handles, fs
@@ -307,7 +317,9 @@ def read(
307317
# use get_handle only when we are very certain that it is not a directory
308318
# fsspec resources can also point to directories
309319
# this branch is used for example when reading from non-fsspec URLs
310-
handles = get_handle(path, "rb", is_text=False)
320+
handles = get_handle(
321+
path, "rb", is_text=False, storage_options=storage_options
322+
)
311323
path = handles.handle
312324
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
313325

@@ -404,10 +416,12 @@ def to_parquet(
404416
return None
405417

406418

419+
@doc(storage_options=generic._shared_docs["storage_options"])
407420
def read_parquet(
408421
path,
409422
engine: str = "auto",
410423
columns=None,
424+
storage_options: StorageOptions = None,
411425
use_nullable_dtypes: bool = False,
412426
**kwargs,
413427
):
@@ -432,13 +446,18 @@ def read_parquet(
432446
By file-like object, we refer to objects with a ``read()`` method,
433447
such as a file handle (e.g. via builtin ``open`` function)
434448
or ``StringIO``.
435-
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
449+
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
436450
Parquet library to use. If 'auto', then the option
437451
``io.parquet.engine`` is used. The default ``io.parquet.engine``
438452
behavior is to try 'pyarrow', falling back to 'fastparquet' if
439453
'pyarrow' is unavailable.
440454
columns : list, default=None
441455
If not None, only these columns will be read from the file.
456+
457+
{storage_options}
458+
459+
.. versionadded:: 1.3.0
460+
442461
use_nullable_dtypes : bool, default False
443462
If True, use dtypes that use ``pd.NA`` as missing value indicator
444463
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
@@ -448,6 +467,7 @@ def read_parquet(
448467
support dtypes) may change without notice.
449468
450469
.. versionadded:: 1.2.0
470+
451471
**kwargs
452472
Any additional kwargs are passed to the engine.
453473
@@ -456,6 +476,11 @@ def read_parquet(
456476
DataFrame
457477
"""
458478
impl = get_engine(engine)
479+
459480
return impl.read(
460-
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
481+
path,
482+
columns=columns,
483+
storage_options=storage_options,
484+
use_nullable_dtypes=use_nullable_dtypes,
485+
**kwargs,
461486
)

0 commit comments

Comments
 (0)