Skip to content

Commit 602040f

Browse files
Merge remote-tracking branch 'upstream/master' into read_sql_doc_examples
2 parents 46ffa21 + dc4eaf3 commit 602040f

File tree

19 files changed

+432
-38
lines changed

19 files changed

+432
-38
lines changed

.github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
- name: Testing docstring validation script
6565
run: |
6666
source activate pandas-dev
67-
pytest --capture=no --strict scripts
67+
pytest --capture=no --strict-markers scripts
6868
if: always()
6969

7070
- name: Running benchmarks

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then
2020
XVFB="xvfb-run "
2121
fi
2222

23-
PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
23+
PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
2424

2525
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2626
# GH#37455 windows py38 build appears to be running out of memory

doc/source/user_guide/indexing.rst

+2
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ NA values in a boolean array propagate as ``False``:
380380

381381
.. versionchanged:: 1.0.2
382382

383+
.. ipython:: python
384+
383385
mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
384386
mask
385387
df1[mask]

doc/source/user_guide/io.rst

+14
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,20 @@ functions - the following example shows reading a CSV file:
16271627
16281628
df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t")
16291629
1630+
.. versionadded:: 1.3.0
1631+
1632+
A custom header can be sent alongside HTTP(s) requests by passing a dictionary
1633+
of header key-value mappings to the ``storage_options`` keyword argument as shown below:
1634+
1635+
.. code-block:: python
1636+
1637+
headers = {"User-Agent": "pandas"}
1638+
df = pd.read_csv(
1639+
"https://download.bls.gov/pub/time.series/cu/cu.item",
1640+
sep="\t",
1641+
storage_options=headers
1642+
)
1643+
16301644
All URLs which are not local files or HTTP(s) are handled by
16311645
`fsspec`_, if installed, and its various filesystem implementations
16321646
(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).

doc/source/whatsnew/v1.3.0.rst

+21-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_130.read_csv_json_http_headers:
17+
18+
Custom HTTP(s) headers when reading csv or json files
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
When reading from a remote URL that is not handled by fsspec (i.e. HTTP and
22+
HTTPS) the dictionary passed to ``storage_options`` will be used to create the
23+
headers included in the request. This can be used to control the User-Agent
24+
header or send other custom headers (:issue:`36688`).
25+
For example:
26+
27+
.. ipython:: python
28+
29+
headers = {"User-Agent": "pandas"}
30+
df = pd.read_csv(
31+
"https://download.bls.gov/pub/time.series/cu/cu.item",
32+
sep="\t",
33+
storage_options=headers
34+
)
35+
1636
1737
.. _whatsnew_130.enhancements.other:
1838

@@ -211,7 +231,7 @@ Missing
211231
MultiIndex
212232
^^^^^^^^^^
213233

214-
-
234+
- Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
215235
-
216236

217237
I/O

pandas/_testing.py

+2
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@
108108
+ BYTES_DTYPES
109109
)
110110

111+
NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA]
112+
111113

112114
# set testing_mode
113115
_testing_mode_warnings = (DeprecationWarning, ResourceWarning)

pandas/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ def nselect_method(request):
266266
# ----------------------------------------------------------------
267267
# Missing values & co.
268268
# ----------------------------------------------------------------
269-
@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), pd.NA], ids=str)
269+
@pytest.fixture(params=tm.NULL_OBJECTS, ids=str)
270270
def nulls_fixture(request):
271271
"""
272272
Fixture for each null type in pandas.

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -4182,6 +4182,10 @@ def _drop_axis(
41824182
# GH 18561 MultiIndex.drop should raise if label is absent
41834183
if errors == "raise" and indexer.all():
41844184
raise KeyError(f"{labels} not found in axis")
4185+
elif isinstance(axis, MultiIndex) and labels.dtype == "object":
4186+
# Set level to zero in case of MultiIndex and label is string,
4187+
# because isin can't handle strings for MultiIndexes GH#36293
4188+
indexer = ~axis.get_level_values(0).isin(labels)
41854189
else:
41864190
indexer = ~axis.isin(labels)
41874191
# Check if label doesn't exist along axis

pandas/core/shared_docs.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,7 @@
383383
"storage_options"
384384
] = """storage_options : dict, optional
385385
Extra options that make sense for a particular storage connection, e.g.
386-
host, port, username, password, etc., if using a URL that will
387-
be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
388-
will be raised if providing this argument with a non-fsspec URL.
389-
See the fsspec and backend storage implementation docs for the set of
390-
allowed keys and values."""
386+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
387+
are forwarded to ``urllib`` as header options. For other URLs (e.g.
388+
starting with "s3://", and "gcs://") the key-value pairs are forwarded to
389+
``fsspec``. Please see ``fsspec`` and ``urllib`` for more details."""

pandas/io/common.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -280,12 +280,18 @@ def _get_filepath_or_buffer(
280280
fsspec_mode += "b"
281281

282282
if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
283-
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
284-
if storage_options:
285-
raise ValueError(
286-
"storage_options passed with file object or non-fsspec file path"
287-
)
288-
req = urlopen(filepath_or_buffer)
283+
# TODO: fsspec can also handle HTTP via requests, but leaving this
284+
# unchanged. using fsspec appears to break the ability to infer if the
285+
# server responded with gzipped data
286+
storage_options = storage_options or {}
287+
288+
# waiting until now for importing to match intended lazy logic of
289+
# urlopen function defined elsewhere in this module
290+
import urllib.request
291+
292+
# assuming storage_options is to be interpreted as headers
293+
req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
294+
req = urlopen(req_info)
289295
content_encoding = req.headers.get("Content-Encoding", None)
290296
if content_encoding == "gzip":
291297
# Override compression based on Content-Encoding header

pandas/io/parquet.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
from pandas import DataFrame, MultiIndex, get_option
1515
from pandas.core import generic
1616

17-
from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
17+
from pandas.io.common import (
18+
IOHandles,
19+
get_handle,
20+
is_fsspec_url,
21+
is_url,
22+
stringify_path,
23+
)
1824

1925

2026
def get_engine(engine: str) -> "BaseImpl":
@@ -66,8 +72,10 @@ def _get_path_or_handle(
6672
fs, path_or_handle = fsspec.core.url_to_fs(
6773
path_or_handle, **(storage_options or {})
6874
)
69-
elif storage_options:
70-
raise ValueError("storage_options passed with buffer or non-fsspec filepath")
75+
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
76+
# can't write to a remote url
77+
# without making use of fsspec at the moment
78+
raise ValueError("storage_options passed with buffer, or non-supported URL")
7179

7280
handles = None
7381
if (
@@ -79,7 +87,9 @@ def _get_path_or_handle(
7987
# use get_handle only when we are very certain that it is not a directory
8088
# fsspec resources can also point to directories
8189
# this branch is used for example when reading from non-fsspec URLs
82-
handles = get_handle(path_or_handle, mode, is_text=False)
90+
handles = get_handle(
91+
path_or_handle, mode, is_text=False, storage_options=storage_options
92+
)
8393
fs = None
8494
path_or_handle = handles.handle
8595
return path_or_handle, handles, fs
@@ -307,7 +317,9 @@ def read(
307317
# use get_handle only when we are very certain that it is not a directory
308318
# fsspec resources can also point to directories
309319
# this branch is used for example when reading from non-fsspec URLs
310-
handles = get_handle(path, "rb", is_text=False)
320+
handles = get_handle(
321+
path, "rb", is_text=False, storage_options=storage_options
322+
)
311323
path = handles.handle
312324
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
313325

@@ -404,10 +416,12 @@ def to_parquet(
404416
return None
405417

406418

419+
@doc(storage_options=generic._shared_docs["storage_options"])
407420
def read_parquet(
408421
path,
409422
engine: str = "auto",
410423
columns=None,
424+
storage_options: StorageOptions = None,
411425
use_nullable_dtypes: bool = False,
412426
**kwargs,
413427
):
@@ -432,13 +446,18 @@ def read_parquet(
432446
By file-like object, we refer to objects with a ``read()`` method,
433447
such as a file handle (e.g. via builtin ``open`` function)
434448
or ``StringIO``.
435-
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
449+
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
436450
Parquet library to use. If 'auto', then the option
437451
``io.parquet.engine`` is used. The default ``io.parquet.engine``
438452
behavior is to try 'pyarrow', falling back to 'fastparquet' if
439453
'pyarrow' is unavailable.
440454
columns : list, default=None
441455
If not None, only these columns will be read from the file.
456+
457+
{storage_options}
458+
459+
.. versionadded:: 1.3.0
460+
442461
use_nullable_dtypes : bool, default False
443462
If True, use dtypes that use ``pd.NA`` as missing value indicator
444463
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
@@ -448,6 +467,7 @@ def read_parquet(
448467
support dtypes) may change without notice.
449468
450469
.. versionadded:: 1.2.0
470+
451471
**kwargs
452472
Any additional kwargs are passed to the engine.
453473
@@ -456,6 +476,11 @@ def read_parquet(
456476
DataFrame
457477
"""
458478
impl = get_engine(engine)
479+
459480
return impl.read(
460-
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
481+
path,
482+
columns=columns,
483+
storage_options=storage_options,
484+
use_nullable_dtypes=use_nullable_dtypes,
485+
**kwargs,
461486
)

pandas/tests/extension/arrow/test_bool.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ def test_view(self, data):
5151
data.view()
5252

5353
@pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet")
54-
def test_contains(self, data, data_missing, nulls_fixture):
55-
super().test_contains(data, data_missing, nulls_fixture)
54+
def test_contains(self, data, data_missing):
55+
super().test_contains(data, data_missing)
5656

5757

5858
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):

pandas/tests/extension/base/interface.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def test_can_hold_na_valid(self, data):
2929
# GH-20761
3030
assert data._can_hold_na is True
3131

32-
def test_contains(self, data, data_missing, nulls_fixture):
32+
def test_contains(self, data, data_missing):
3333
# GH-37867
3434
# Tests for membership checks. Membership checks for nan-likes is tricky and
3535
# the settled on rule is: `nan_like in arr` is True if nan_like is
@@ -47,10 +47,12 @@ def test_contains(self, data, data_missing, nulls_fixture):
4747
assert na_value in data_missing
4848
assert na_value not in data
4949

50-
if nulls_fixture is not na_value:
51-
# the data can never contain other nan-likes than na_value
52-
assert nulls_fixture not in data
53-
assert nulls_fixture not in data_missing
50+
# the data can never contain other nan-likes than na_value
51+
for na_value_obj in tm.NULL_OBJECTS:
52+
if na_value_obj is na_value:
53+
continue
54+
assert na_value_obj not in data
55+
assert na_value_obj not in data_missing
5456

5557
def test_memory_usage(self, data):
5658
s = pd.Series(data)

pandas/tests/extension/test_categorical.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def test_memory_usage(self, data):
8787
# Is this deliberate?
8888
super().test_memory_usage(data)
8989

90-
def test_contains(self, data, data_missing, nulls_fixture):
90+
def test_contains(self, data, data_missing):
9191
# GH-37867
9292
# na value handling in Categorical.__contains__ is deprecated.
9393
# See base.BaseInterFaceTests.test_contains for more details.
@@ -105,9 +105,11 @@ def test_contains(self, data, data_missing, nulls_fixture):
105105
assert na_value not in data
106106

107107
# Categoricals can contain other nan-likes than na_value
108-
if nulls_fixture is not na_value:
109-
assert nulls_fixture not in data
110-
assert nulls_fixture in data_missing # this line differs from super method
108+
for na_value_obj in tm.NULL_OBJECTS:
109+
if na_value_obj is na_value:
110+
continue
111+
assert na_value_obj not in data
112+
assert na_value_obj in data_missing # this line differs from super method
111113

112114

113115
class TestConstructors(base.BaseConstructorsTests):

pandas/tests/frame/methods/test_drop.py

+8
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,11 @@ def test_inplace_drop_and_operation(self, operation, inplace):
441441
# Perform operation and check result
442442
getattr(y, operation)(1)
443443
tm.assert_frame_equal(df, expected)
444+
445+
def test_drop_with_non_unique_multiindex(self):
446+
# GH#36293
447+
mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
448+
df = DataFrame([1, 2, 3], index=mi)
449+
result = df.drop(index="x")
450+
expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
451+
tm.assert_frame_equal(result, expected)

pandas/tests/io/conftest.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,7 @@ def s3_base(worker_id):
5050
pytest.importorskip("s3fs")
5151
pytest.importorskip("boto3")
5252
requests = pytest.importorskip("requests")
53-
# GH 38090: Suppress http logs in tests by moto_server
54-
logging.getLogger("werkzeug").disabled = True
53+
logging.getLogger("requests").disabled = True
5554

5655
with tm.ensure_safe_environment_variables():
5756
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
@@ -71,7 +70,9 @@ def s3_base(worker_id):
7170

7271
# pipe to null to avoid logging in terminal
7372
proc = subprocess.Popen(
74-
shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL
73+
shlex.split(f"moto_server s3 -p {endpoint_port}"),
74+
stdout=subprocess.DEVNULL,
75+
stderr=subprocess.DEVNULL,
7576
)
7677

7778
timeout = 5

0 commit comments

Comments
 (0)