Skip to content

Commit 602040f

Browse files
Merge remote-tracking branch 'upstream/master' into read_sql_doc_examples
2 parents 46ffa21 + dc4eaf3 commit 602040f

File tree

19 files changed

+432
-38
lines changed

19 files changed

+432
-38
lines changed

.github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
- name: Testing docstring validation script
6565
run: |
6666
source activate pandas-dev
67-
pytest --capture=no --strict scripts
67+
pytest --capture=no --strict-markers scripts
6868
if: always()
6969

7070
- name: Running benchmarks

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then
2020
XVFB="xvfb-run "
2121
fi
2222

23-
PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
23+
PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
2424

2525
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2626
# GH#37455 windows py38 build appears to be running out of memory

doc/source/user_guide/indexing.rst

+2
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ NA values in a boolean array propagate as ``False``:
380380

381381
.. versionchanged:: 1.0.2
382382

383+
.. ipython:: python
384+
383385
mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
384386
mask
385387
df1[mask]

doc/source/user_guide/io.rst

+14
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,20 @@ functions - the following example shows reading a CSV file:
16271627
16281628
df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t")
16291629
1630+
.. versionadded:: 1.3.0
1631+
1632+
A custom header can be sent alongside HTTP(s) requests by passing a dictionary
1633+
of header key-value mappings to the ``storage_options`` keyword argument as shown below:
1634+
1635+
.. code-block:: python
1636+
1637+
headers = {"User-Agent": "pandas"}
1638+
df = pd.read_csv(
1639+
"https://download.bls.gov/pub/time.series/cu/cu.item",
1640+
sep="\t",
1641+
storage_options=headers
1642+
)
1643+
16301644
All URLs which are not local files or HTTP(s) are handled by
16311645
`fsspec`_, if installed, and its various filesystem implementations
16321646
(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).

doc/source/whatsnew/v1.3.0.rst

+21-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_130.read_csv_json_http_headers:
17+
18+
Custom HTTP(s) headers when reading csv or json files
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
When reading from a remote URL that is not handled by fsspec (i.e. HTTP and
22+
HTTPS) the dictionary passed to ``storage_options`` will be used to create the
23+
headers included in the request. This can be used to control the User-Agent
24+
header or send other custom headers (:issue:`36688`).
25+
For example:
26+
27+
.. ipython:: python
28+
29+
headers = {"User-Agent": "pandas"}
30+
df = pd.read_csv(
31+
"https://download.bls.gov/pub/time.series/cu/cu.item",
32+
sep="\t",
33+
storage_options=headers
34+
)
35+
1636
1737
.. _whatsnew_130.enhancements.other:
1838

@@ -211,7 +231,7 @@ Missing
211231
MultiIndex
212232
^^^^^^^^^^
213233

214-
-
234+
- Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
215235
-
216236

217237
I/O

pandas/_testing.py

+2
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@
108108
+ BYTES_DTYPES
109109
)
110110

111+
NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA]
112+
111113

112114
# set testing_mode
113115
_testing_mode_warnings = (DeprecationWarning, ResourceWarning)

pandas/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ def nselect_method(request):
266266
# ----------------------------------------------------------------
267267
# Missing values & co.
268268
# ----------------------------------------------------------------
269-
@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), pd.NA], ids=str)
269+
@pytest.fixture(params=tm.NULL_OBJECTS, ids=str)
270270
def nulls_fixture(request):
271271
"""
272272
Fixture for each null type in pandas.

pandas/core/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -4182,6 +4182,10 @@ def _drop_axis(
41824182
# GH 18561 MultiIndex.drop should raise if label is absent
41834183
if errors == "raise" and indexer.all():
41844184
raise KeyError(f"{labels} not found in axis")
4185+
elif isinstance(axis, MultiIndex) and labels.dtype == "object":
4186+
# Set level to zero in case of MultiIndex and label is string,
4187+
# because isin can't handle strings for MultiIndexes GH#36293
4188+
indexer = ~axis.get_level_values(0).isin(labels)
41854189
else:
41864190
indexer = ~axis.isin(labels)
41874191
# Check if label doesn't exist along axis

pandas/core/shared_docs.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,7 @@
383383
"storage_options"
384384
] = """storage_options : dict, optional
385385
Extra options that make sense for a particular storage connection, e.g.
386-
host, port, username, password, etc., if using a URL that will
387-
be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
388-
will be raised if providing this argument with a non-fsspec URL.
389-
See the fsspec and backend storage implementation docs for the set of
390-
allowed keys and values."""
386+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
387+
are forwarded to ``urllib`` as header options. For other URLs (e.g.
388+
starting with "s3://", and "gcs://") the key-value pairs are forwarded to
389+
``fsspec``. Please see ``fsspec`` and ``urllib`` for more details."""

pandas/io/common.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -280,12 +280,18 @@ def _get_filepath_or_buffer(
280280
fsspec_mode += "b"
281281

282282
if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
283-
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
284-
if storage_options:
285-
raise ValueError(
286-
"storage_options passed with file object or non-fsspec file path"
287-
)
288-
req = urlopen(filepath_or_buffer)
283+
# TODO: fsspec can also handle HTTP via requests, but leaving this
284+
# unchanged. using fsspec appears to break the ability to infer if the
285+
# server responded with gzipped data
286+
storage_options = storage_options or {}
287+
288+
# waiting until now for importing to match intended lazy logic of
289+
# urlopen function defined elsewhere in this module
290+
import urllib.request
291+
292+
# assuming storage_options is to be interpreted as headers
293+
req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
294+
req = urlopen(req_info)
289295
content_encoding = req.headers.get("Content-Encoding", None)
290296
if content_encoding == "gzip":
291297
# Override compression based on Content-Encoding header

pandas/io/parquet.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
from pandas import DataFrame, MultiIndex, get_option
1515
from pandas.core import generic
1616

17-
from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
17+
from pandas.io.common import (
18+
IOHandles,
19+
get_handle,
20+
is_fsspec_url,
21+
is_url,
22+
stringify_path,
23+
)
1824

1925

2026
def get_engine(engine: str) -> "BaseImpl":
@@ -66,8 +72,10 @@ def _get_path_or_handle(
6672
fs, path_or_handle = fsspec.core.url_to_fs(
6773
path_or_handle, **(storage_options or {})
6874
)
69-
elif storage_options:
70-
raise ValueError("storage_options passed with buffer or non-fsspec filepath")
75+
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
76+
# can't write to a remote url
77+
# without making use of fsspec at the moment
78+
raise ValueError("storage_options passed with buffer, or non-supported URL")
7179

7280
handles = None
7381
if (
@@ -79,7 +87,9 @@ def _get_path_or_handle(
7987
# use get_handle only when we are very certain that it is not a directory
8088
# fsspec resources can also point to directories
8189
# this branch is used for example when reading from non-fsspec URLs
82-
handles = get_handle(path_or_handle, mode, is_text=False)
90+
handles = get_handle(
91+
path_or_handle, mode, is_text=False, storage_options=storage_options
92+
)
8393
fs = None
8494
path_or_handle = handles.handle
8595
return path_or_handle, handles, fs
@@ -307,7 +317,9 @@ def read(
307317
# use get_handle only when we are very certain that it is not a directory
308318
# fsspec resources can also point to directories
309319
# this branch is used for example when reading from non-fsspec URLs
310-
handles = get_handle(path, "rb", is_text=False)
320+
handles = get_handle(
321+
path, "rb", is_text=False, storage_options=storage_options
322+
)
311323
path = handles.handle
312324
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
313325

@@ -404,10 +416,12 @@ def to_parquet(
404416
return None
405417

406418

419+
@doc(storage_options=generic._shared_docs["storage_options"])
407420
def read_parquet(
408421
path,
409422
engine: str = "auto",
410423
columns=None,
424+
storage_options: StorageOptions = None,
411425
use_nullable_dtypes: bool = False,
412426
**kwargs,
413427
):
@@ -432,13 +446,18 @@ def read_parquet(
432446
By file-like object, we refer to objects with a ``read()`` method,
433447
such as a file handle (e.g. via builtin ``open`` function)
434448
or ``StringIO``.
435-
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
449+
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
436450
Parquet library to use. If 'auto', then the option
437451
``io.parquet.engine`` is used. The default ``io.parquet.engine``
438452
behavior is to try 'pyarrow', falling back to 'fastparquet' if
439453
'pyarrow' is unavailable.
440454
columns : list, default=None
441455
If not None, only these columns will be read from the file.
456+
457+
{storage_options}
458+
459+
.. versionadded:: 1.3.0
460+
442461
use_nullable_dtypes : bool, default False
443462
If True, use dtypes that use ``pd.NA`` as missing value indicator
444463
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
@@ -448,6 +467,7 @@ def read_parquet(
448467
support dtypes) may change without notice.
449468
450469
.. versionadded:: 1.2.0
470+
451471
**kwargs
452472
Any additional kwargs are passed to the engine.
453473
@@ -456,6 +476,11 @@ def read_parquet(
456476
DataFrame
457477
"""
458478
impl = get_engine(engine)
479+
459480
return impl.read(
460-
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
481+
path,
482+
columns=columns,
483+
storage_options=storage_options,
484+
use_nullable_dtypes=use_nullable_dtypes,
485+
**kwargs,
461486
)

pandas/tests/extension/arrow/test_bool.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ def test_view(self, data):
5151
data.view()
5252

5353
@pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet")
54-
def test_contains(self, data, data_missing, nulls_fixture):
55-
super().test_contains(data, data_missing, nulls_fixture)
54+
def test_contains(self, data, data_missing):
55+
super().test_contains(data, data_missing)
5656

5757

5858
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):

pandas/tests/extension/base/interface.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def test_can_hold_na_valid(self, data):
2929
# GH-20761
3030
assert data._can_hold_na is True
3131

32-
def test_contains(self, data, data_missing, nulls_fixture):
32+
def test_contains(self, data, data_missing):
3333
# GH-37867
3434
# Tests for membership checks. Membership checks for nan-likes is tricky and
3535
# the settled on rule is: `nan_like in arr` is True if nan_like is
@@ -47,10 +47,12 @@ def test_contains(self, data, data_missing, nulls_fixture):
4747
assert na_value in data_missing
4848
assert na_value not in data
4949

50-
if nulls_fixture is not na_value:
51-
# the data can never contain other nan-likes than na_value
52-
assert nulls_fixture not in data
53-
assert nulls_fixture not in data_missing
50+
# the data can never contain other nan-likes than na_value
51+
for na_value_obj in tm.NULL_OBJECTS:
52+
if na_value_obj is na_value:
53+
continue
54+
assert na_value_obj not in data
55+
assert na_value_obj not in data_missing
5456

5557
def test_memory_usage(self, data):
5658
s = pd.Series(data)

pandas/tests/extension/test_categorical.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def test_memory_usage(self, data):
8787
# Is this deliberate?
8888
super().test_memory_usage(data)
8989

90-
def test_contains(self, data, data_missing, nulls_fixture):
90+
def test_contains(self, data, data_missing):
9191
# GH-37867
9292
# na value handling in Categorical.__contains__ is deprecated.
9393
# See base.BaseInterFaceTests.test_contains for more details.
@@ -105,9 +105,11 @@ def test_contains(self, data, data_missing, nulls_fixture):
105105
assert na_value not in data
106106

107107
# Categoricals can contain other nan-likes than na_value
108-
if nulls_fixture is not na_value:
109-
assert nulls_fixture not in data
110-
assert nulls_fixture in data_missing # this line differs from super method
108+
for na_value_obj in tm.NULL_OBJECTS:
109+
if na_value_obj is na_value:
110+
continue
111+
assert na_value_obj not in data
112+
assert na_value_obj in data_missing # this line differs from super method
111113

112114

113115
class TestConstructors(base.BaseConstructorsTests):

pandas/tests/frame/methods/test_drop.py

+8
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,11 @@ def test_inplace_drop_and_operation(self, operation, inplace):
441441
# Perform operation and check result
442442
getattr(y, operation)(1)
443443
tm.assert_frame_equal(df, expected)
444+
445+
def test_drop_with_non_unique_multiindex(self):
446+
# GH#36293
447+
mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]])
448+
df = DataFrame([1, 2, 3], index=mi)
449+
result = df.drop(index="x")
450+
expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]]))
451+
tm.assert_frame_equal(result, expected)

pandas/tests/io/conftest.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,7 @@ def s3_base(worker_id):
5050
pytest.importorskip("s3fs")
5151
pytest.importorskip("boto3")
5252
requests = pytest.importorskip("requests")
53-
# GH 38090: Suppress http logs in tests by moto_server
54-
logging.getLogger("werkzeug").disabled = True
53+
logging.getLogger("requests").disabled = True
5554

5655
with tm.ensure_safe_environment_variables():
5756
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
@@ -71,7 +70,9 @@ def s3_base(worker_id):
7170

7271
# pipe to null to avoid logging in terminal
7372
proc = subprocess.Popen(
74-
shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL
73+
shlex.split(f"moto_server s3 -p {endpoint_port}"),
74+
stdout=subprocess.DEVNULL,
75+
stderr=subprocess.DEVNULL,
7576
)
7677

7778
timeout = 5

0 commit comments

Comments
 (0)