merge with master

arw2019 · arw2019 · commit 1d0ba610fccc · 2020-07-08T17:43:58.000Z
diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst
@@ -159,3 +159,18 @@ For example:
 
     # wrong
     from common import test_base
+
+
+Miscellaneous
+=============
+
+Reading from a url
+------------------
+
+**Good:**
+
+.. code-block:: python
+
+    from pandas.io.common import urlopen
+    with urlopen('http://www.google.com') as url:
+        raw_text = url.read()
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -280,6 +280,7 @@ Other enhancements
 - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
 - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
 - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
+- Add :meth:`ExtensionArray.argmax` and :meth:`ExtensionArray.argmin` (:issue:`24382`)
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
 - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
 - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
@@ -321,7 +322,7 @@ Other enhancements
 - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`)
 - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
   combining a nullable integer column with a numpy integer column will no longer
-  result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
+  result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`).
 - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
 - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
 - :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`).
@@ -1124,6 +1125,7 @@ Sparse
 - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
 - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s  string representation (:issue:`34352`)
 - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
+- Bug where :class:`arrays.SparseArray` returned the incorrect type when indexing a sparse :class:`DataFrame` with an iterable (:issue:`34526`, :issue:`34540`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -291,16 +291,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]
                 res_index = res_index.take(successes)
 
         else:
-            for i, v in series_gen_enumeration:
-
-                with option_context("mode.chained_assignment", None):
+            with option_context("mode.chained_assignment", None):
+                for i, v in series_gen_enumeration:
                     # ignore SettingWithCopy here in case the user mutates
                     results[i] = self.f(v)
-
-                if isinstance(results[i], ABCSeries):
-                    # If we have a view on v, we need to make a copy because
-                    #  series_generator will swap out the underlying data
-                    results[i] = results[i].copy(deep=False)
+                    if isinstance(results[i], ABCSeries):
+                        # If we have a view on v, we need to make a copy because
+                        #  series_generator will swap out the underlying data
+                        results[i] = results[i].copy(deep=False)
 
         return results, res_index
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -28,7 +28,7 @@
 from pandas.core import ops
 from pandas.core.algorithms import _factorize_array, unique
 from pandas.core.missing import backfill_1d, pad_1d
-from pandas.core.sorting import nargsort
+from pandas.core.sorting import nargminmax, nargsort
 
 _extension_array_shared_docs: Dict[str, str] = dict()
 
@@ -533,6 +533,40 @@ def argsort(
         result = nargsort(self, kind=kind, ascending=ascending, na_position="last")
         return result
 
+    def argmin(self):
+        """
+        Return the index of minimum value.
+
+        In case of multiple occurrences of the minimum value, the index
+        corresponding to the first occurrence is returned.
+
+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        ExtensionArray.argmax
+        """
+        return nargminmax(self, "argmin")
+
+    def argmax(self):
+        """
+        Return the index of maximum value.
+
+        In case of multiple occurrences of the maximum value, the index
+        corresponding to the first occurrence is returned.
+
+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        ExtensionArray.argmin
+        """
+        return nargminmax(self, "argmax")
+
     def fillna(self, value=None, method=None, limit=None):
         """
         Fill NA/NaN values using the specified method.
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -92,10 +92,13 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
         return IntegerArray
 
     def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
-        # for now only handle other integer types
+        # we only handle nullable EA dtypes and numeric numpy dtypes
         if not all(
-            isinstance(t, _IntegerDtype)
-            or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer))
+            isinstance(t, BaseMaskedDtype)
+            or (
+                isinstance(t, np.dtype)
+                and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_))
+            )
             for t in dtypes
         ):
             return None
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -866,11 +866,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
 
         if self.sp_index.npoints == 0:
             # Avoid taking from the empty self.sp_values
-            taken = np.full(
-                sp_indexer.shape,
-                fill_value=fill_value,
-                dtype=np.result_type(type(fill_value)),
-            )
+            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
+            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
         else:
             taken = self.sp_values.take(sp_indexer)
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1165,6 +1165,10 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False):
                     if len(key) == labels.nlevels:
                         return {"key": key}
                     raise
+            except InvalidIndexError:
+                # GH35015, using datetime as column indices raises exception
+                if not isinstance(labels, ABCMultiIndex):
+                    raise
             except TypeError:
                 pass
             except ValueError:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -319,6 +319,33 @@ def nargsort(
     return indexer
 
 
+def nargminmax(values, method: str):
+    """
+    Implementation of np.argmin/argmax but for ExtensionArray and which
+    handles missing values.
+
+    Parameters
+    ----------
+    values : ExtensionArray
+    method : {"argmax", "argmin"}
+
+    Returns
+    -------
+    int
+    """
+    assert method in {"argmax", "argmin"}
+    func = np.argmax if method == "argmax" else np.argmin
+
+    mask = np.asarray(isna(values))
+    values = values._values_for_argsort()
+
+    idx = np.arange(len(values))
+    non_nans = values[~mask]
+    non_nan_idx = idx[~mask]
+
+    return non_nan_idx[func(non_nans)]
+
+
 def ensure_key_mapped_multiindex(index, key: Callable, level=None):
     """
     Returns a new MultiIndex in which key has been applied
diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pytest
 
 import pandas as pd
@@ -15,12 +16,52 @@
         (["Int32", "UInt32"], "Int64"),
         # this still gives object (awaiting float extension dtype)
         (["Int64", "UInt64"], "object"),
+        (["Int64", "boolean"], "Int64"),
+        (["UInt8", "boolean"], "UInt8"),
     ],
 )
 def test_concat_series(to_concat_dtypes, result_dtype):
 
-    result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
-    expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
+    result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes])
+    expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
         result_dtype
     )
     tm.assert_series_equal(result, expected)
+
+    # order doesn't matter for result
+    result = pd.concat(
+        [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]]
+    )
+    expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
+        result_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "to_concat_dtypes, result_dtype",
+    [
+        (["Int64", "int64"], "Int64"),
+        (["UInt64", "uint64"], "UInt64"),
+        (["Int8", "int8"], "Int8"),
+        (["Int8", "int16"], "Int16"),
+        (["UInt8", "int8"], "Int16"),
+        (["Int32", "uint32"], "Int64"),
+        # this still gives object (awaiting float extension dtype)
+        (["Int64", "uint64"], "object"),
+        (["Int64", "bool"], "Int64"),
+        (["UInt8", "bool"], "UInt8"),
+    ],
+)
+def test_concat_series_with_numpy(to_concat_dtypes, result_dtype):
+
+    s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0])
+    s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1]))
+    result = pd.concat([s1, s2], ignore_index=True)
+    expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # order doesn't matter for result
+    result = pd.concat([s2, s1], ignore_index=True)
+    expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -75,6 +75,42 @@ def test_argsort_missing(self, data_missing_for_sorting):
         expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
         self.assert_series_equal(result, expected)
 
+    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
+        # GH 24382
+
+        # data_for_sorting -> [B, C, A] with A < B < C
+        assert data_for_sorting.argmax() == 1
+        assert data_for_sorting.argmin() == 2
+
+        # with repeated values -> first occurrence
+        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
+        assert data.argmax() == 3
+        assert data.argmin() == 0
+
+        # with missing values
+        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+        assert data_missing_for_sorting.argmax() == 0
+        assert data_missing_for_sorting.argmin() == 2
+
+    @pytest.mark.parametrize(
+        "method", ["argmax", "argmin"],
+    )
+    def test_argmin_argmax_empty_array(self, method, data):
+        # GH 24382
+        err_msg = "attempt to get"
+        with pytest.raises(ValueError, match=err_msg):
+            getattr(data[:0], method)()
+
+    @pytest.mark.parametrize(
+        "method", ["argmax", "argmin"],
+    )
+    def test_argmin_argmax_all_na(self, method, data, na_value):
+        # all missing with skipna=True is the same as empty
+        err_msg = "attempt to get"
+        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
+        with pytest.raises(ValueError, match=err_msg):
+            getattr(data_na, method)()
+
     @pytest.mark.parametrize(
         "na_position, expected",
         [
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -235,6 +235,23 @@ def test_searchsorted(self, data_for_sorting, as_series):
     def test_value_counts(self, all_data, dropna):
         return super().test_value_counts(all_data, dropna)
 
+    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
+        # override because there are only 2 unique values
+
+        # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False
+        assert data_for_sorting.argmax() == 0
+        assert data_for_sorting.argmin() == 2
+
+        # with repeated values -> first occurrence
+        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
+        assert data.argmax() == 1
+        assert data.argmin() == 0
+
+        # with missing values
+        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+        assert data_missing_for_sorting.argmax() == 0
+        assert data_missing_for_sorting.argmin() == 2
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -321,6 +321,14 @@ def test_shift_0_periods(self, data):
         data._sparse_values[0] = data._sparse_values[1]
         assert result._sparse_values[0] != result._sparse_values[1]
 
+    @pytest.mark.parametrize(
+        "method", ["argmax", "argmin"],
+    )
+    def test_argmin_argmax_all_na(self, method, data, na_value):
+        # overriding because Sparse[int64, 0] cannot handle na_value
+        self._check_unsupported(data)
+        super().test_argmin_argmax_all_na(method, data, na_value)
+
     @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
     def test_equals(self, data, na_value, as_series, box):
         self._check_unsupported(data)
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -21,7 +21,6 @@
     notna,
 )
 import pandas._testing as tm
-from pandas.arrays import SparseArray
 import pandas.core.common as com
 from pandas.core.indexing import IndexingError
 
@@ -1907,20 +1906,6 @@ def test_getitem_ix_float_duplicates(self):
         expect = df.iloc[[1, -1], 0]
         tm.assert_series_equal(df.loc[0.2, "a"], expect)
 
-    def test_getitem_sparse_column(self):
-        # https://github.com/pandas-dev/pandas/issues/23559
-        data = SparseArray([0, 1])
-        df = pd.DataFrame({"A": data})
-        expected = pd.Series(data, name="A")
-        result = df["A"]
-        tm.assert_series_equal(result, expected)
-
-        result = df.iloc[:, 0]
-        tm.assert_series_equal(result, expected)
-
-        result = df.loc[:, "A"]
-        tm.assert_series_equal(result, expected)
-
     def test_setitem_with_unaligned_tz_aware_datetime_column(self):
         # GH 12981
         # Assignment of unaligned offset-aware datetime series.
diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
diff --git a/pandas/tests/indexing/multiindex/test_datetime.py b/pandas/tests/indexing/multiindex/test_datetime.py