Merge remote-tracking branch 'upstream/master' into xs-fails-with-slice

arw2019 · arw2019 · commit 90d0f8bbce2e · 2020-08-04T01:17:24.000Z
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -1,9 +1,11 @@
 # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml
 trigger:
 - master
+- 1.1.x
 
 pr:
 - master
+- 1.1.x
 
 variables:
   PYTEST_WORKERS: auto
diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 
--
+- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`).
 -
 -
 
@@ -26,6 +26,13 @@ Fixed regressions
 Bug fixes
 ~~~~~~~~~
 
+
+Categorical
+^^^^^^^^^^^
+
+- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`)
+
+
 **Datetimelike**
 
 -
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
@@ -155,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray:
     result = np.zeros(n, dtype=np.uint8)
     for i in range(n):
         val = arr[i]
-        result[i] = checknull(val) or val == INF or val == NEGINF
+        result[i] = (
+            checknull(val)
+            or util.is_float_object(val) and (val == INF or val == NEGINF)
+        )
     return result.view(np.bool_)
 
 
diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx
@@ -211,49 +211,40 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None):
         int reso = RESO_DAY, curr_reso
         ndarray[int64_t] trans
         int64_t[:] deltas
-        Py_ssize_t[:] pos
-        int64_t local_val, delta
+        intp_t[:] pos
+        int64_t local_val, delta = NPY_NAT
+        bint use_utc = False, use_tzlocal = False, use_fixed = False
 
     if is_utc(tz) or tz is None:
-        for i in range(n):
-            if stamps[i] == NPY_NAT:
-                continue
-            dt64_to_dtstruct(stamps[i], &dts)
-            curr_reso = _reso_stamp(&dts)
-            if curr_reso < reso:
-                reso = curr_reso
+        use_utc = True
     elif is_tzlocal(tz):
-        for i in range(n):
-            if stamps[i] == NPY_NAT:
-                continue
-            local_val = tz_convert_utc_to_tzlocal(stamps[i], tz)
-            dt64_to_dtstruct(local_val, &dts)
-            curr_reso = _reso_stamp(&dts)
-            if curr_reso < reso:
-                reso = curr_reso
+        use_tzlocal = True
     else:
-        # Adjust datetime64 timestamp, recompute datetimestruct
         trans, deltas, typ = get_dst_info(tz)
-
         if typ not in ["pytz", "dateutil"]:
             # static/fixed; in this case we know that len(delta) == 1
+            use_fixed = True
             delta = deltas[0]
-            for i in range(n):
-                if stamps[i] == NPY_NAT:
-                    continue
-                dt64_to_dtstruct(stamps[i] + delta, &dts)
-                curr_reso = _reso_stamp(&dts)
-                if curr_reso < reso:
-                    reso = curr_reso
         else:
             pos = trans.searchsorted(stamps, side="right") - 1
-            for i in range(n):
-                if stamps[i] == NPY_NAT:
-                    continue
-                dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
-                curr_reso = _reso_stamp(&dts)
-                if curr_reso < reso:
-                    reso = curr_reso
+
+    for i in range(n):
+        if stamps[i] == NPY_NAT:
+            continue
+
+        if use_utc:
+            local_val = stamps[i]
+        elif use_tzlocal:
+            local_val = tz_convert_utc_to_tzlocal(stamps[i], tz)
+        elif use_fixed:
+            local_val = stamps[i] + delta
+        else:
+            local_val = stamps[i] + deltas[pos[i]]
+
+        dt64_to_dtstruct(local_val, &dts)
+        curr_reso = _reso_stamp(&dts)
+        if curr_reso < reso:
+            reso = curr_reso
 
     return Resolution(reso)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4252,16 +4252,15 @@ def equals(self, other: Any) -> bool:
         if not isinstance(other, Index):
             return False
 
-        if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
-            # if other is not object, use other's logic for coercion
-            return other.equals(self)
-
-        if isinstance(other, ABCMultiIndex):
-            # d-level MultiIndex can equal d-tuple Index
-            return other.equals(self)
-
-        if is_extension_array_dtype(other.dtype):
-            # All EA-backed Index subclasses override equals
+        # If other is a subclass of self and defines its own equals method, we
+        # dispatch to the subclass method. For instance for a MultiIndex,
+        # a d-level MultiIndex can equal d-tuple Index.
+        # Note: All EA-backed Index subclasses override equals
+        if (
+            isinstance(other, type(self))
+            and type(other) is not type(self)
+            and other.equals is not self.equals
+        ):
             return other.equals(self)
 
         return array_equivalent(self._values, other._values)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -20,7 +20,7 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna
+from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna
 
 from pandas.core import accessor
 from pandas.core.algorithms import take_1d
@@ -348,12 +348,12 @@ def _format_attrs(self):
         return attrs
 
     def _format_with_header(self, header, na_rep="NaN") -> List[str]:
-        from pandas.io.formats.format import format_array
+        from pandas.io.formats.printing import pprint_thing
 
-        formatted_values = format_array(
-            self._values, formatter=None, na_rep=na_rep, justify="left"
-        )
-        result = ibase.trim_front(formatted_values)
+        result = [
+            pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
+            for x in self._values
+        ]
         return header + result
 
     # --------------------------------------------------------------------
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
@@ -1,7 +1,7 @@
 from datetime import timedelta
 import operator
 from sys import getsizeof
-from typing import Any, List, Optional
+from typing import Any, Optional
 import warnings
 
 import numpy as np
@@ -33,8 +33,6 @@
 from pandas.core.indexes.numeric import Int64Index
 from pandas.core.ops.common import unpack_zerodim_and_defer
 
-from pandas.io.formats.printing import pprint_thing
-
 _empty_range = range(0)
 
 
@@ -197,9 +195,6 @@ def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
-    def _format_with_header(self, header, na_rep="NaN") -> List[str]:
-        return header + [pprint_thing(x) for x in self._range]
-
     # --------------------------------------------------------------------
     _deprecation_message = (
         "RangeIndex.{} is deprecated and will be "
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -255,6 +255,8 @@ def loc(self) -> "_LocIndexer":
 
         - A boolean array of the same length as the axis being sliced,
           e.g. ``[True, False, True]``.
+        - An alignable boolean Series. The index of the key will be aligned before
+          masking.
         - A ``callable`` function with one argument (the calling Series or
           DataFrame) and that returns valid output for indexing (one of the above)
 
@@ -264,6 +266,8 @@ def loc(self) -> "_LocIndexer":
         ------
         KeyError
             If any items are not found.
+        IndexingError
+            If an indexed key is passed and its index is unalignable to the frame index.
 
         See Also
         --------
@@ -319,6 +323,13 @@ def loc(self) -> "_LocIndexer":
                     max_speed  shield
         sidewinder          7       8
 
+        Alignable boolean Series:
+
+        >>> df.loc[pd.Series([False, True, False],
+        ...        index=['viper', 'sidewinder', 'cobra'])]
+                    max_speed  shield
+        sidewinder          7       8
+
         Conditional that returns a boolean Series
 
         >>> df.loc[df['shield'] > 6]
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import date, datetime
 from io import StringIO
 
 import numpy as np
@@ -1014,3 +1014,33 @@ def test_apply_with_timezones_aware():
     result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
 
     tm.assert_frame_equal(result1, result2)
+
+
+def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
+    # GH 29617
+
+    df = pd.DataFrame(
+        {
+            "A": ["a", "a", "a", "b"],
+            "B": [
+                date(2020, 1, 10),
+                date(2020, 1, 10),
+                date(2020, 2, 10),
+                date(2020, 2, 10),
+            ],
+            "C": [1, 2, 3, 4],
+        },
+        index=pd.Index([100, 101, 102, 103], name="idx"),
+    )
+
+    grp = df.groupby(["A", "B"])
+    result = grp.apply(lambda x: x.head(1))
+
+    expected = df.iloc[[0, 2, 3]]
+    expected = expected.reset_index()
+    expected.index = pd.MultiIndex.from_frame(expected[["A", "B", "idx"]])
+    expected = expected.drop(columns="idx")
+
+    tm.assert_frame_equal(result, expected)
+    for val in result.index.levels[1]:
+        assert type(val) is date
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -992,6 +992,68 @@ def test_frame_describe_unstacked_format():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.filterwarnings(
+    "ignore:"
+    "indexing past lexsort depth may impact performance:"
+    "pandas.errors.PerformanceWarning"
+)
+@pytest.mark.parametrize("as_index", [True, False])
+def test_describe_with_duplicate_output_column_names(as_index):
+    # GH 35314
+    df = pd.DataFrame(
+        {
+            "a": [99, 99, 99, 88, 88, 88],
+            "b": [1, 2, 3, 4, 5, 6],
+            "c": [10, 20, 30, 40, 50, 60],
+        },
+        columns=["a", "b", "b"],
+    )
+
+    expected = (
+        pd.DataFrame.from_records(
+            [
+                ("a", "count", 3.0, 3.0),
+                ("a", "mean", 88.0, 99.0),
+                ("a", "std", 0.0, 0.0),
+                ("a", "min", 88.0, 99.0),
+                ("a", "25%", 88.0, 99.0),
+                ("a", "50%", 88.0, 99.0),
+                ("a", "75%", 88.0, 99.0),
+                ("a", "max", 88.0, 99.0),
+                ("b", "count", 3.0, 3.0),
+                ("b", "mean", 5.0, 2.0),
+                ("b", "std", 1.0, 1.0),
+                ("b", "min", 4.0, 1.0),
+                ("b", "25%", 4.5, 1.5),
+                ("b", "50%", 5.0, 2.0),
+                ("b", "75%", 5.5, 2.5),
+                ("b", "max", 6.0, 3.0),
+                ("b", "count", 3.0, 3.0),
+                ("b", "mean", 5.0, 2.0),
+                ("b", "std", 1.0, 1.0),
+                ("b", "min", 4.0, 1.0),
+                ("b", "25%", 4.5, 1.5),
+                ("b", "50%", 5.0, 2.0),
+                ("b", "75%", 5.5, 2.5),
+                ("b", "max", 6.0, 3.0),
+            ],
+        )
+        .set_index([0, 1])
+        .T
+    )
+    expected.columns.names = [None, None]
+    expected.index = pd.Index([88, 99], name="a")
+
+    if as_index:
+        expected = expected.drop(columns=["a"], level=0)
+    else:
+        expected = expected.reset_index(drop=True)
+
+    result = df.groupby("a", as_index=as_index).describe()
+
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_mean_no_overflow():
     # Regression test for (#22487)
     df = pd.DataFrame(
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2055,3 +2055,17 @@ def test_groups_repr_truncates(max_seq_items, expected):
 
         result = df.groupby(np.array(df.a)).groups.__repr__()
         assert result == expected
+
+
+def test_group_on_two_row_multiindex_returns_one_tuple_key():
+    # GH 18451
+    df = pd.DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
+    df = df.set_index(["a", "b"])
+
+    grp = df.groupby(["a", "b"])
+    result = grp.indices
+    expected = {(1, 2): np.array([0, 1], dtype=np.int64)}
+
+    assert len(result) == 1
+    key = (1, 2)
+    assert (result[key] == expected[key]).all()
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -478,3 +478,9 @@ def test_reindex_base(self):
     def test_map_str(self):
         # See test_map.py
         pass
+
+    def test_format_different_scalar_lengths(self):
+        # GH35439
+        idx = CategoricalIndex(["aaaaaaaaa", "b"])
+        expected = ["aaaaaaaaa", "b"]
+        assert idx.format() == expected
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
@@ -642,6 +642,12 @@ def test_equals_op(self):
             tm.assert_numpy_array_equal(index_a == item, expected3)
             tm.assert_series_equal(series_a == item, Series(expected3))
 
+    def test_format(self):
+        # GH35439
+        idx = self.create_index()
+        expected = [str(x) for x in idx]
+        assert idx.format() == expected
+
     def test_hasnans_isnans(self, index):
         # GH 11343, added tests for hasnans / isnans
         if isinstance(index, MultiIndex):
diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py
@@ -20,6 +20,12 @@ def index(self, request):
     def create_index(self) -> DatetimeIndex:
         return date_range("20130101", periods=5)
 
+    def test_format(self):
+        # GH35439
+        idx = self.create_index()
+        expected = [f"{x:%Y-%m-%d}" for x in idx]
+        assert idx.format() == expected
+
     def test_shift(self):
         pass  # handled in test_ops
 
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ including other versions of pandas.`
`15`	`15`	`Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`	`17`
`18`		`--`
	`18`	+- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`).
`19`	`19`	`-`
`20`	`20`	`-`
`21`	`21`
`@@ -26,6 +26,13 @@ Fixed regressions`
`26`	`26`	`Bug fixes`
`27`	`27`	`~~~~~~~~~`
`28`	`28`
	`29`	`+`
	`30`	`+Categorical`
	`31`	`+^^^^^^^^^^^`
	`32`	`+`
	`33`	+- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`)
	`34`	`+`
	`35`	`+`
`29`	`36`	`Datetimelike`
`30`	`37`
`31`	`38`	`-`