Merge remote-tracking branch 'upstream/master' into docfix-multiindex-set_levels

hweecat · hweecat · commit 0ba97531e053 · 2020-01-02T06:50:14.000+08:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -56,7 +56,7 @@ Dedicated string data type
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 We've added :class:`StringDtype`, an extension type dedicated to string data.
-Previously, strings were typically stored in object-dtype NumPy arrays.
+Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`)
 
 .. warning::
 
@@ -938,6 +938,7 @@ Reshaping
 - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`)
 - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`).
 - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`)
+- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. (:issue:`25760`, :issue:`28956`)
 - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`)
 - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
 - Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`)
@@ -977,7 +978,7 @@ Other
 - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`)
 - Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`)
 - Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`)
--
+- Bug in :meth:`DataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`)
 
 .. _whatsnew_1000.contributors:
 
diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in
@@ -8,14 +8,11 @@ from pandas._libs.algos import is_monotonic
 
 ctypedef fused int_scalar_t:
     int64_t
-    int32_t
     float64_t
-    float32_t
 
 ctypedef fused uint_scalar_t:
     uint64_t
     float64_t
-    float32_t
 
 ctypedef fused scalar_t:
     int_scalar_t
@@ -212,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset):
 {{py:
 
 nodes = []
-for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']:
+for dtype in ['float64', 'int64', 'uint64']:
     for closed, cmp_left, cmp_right in [
         ('left', '<=', '<'),
         ('right', '<', '<='),
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3368,8 +3368,13 @@ def _join_multi(self, other, how, return_indexers=True):
             ldrop_names = list(self_names - overlap)
             rdrop_names = list(other_names - overlap)
 
-            self_jnlevels = self.droplevel(ldrop_names)
-            other_jnlevels = other.droplevel(rdrop_names)
+            # if only the order differs
+            if not len(ldrop_names + rdrop_names):
+                self_jnlevels = self
+                other_jnlevels = other.reorder_levels(self.names)
+            else:
+                self_jnlevels = self.droplevel(ldrop_names)
+                other_jnlevels = other.droplevel(rdrop_names)
 
             # Join left and right
             # Join on same leveled multi-index frames is supported
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -657,9 +657,9 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         if slicer is not None:
             values = values[:, slicer]
         mask = isna(values)
+        itemsize = writers.word_len(na_rep)
 
-        if not self.is_object and not quoting:
-            itemsize = writers.word_len(na_rep)
+        if not self.is_object and not quoting and itemsize:
             values = values.astype(f"<U{itemsize}")
         else:
             values = np.array(values, dtype="object")
@@ -1773,11 +1773,11 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         mask = isna(values)
 
         try:
-            values = values.astype(str)
             values[mask] = na_rep
         except Exception:
             # eg SparseArray does not support setitem, needs to be converted to ndarray
             return super().to_native_types(slicer, na_rep, quoting, **kwargs)
+        values = values.astype(str)
 
         # we are expected to return a 2-d ndarray
         return values.reshape(1, len(values))
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -288,7 +288,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
 def test_array(array, attr, index_or_series):
     box = index_or_series
     if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
-        pytest.skip("No index type for {}".format(array.dtype))
+        pytest.skip(f"No index type for {array.dtype}")
     result = box(array, copy=False).array
 
     if attr:
@@ -351,7 +351,7 @@ def test_to_numpy(array, expected, index_or_series):
     thing = box(array)
 
     if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
-        pytest.skip("No index type for {}".format(array.dtype))
+        pytest.skip(f"No index type for {array.dtype}")
 
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py
@@ -62,8 +62,8 @@ def setup_method(self, method):
         self.unicode_series = Series(arr, index=self.unicode_index, name="a")
 
         types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"]
-        self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
-        self.series = [getattr(self, "{}_series".format(t)) for t in types]
+        self.indexes = [getattr(self, f"{t}_index") for t in types]
+        self.series = [getattr(self, f"{t}_series") for t in types]
 
         # To test narrow dtypes, we use narrower *data* elements, not *index* elements
         index = self.int_index
@@ -79,7 +79,7 @@ def setup_method(self, method):
         self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a")
 
         nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"]
-        self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types]
+        self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types]
 
         self.objs = self.indexes + self.series + self.narrow_series
 
diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py
@@ -20,9 +20,7 @@ def skipif_32bit(param):
     return pytest.param(param, marks=marks)
 
 
-@pytest.fixture(
-    scope="class", params=["int32", "int64", "float32", "float64", "uint64"]
-)
+@pytest.fixture(scope="class", params=["int64", "float64", "uint64"])
 def dtype(request):
     return request.param
 
@@ -39,12 +37,9 @@ def leaf_size(request):
 @pytest.fixture(
     params=[
         np.arange(5, dtype="int64"),
-        np.arange(5, dtype="int32"),
         np.arange(5, dtype="uint64"),
         np.arange(5, dtype="float64"),
-        np.arange(5, dtype="float32"),
         np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"),
-        np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"),
     ]
 )
 def tree(request, leaf_size):
@@ -64,13 +59,14 @@ def test_get_indexer(self, tree):
             tree.get_indexer(np.array([3.0]))
 
     @pytest.mark.parametrize(
-        "dtype, target_value", [("int64", 2 ** 63 + 1), ("uint64", -1)]
+        "dtype, target_value, target_dtype",
+        [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
     )
-    def test_get_indexer_overflow(self, dtype, target_value):
+    def test_get_indexer_overflow(self, dtype, target_value, target_dtype):
         left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype)
         tree = IntervalTree(left, right)
 
-        result = tree.get_indexer(np.array([target_value]))
+        result = tree.get_indexer(np.array([target_value], dtype=target_dtype))
         expected = np.array([-1], dtype="intp")
         tm.assert_numpy_array_equal(result, expected)
 
@@ -94,12 +90,13 @@ def test_get_indexer_non_unique(self, tree):
         tm.assert_numpy_array_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "dtype, target_value", [("int64", 2 ** 63 + 1), ("uint64", -1)]
+        "dtype, target_value, target_dtype",
+        [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
     )
-    def test_get_indexer_non_unique_overflow(self, dtype, target_value):
+    def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype):
         left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype)
         tree = IntervalTree(left, right)
-        target = np.array([target_value])
+        target = np.array([target_value], dtype=target_dtype)
 
         result_indexer, result_missing = tree.get_indexer_non_unique(target)
         expected_indexer = np.array([-1], dtype="intp")
@@ -146,10 +143,10 @@ def test_get_indexer_closed(self, closed, leaf_size):
     @pytest.mark.parametrize(
         "left, right, expected",
         [
-            (np.array([0, 1, 4]), np.array([2, 3, 5]), True),
-            (np.array([0, 1, 2]), np.array([5, 4, 3]), True),
+            (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True),
+            (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True),
             (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True),
-            (np.array([0, 2, 4]), np.array([1, 3, 5]), False),
+            (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False),
             (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False),
         ],
     )
@@ -164,7 +161,7 @@ def test_is_overlapping(self, closed, order, left, right, expected):
     def test_is_overlapping_endpoints(self, closed, order):
         """shared endpoints are marked as overlapping"""
         # GH 23309
-        left, right = np.arange(3), np.arange(1, 4)
+        left, right = np.arange(3, dtype="int64"), np.arange(1, 4)
         tree = IntervalTree(left[order], right[order], closed=closed)
         result = tree.is_overlapping
         expected = closed == "both"
@@ -187,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right):
     @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440")
     def test_construction_overflow(self):
         # GH 25485
-        left, right = np.arange(101), [np.iinfo(np.int64).max] * 101
+        left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101
         tree = IntervalTree(left, right)
 
         # pivot should be average of left/right medians
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
@@ -87,3 +87,19 @@ def test_join_self_unique(idx, join_type):
     if idx.is_unique:
         joined = idx.join(idx, how=join_type)
         assert (idx == joined).all()
+
+
+def test_join_multi_wrong_order():
+    # GH 25760
+    # GH 28956
+
+    midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
+    midx2 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])
+
+    join_idx, lidx, ridx = midx1.join(midx2, return_indexers=False)
+
+    exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)
+
+    tm.assert_index_equal(midx1, join_idx)
+    assert lidx is None
+    tm.assert_numpy_array_equal(ridx, exp_ridx)
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -205,6 +205,14 @@ def test_to_csv_na_rep(self):
         assert df.set_index("a").to_csv(na_rep="_") == expected
         assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
 
+        # GH 29975
+        # Make sure full na_rep shows up when a dtype is provided
+        csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
+        expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
+        assert expected == csv
+        csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ")
+        assert expected == csv
+
     def test_to_csv_date_format(self):
         # GH 10209
         df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
@@ -828,3 +828,22 @@ def test_single_common_level(self):
         ).set_index(["key", "X", "Y"])
 
         tm.assert_frame_equal(result, expected)
+
+    def test_join_multi_wrong_order(self):
+        # GH 25760
+        # GH 28956
+
+        midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
+        midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
+
+        left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
+        right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
+
+        result = left.join(right)
+
+        expected = pd.DataFrame(
+            index=midx1,
+            data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
+        )
+
+        tm.assert_frame_equal(result, expected)