Skip to content

Commit 0ba9753

Browse files
committed
Merge remote-tracking branch 'upstream/master' into docfix-multiindex-set_levels
2 parents 46792ac + 641051f commit 0ba9753

File tree

10 files changed

+76
-33
lines changed

10 files changed

+76
-33
lines changed

doc/source/whatsnew/v1.0.0.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ Dedicated string data type
5656
^^^^^^^^^^^^^^^^^^^^^^^^^^
5757

5858
We've added :class:`StringDtype`, an extension type dedicated to string data.
59-
Previously, strings were typically stored in object-dtype NumPy arrays.
59+
Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`)
6060

6161
.. warning::
6262

@@ -938,6 +938,7 @@ Reshaping
938938
- :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`)
939939
- Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`).
940940
- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`)
941+
- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. (:issue:`25760`, :issue:`28956`)
941942
- Bug in :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`)
942943
- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
943944
- Bug in :meth:`DataFrame.replace` that caused a non-numeric replacer's dtype to not be respected (:issue:`26632`)
@@ -977,7 +978,7 @@ Other
977978
- Bug in :meth:`Series.count` where it raised if ``use_inf_as_na`` was enabled (:issue:`29478`)
978979
- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`)
979980
- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`)
980-
-
981+
- Bug in :meth:`DataFrame.to_csv` when supplied a Series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`)
981982

982983
.. _whatsnew_1000.contributors:
983984

pandas/_libs/intervaltree.pxi.in

+1-4
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,11 @@ from pandas._libs.algos import is_monotonic
88

99
ctypedef fused int_scalar_t:
1010
int64_t
11-
int32_t
1211
float64_t
13-
float32_t
1412

1513
ctypedef fused uint_scalar_t:
1614
uint64_t
1715
float64_t
18-
float32_t
1916

2017
ctypedef fused scalar_t:
2118
int_scalar_t
@@ -212,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset):
212209
{{py:
213210

214211
nodes = []
215-
for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']:
212+
for dtype in ['float64', 'int64', 'uint64']:
216213
for closed, cmp_left, cmp_right in [
217214
('left', '<=', '<'),
218215
('right', '<', '<='),

pandas/core/indexes/base.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -3368,8 +3368,13 @@ def _join_multi(self, other, how, return_indexers=True):
33683368
ldrop_names = list(self_names - overlap)
33693369
rdrop_names = list(other_names - overlap)
33703370

3371-
self_jnlevels = self.droplevel(ldrop_names)
3372-
other_jnlevels = other.droplevel(rdrop_names)
3371+
# if only the order differs
3372+
if not len(ldrop_names + rdrop_names):
3373+
self_jnlevels = self
3374+
other_jnlevels = other.reorder_levels(self.names)
3375+
else:
3376+
self_jnlevels = self.droplevel(ldrop_names)
3377+
other_jnlevels = other.droplevel(rdrop_names)
33733378

33743379
# Join left and right
33753380
# Join on same leveled multi-index frames is supported

pandas/core/internals/blocks.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -657,9 +657,9 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
657657
if slicer is not None:
658658
values = values[:, slicer]
659659
mask = isna(values)
660+
itemsize = writers.word_len(na_rep)
660661

661-
if not self.is_object and not quoting:
662-
itemsize = writers.word_len(na_rep)
662+
if not self.is_object and not quoting and itemsize:
663663
values = values.astype(f"<U{itemsize}")
664664
else:
665665
values = np.array(values, dtype="object")
@@ -1773,11 +1773,11 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
17731773
mask = isna(values)
17741774

17751775
try:
1776-
values = values.astype(str)
17771776
values[mask] = na_rep
17781777
except Exception:
17791778
# eg SparseArray does not support setitem, needs to be converted to ndarray
17801779
return super().to_native_types(slicer, na_rep, quoting, **kwargs)
1780+
values = values.astype(str)
17811781

17821782
# we are expected to return a 2-d ndarray
17831783
return values.reshape(1, len(values))

pandas/tests/base/test_conversion.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
288288
def test_array(array, attr, index_or_series):
289289
box = index_or_series
290290
if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
291-
pytest.skip("No index type for {}".format(array.dtype))
291+
pytest.skip(f"No index type for {array.dtype}")
292292
result = box(array, copy=False).array
293293

294294
if attr:
@@ -351,7 +351,7 @@ def test_to_numpy(array, expected, index_or_series):
351351
thing = box(array)
352352

353353
if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
354-
pytest.skip("No index type for {}".format(array.dtype))
354+
pytest.skip(f"No index type for {array.dtype}")
355355

356356
result = thing.to_numpy()
357357
tm.assert_numpy_array_equal(result, expected)

pandas/tests/base/test_ops.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def setup_method(self, method):
6262
self.unicode_series = Series(arr, index=self.unicode_index, name="a")
6363

6464
types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"]
65-
self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
66-
self.series = [getattr(self, "{}_series".format(t)) for t in types]
65+
self.indexes = [getattr(self, f"{t}_index") for t in types]
66+
self.series = [getattr(self, f"{t}_series") for t in types]
6767

6868
# To test narrow dtypes, we use narrower *data* elements, not *index* elements
6969
index = self.int_index
@@ -79,7 +79,7 @@ def setup_method(self, method):
7979
self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a")
8080

8181
nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"]
82-
self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types]
82+
self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types]
8383

8484
self.objs = self.indexes + self.series + self.narrow_series
8585

pandas/tests/indexes/interval/test_interval_tree.py

+14-17
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ def skipif_32bit(param):
2020
return pytest.param(param, marks=marks)
2121

2222

23-
@pytest.fixture(
24-
scope="class", params=["int32", "int64", "float32", "float64", "uint64"]
25-
)
23+
@pytest.fixture(scope="class", params=["int64", "float64", "uint64"])
2624
def dtype(request):
2725
return request.param
2826

@@ -39,12 +37,9 @@ def leaf_size(request):
3937
@pytest.fixture(
4038
params=[
4139
np.arange(5, dtype="int64"),
42-
np.arange(5, dtype="int32"),
4340
np.arange(5, dtype="uint64"),
4441
np.arange(5, dtype="float64"),
45-
np.arange(5, dtype="float32"),
4642
np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"),
47-
np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"),
4843
]
4944
)
5045
def tree(request, leaf_size):
@@ -64,13 +59,14 @@ def test_get_indexer(self, tree):
6459
tree.get_indexer(np.array([3.0]))
6560

6661
@pytest.mark.parametrize(
67-
"dtype, target_value", [("int64", 2 ** 63 + 1), ("uint64", -1)]
62+
"dtype, target_value, target_dtype",
63+
[("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
6864
)
69-
def test_get_indexer_overflow(self, dtype, target_value):
65+
def test_get_indexer_overflow(self, dtype, target_value, target_dtype):
7066
left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype)
7167
tree = IntervalTree(left, right)
7268

73-
result = tree.get_indexer(np.array([target_value]))
69+
result = tree.get_indexer(np.array([target_value], dtype=target_dtype))
7470
expected = np.array([-1], dtype="intp")
7571
tm.assert_numpy_array_equal(result, expected)
7672

@@ -94,12 +90,13 @@ def test_get_indexer_non_unique(self, tree):
9490
tm.assert_numpy_array_equal(result, expected)
9591

9692
@pytest.mark.parametrize(
97-
"dtype, target_value", [("int64", 2 ** 63 + 1), ("uint64", -1)]
93+
"dtype, target_value, target_dtype",
94+
[("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
9895
)
99-
def test_get_indexer_non_unique_overflow(self, dtype, target_value):
96+
def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype):
10097
left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype)
10198
tree = IntervalTree(left, right)
102-
target = np.array([target_value])
99+
target = np.array([target_value], dtype=target_dtype)
103100

104101
result_indexer, result_missing = tree.get_indexer_non_unique(target)
105102
expected_indexer = np.array([-1], dtype="intp")
@@ -146,10 +143,10 @@ def test_get_indexer_closed(self, closed, leaf_size):
146143
@pytest.mark.parametrize(
147144
"left, right, expected",
148145
[
149-
(np.array([0, 1, 4]), np.array([2, 3, 5]), True),
150-
(np.array([0, 1, 2]), np.array([5, 4, 3]), True),
146+
(np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True),
147+
(np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True),
151148
(np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True),
152-
(np.array([0, 2, 4]), np.array([1, 3, 5]), False),
149+
(np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False),
153150
(np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False),
154151
],
155152
)
@@ -164,7 +161,7 @@ def test_is_overlapping(self, closed, order, left, right, expected):
164161
def test_is_overlapping_endpoints(self, closed, order):
165162
"""shared endpoints are marked as overlapping"""
166163
# GH 23309
167-
left, right = np.arange(3), np.arange(1, 4)
164+
left, right = np.arange(3, dtype="int64"), np.arange(1, 4)
168165
tree = IntervalTree(left[order], right[order], closed=closed)
169166
result = tree.is_overlapping
170167
expected = closed == "both"
@@ -187,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right):
187184
@pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440")
188185
def test_construction_overflow(self):
189186
# GH 25485
190-
left, right = np.arange(101), [np.iinfo(np.int64).max] * 101
187+
left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101
191188
tree = IntervalTree(left, right)
192189

193190
# pivot should be average of left/right medians

pandas/tests/indexes/multi/test_join.py

+16
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,19 @@ def test_join_self_unique(idx, join_type):
8787
if idx.is_unique:
8888
joined = idx.join(idx, how=join_type)
8989
assert (idx == joined).all()
90+
91+
92+
def test_join_multi_wrong_order():
93+
# GH 25760
94+
# GH 28956
95+
96+
midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
97+
midx2 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])
98+
99+
join_idx, lidx, ridx = midx1.join(midx2, return_indexers=False)
100+
101+
exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)
102+
103+
tm.assert_index_equal(midx1, join_idx)
104+
assert lidx is None
105+
tm.assert_numpy_array_equal(ridx, exp_ridx)

pandas/tests/io/formats/test_to_csv.py

+8
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,14 @@ def test_to_csv_na_rep(self):
205205
assert df.set_index("a").to_csv(na_rep="_") == expected
206206
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
207207

208+
# GH 29975
209+
# Make sure full na_rep shows up when a dtype is provided
210+
csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
211+
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
212+
assert expected == csv
213+
csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ")
214+
assert expected == csv
215+
208216
def test_to_csv_date_format(self):
209217
# GH 10209
210218
df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})

pandas/tests/reshape/merge/test_multi.py

+19
Original file line numberDiff line numberDiff line change
@@ -828,3 +828,22 @@ def test_single_common_level(self):
828828
).set_index(["key", "X", "Y"])
829829

830830
tm.assert_frame_equal(result, expected)
831+
832+
def test_join_multi_wrong_order(self):
833+
# GH 25760
834+
# GH 28956
835+
836+
midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
837+
midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"])
838+
839+
left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]})
840+
right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]})
841+
842+
result = left.join(right)
843+
844+
expected = pd.DataFrame(
845+
index=midx1,
846+
data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]},
847+
)
848+
849+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)