Skip to content

Commit ce619c7

Browse files
authored
Merge branch 'main' into issue-37210-to-sql-truncate
2 parents b7e79af + b7dedf5 commit ce619c7

File tree

14 files changed

+117
-85
lines changed

14 files changed

+117
-85
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ Reshaping
669669
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
670670
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
671671
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
672+
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
672673
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
673674

674675
Sparse

pandas/core/arrays/string_.py

+4
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,10 @@ def __setitem__(self, key, value) -> None:
715715
else:
716716
if not is_array_like(value):
717717
value = np.asarray(value, dtype=object)
718+
else:
719+
# cast categories and friends to arrays to see if values are
720+
# compatible (for compatibility with arrow-backed strings)
721+
value = np.asarray(value)
718722
if len(value) and not lib.is_string_array(value, skipna=True):
719723
raise TypeError("Must provide strings.")
720724

pandas/core/arrays/string_arrow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def _maybe_convert_setitem_value(self, value):
240240
value[isna(value)] = None
241241
for v in value:
242242
if not (v is None or isinstance(v, str)):
243-
raise TypeError("Scalar must be NA or str")
243+
raise TypeError("Must provide strings")
244244
return super()._maybe_convert_setitem_value(value)
245245

246246
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:

pandas/core/construction.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,10 @@ def sanitize_array(
611611
dtype = StringDtype(na_value=np.nan)
612612
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
613613

614-
if subarr is data and copy:
614+
if (
615+
subarr is data
616+
or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr]
617+
) and copy:
615618
subarr = subarr.copy()
616619

617620
else:

pandas/core/indexes/base.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,8 @@ def __new__(
504504

505505
elif is_ea_or_datetimelike_dtype(dtype):
506506
# non-EA dtype indexes have special casting logic, so we punt here
507-
pass
507+
if isinstance(data, (set, frozenset)):
508+
data = list(data)
508509

509510
elif is_ea_or_datetimelike_dtype(data_dtype):
510511
pass
@@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index:
68776878
# We cannot keep the same dtype, so cast to the (often object)
68786879
# minimal shared dtype before doing the insert.
68796880
dtype = self._find_common_type_compat(item)
6881+
if dtype == self.dtype:
6882+
# EAs might run into recursion errors if loc is invalid
6883+
raise
68806884
return self.astype(dtype).insert(loc, item)
68816885

68826886
if arr.dtype != object or not isinstance(

pandas/core/reshape/pivot.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,12 @@ def _all_key(key):
557557
table_pieces.append(piece)
558558
margin_keys.append(all_key)
559559
else:
560-
from pandas import DataFrame
560+
margin = (
561+
data[cols[:1] + values]
562+
.groupby(cols[:1], observed=observed)
563+
.agg(aggfunc, **kwargs)
564+
.T
565+
)
561566

562567
cat_axis = 0
563568
for key, piece in table.groupby(level=0, observed=observed):
@@ -566,9 +571,7 @@ def _all_key(key):
566571
else:
567572
all_key = margins_name
568573
table_pieces.append(piece)
569-
# GH31016 this is to calculate margin for each group, and assign
570-
# corresponded key as index
571-
transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T
574+
transformed_piece = margin[key].to_frame().T
572575
if isinstance(piece.index, MultiIndex):
573576
# We are adding an empty level
574577
transformed_piece.index = MultiIndex.from_tuples(

pandas/tests/arrays/string_/test_string.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,7 @@ def test_setitem_validates(cls, dtype):
102102
with pytest.raises(TypeError, match=msg):
103103
arr[0] = 10
104104

105-
if dtype.storage == "python":
106-
msg = "Must provide strings."
107-
else:
108-
msg = "Scalar must be NA or str"
105+
msg = "Must provide strings"
109106
with pytest.raises(TypeError, match=msg):
110107
arr[:] = np.array([1, 2])
111108

pandas/tests/indexes/base_class/test_setops.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
from pandas import (
108
Index,
@@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort):
233231
expected = Index(expected)
234232
tm.assert_index_equal(result, expected)
235233

236-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
237234
@pytest.mark.parametrize("first_list", [["b", "a"], []])
238235
@pytest.mark.parametrize("second_list", [["a", "b"], []])
239236
@pytest.mark.parametrize(
@@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort):
243240
def test_union_name_preservation(
244241
self, first_list, second_list, first_name, second_name, expected_name, sort
245242
):
243+
expected_dtype = object if not first_list or not second_list else "str"
246244
first = Index(first_list, name=first_name)
247245
second = Index(second_list, name=second_name)
248246
union = first.union(second, sort=sort)
@@ -253,7 +251,7 @@ def test_union_name_preservation(
253251
expected = Index(sorted(vals), name=expected_name)
254252
tm.assert_index_equal(union, expected)
255253
else:
256-
expected = Index(vals, name=expected_name)
254+
expected = Index(vals, name=expected_name, dtype=expected_dtype)
257255
tm.assert_index_equal(union.sort_values(), expected.sort_values())
258256

259257
@pytest.mark.parametrize(

pandas/tests/indexes/test_base.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,6 @@ def test_constructor_casting(self, index):
7676
tm.assert_contains_all(arr, new_index)
7777
tm.assert_index_equal(index, new_index)
7878

79-
@pytest.mark.xfail(
80-
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
81-
)
8279
def test_constructor_copy(self, using_infer_string):
8380
index = Index(list("abc"), name="name")
8481
arr = np.array(index)
@@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass):
343340
def test_view_with_args(self, index):
344341
index.view("i8")
345342

346-
@pytest.mark.xfail(
347-
using_string_dtype() and not HAS_PYARROW,
348-
reason="TODO(infer_string)",
349-
strict=False,
350-
)
351343
@pytest.mark.parametrize(
352344
"index",
353345
[
@@ -364,7 +356,8 @@ def test_view_with_args_object_array_raises(self, index):
364356
msg = "When changing to a larger dtype"
365357
with pytest.raises(ValueError, match=msg):
366358
index.view("i8")
367-
elif index.dtype == "string":
359+
elif index.dtype == "str" and not index.dtype.storage == "python":
360+
# TODO(infer_string): Make the errors consistent
368361
with pytest.raises(NotImplementedError, match="i8"):
369362
index.view("i8")
370363
else:

pandas/tests/indexes/test_old_base.py

+11-15
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
from pandas._libs.tslibs import Timestamp
12-
from pandas.compat import HAS_PYARROW
1310

1411
from pandas.core.dtypes.common import (
1512
is_integer_dtype,
@@ -28,6 +25,7 @@
2825
PeriodIndex,
2926
RangeIndex,
3027
Series,
28+
StringDtype,
3129
TimedeltaIndex,
3230
isna,
3331
period_range,
@@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index):
229227
with pytest.raises(TypeError, match=msg):
230228
idx.any()
231229

232-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
233230
def test_repr_roundtrip(self, simple_index):
234231
if isinstance(simple_index, IntervalIndex):
235232
pytest.skip(f"Not a valid repr for {type(simple_index).__name__}")
@@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index):
246243
repr(idx)
247244
assert "..." not in str(idx)
248245

249-
@pytest.mark.xfail(
250-
using_string_dtype() and not HAS_PYARROW,
251-
reason="TODO(infer_string)",
252-
strict=False,
253-
)
254246
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
255247
def test_ensure_copied_data(self, index):
256248
# Check the "copy" argument of each Index.__new__ is honoured
@@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index):
296288
tm.assert_numpy_array_equal(
297289
index._values._mask, result._values._mask, check_same="same"
298290
)
299-
elif index.dtype == "string[python]":
291+
elif (
292+
isinstance(index.dtype, StringDtype) and index.dtype.storage == "python"
293+
):
300294
assert np.shares_memory(index._values._ndarray, result._values._ndarray)
301295
tm.assert_numpy_array_equal(
302296
index._values._ndarray, result._values._ndarray, check_same="same"
@@ -444,11 +438,7 @@ def test_insert_base(self, index):
444438
result = trimmed.insert(0, index[0])
445439
assert index[0:4].equals(result)
446440

447-
@pytest.mark.skipif(
448-
using_string_dtype(),
449-
reason="completely different behavior, tested elsewher",
450-
)
451-
def test_insert_out_of_bounds(self, index):
441+
def test_insert_out_of_bounds(self, index, using_infer_string):
452442
# TypeError/IndexError matches what np.insert raises in these cases
453443

454444
if len(index) > 0:
@@ -460,6 +450,12 @@ def test_insert_out_of_bounds(self, index):
460450
msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
461451
else:
462452
msg = "slice indices must be integers or None or have an __index__ method"
453+
454+
if using_infer_string and (
455+
index.dtype == "string" or index.dtype == "category" # noqa: PLR1714
456+
):
457+
msg = "loc must be an integer between"
458+
463459
with pytest.raises(err, match=msg):
464460
index.insert(0.5, "foo")
465461

pandas/tests/indexing/test_iloc.py

+16-15
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
from pandas.errors import IndexingError
1210

1311
from pandas import (
@@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self):
11981196
arr[2] = arr[-1]
11991197
assert ser[0] == arr[-1]
12001198

1201-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1202-
def test_iloc_setitem_multicolumn_to_datetime(self):
1199+
def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string):
12031200
# GH#20511
12041201
df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
12051202

1206-
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1207-
expected = DataFrame(
1208-
{
1209-
"A": [
1210-
Timestamp("2021-01-01 00:00:00"),
1211-
Timestamp("2022-01-01 00:00:00"),
1212-
],
1213-
"B": ["2021", "2022"],
1214-
}
1215-
)
1216-
tm.assert_frame_equal(df, expected, check_dtype=False)
1203+
if using_infer_string:
1204+
with pytest.raises(TypeError, match="Invalid value"):
1205+
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1206+
else:
1207+
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1208+
expected = DataFrame(
1209+
{
1210+
"A": [
1211+
Timestamp("2021-01-01 00:00:00"),
1212+
Timestamp("2022-01-01 00:00:00"),
1213+
],
1214+
"B": ["2021", "2022"],
1215+
}
1216+
)
1217+
tm.assert_frame_equal(df, expected, check_dtype=False)
12171218

12181219

12191220
class TestILocErrors:

pandas/tests/indexing/test_indexing.py

+7-11
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas.errors import IndexingError
1412

1513
from pandas.core.dtypes.common import (
@@ -528,12 +526,12 @@ def test_string_slice_empty(self):
528526
with pytest.raises(KeyError, match="^0$"):
529527
df.loc["2011", 0]
530528

531-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
532529
def test_astype_assignment(self, using_infer_string):
533530
# GH4312 (iloc)
534531
df_orig = DataFrame(
535532
[["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
536533
)
534+
df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
537535

538536
df = df_orig.copy()
539537

@@ -543,9 +541,9 @@ def test_astype_assignment(self, using_infer_string):
543541
expected = DataFrame(
544542
[[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
545543
)
546-
if not using_infer_string:
547-
expected["A"] = expected["A"].astype(object)
548-
expected["B"] = expected["B"].astype(object)
544+
expected[list("CDG")] = expected[list("CDG")].astype(object)
545+
expected["A"] = expected["A"].astype(object)
546+
expected["B"] = expected["B"].astype(object)
549547
tm.assert_frame_equal(df, expected)
550548

551549
# GH5702 (loc)
@@ -554,18 +552,16 @@ def test_astype_assignment(self, using_infer_string):
554552
expected = DataFrame(
555553
[[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
556554
)
557-
if not using_infer_string:
558-
expected["A"] = expected["A"].astype(object)
555+
expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
559556
tm.assert_frame_equal(df, expected)
560557

561558
df = df_orig.copy()
559+
562560
df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
563561
expected = DataFrame(
564562
[["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
565563
)
566-
if not using_infer_string:
567-
expected["B"] = expected["B"].astype(object)
568-
expected["C"] = expected["C"].astype(object)
564+
expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
569565
tm.assert_frame_equal(df, expected)
570566

571567
def test_astype_assignment_full_replacements(self):

0 commit comments

Comments
 (0)