Skip to content

Commit 934a052

Browse files
Merge branch 'master' into duplicate-cut
2 parents aa1d1eb + daec2e7 commit 934a052

File tree

19 files changed

+135
-56
lines changed

19 files changed

+135
-56
lines changed

ci/run_tests.sh

+6-3
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ fi
3030
echo $PYTEST_CMD
3131
sh -c "$PYTEST_CMD"
3232

33-
PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas"
33+
if [[ "$PANDAS_DATA_MANAGER" != "array" ]]; then
34+
# The ArrayManager tests should have already been run by PYTEST_CMD if PANDAS_DATA_MANAGER was already set to array
35+
PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas"
3436

35-
echo $PYTEST_AM_CMD
36-
sh -c "$PYTEST_AM_CMD"
37+
echo $PYTEST_AM_CMD
38+
sh -c "$PYTEST_AM_CMD"
39+
fi

doc/source/whatsnew/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Version 1.3
2424
.. toctree::
2525
:maxdepth: 2
2626

27+
v1.3.2
2728
v1.3.1
2829
v1.3.0
2930

doc/source/whatsnew/v1.3.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,4 @@ Bug fixes
4848
Contributors
4949
~~~~~~~~~~~~
5050

51-
.. contributors:: v1.3.0..v1.3.1|HEAD
51+
.. contributors:: v1.3.0..v1.3.1

doc/source/whatsnew/v1.3.2.rst

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
.. _whatsnew_132:
2+
3+
What's new in 1.3.2 (August ??, 2021)
4+
-------------------------------------
5+
6+
These are the changes in pandas 1.3.2. See :ref:`release` for a full changelog
7+
including other versions of pandas.
8+
9+
{{ header }}
10+
11+
.. ---------------------------------------------------------------------------
12+
13+
.. _whatsnew_132.regressions:
14+
15+
Fixed regressions
16+
~~~~~~~~~~~~~~~~~
17+
-
18+
-
19+
20+
.. ---------------------------------------------------------------------------
21+
22+
.. _whatsnew_132.bug_fixes:
23+
24+
Bug fixes
25+
~~~~~~~~~
26+
-
27+
-
28+
29+
.. ---------------------------------------------------------------------------
30+
31+
.. _whatsnew_132.other:
32+
33+
Other
34+
~~~~~
35+
-
36+
-
37+
38+
.. ---------------------------------------------------------------------------
39+
40+
.. _whatsnew_132.contributors:
41+
42+
Contributors
43+
~~~~~~~~~~~~
44+
45+
.. contributors:: v1.3.1..v1.3.2|HEAD

doc/source/whatsnew/v1.4.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enhancement2
2929

3030
Other enhancements
3131
^^^^^^^^^^^^^^^^^^
32+
- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`)
3233
- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
3334
- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
3435
- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
@@ -235,6 +236,7 @@ MultiIndex
235236
^^^^^^^^^^
236237
- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
237238
- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
239+
- Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`)
238240
-
239241

240242
I/O
@@ -262,8 +264,8 @@ Groupby/resample/rolling
262264

263265
Reshaping
264266
^^^^^^^^^
267+
- Bug in :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
265268
- Bug in :func:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :class:`pandas.CategoricalIndex` (:issue:`42425`)
266-
-
267269

268270
Sparse
269271
^^^^^^

pandas/core/groupby/generic.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1033,7 +1033,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
10331033
self._insert_inaxis_grouper_inplace(result)
10341034
result.index = Index(range(len(result)))
10351035

1036-
return result._convert(datetime=True)
1036+
return result
10371037

10381038
agg = aggregate
10391039

@@ -1684,6 +1684,8 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
16841684
if self.axis == 1:
16851685
result = result.T
16861686

1687+
# Note: we only need to pass datetime=True in order to get numeric
1688+
# values converted
16871689
return self._reindex_output(result)._convert(datetime=True)
16881690

16891691
def _iterate_column_groupbys(self, obj: FrameOrSeries):

pandas/core/groupby/grouper.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -619,11 +619,20 @@ def group_arraylike(self) -> ArrayLike:
619619
Analogous to result_index, but holding an ArrayLike to ensure
620620
we can retain ExtensionDtypes.
621621
"""
622+
if self._group_index is not None:
623+
# _group_index is set in __init__ for MultiIndex cases
624+
return self._group_index._values
625+
626+
elif self._all_grouper is not None:
627+
# retain dtype for categories, including unobserved ones
628+
return self.result_index._values
629+
622630
return self._codes_and_uniques[1]
623631

624632
@cache_readonly
625633
def result_index(self) -> Index:
626-
# TODO: what's the difference between result_index vs group_index?
634+
# result_index retains dtype for categories, including unobserved ones,
635+
# which group_index does not
627636
if self._all_grouper is not None:
628637
group_idx = self.group_index
629638
assert isinstance(group_idx, CategoricalIndex)
@@ -635,7 +644,8 @@ def group_index(self) -> Index:
635644
if self._group_index is not None:
636645
# _group_index is set in __init__ for MultiIndex cases
637646
return self._group_index
638-
uniques = self.group_arraylike
647+
648+
uniques = self._codes_and_uniques[1]
639649
return Index(uniques, name=self.name)
640650

641651
@cache_readonly

pandas/core/groupby/ops.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,7 @@ def result_arraylike(self) -> ArrayLike:
885885
if len(self.groupings) == 1:
886886
return self.groupings[0].group_arraylike
887887

888+
# result_index is MultiIndex
888889
return self.result_index._values
889890

890891
@cache_readonly
@@ -903,12 +904,12 @@ def get_group_levels(self) -> list[ArrayLike]:
903904
# Note: only called from _insert_inaxis_grouper_inplace, which
904905
# is only called for BaseGrouper, never for BinGrouper
905906
if len(self.groupings) == 1:
906-
return [self.groupings[0].result_index]
907+
return [self.groupings[0].group_arraylike]
907908

908909
name_list = []
909910
for ping, codes in zip(self.groupings, self.reconstructed_codes):
910911
codes = ensure_platform_int(codes)
911-
levels = ping.result_index.take(codes)
912+
levels = ping.group_arraylike.take(codes)
912913

913914
name_list.append(levels)
914915

pandas/core/indexes/base.py

-18
Original file line numberDiff line numberDiff line change
@@ -5514,16 +5514,6 @@ def _get_indexer_non_comparable(
55145514
"""
55155515
if method is not None:
55165516
other = unpack_nested_dtype(target)
5517-
if self._is_multi ^ other._is_multi:
5518-
kind = other.dtype.type if self._is_multi else self.dtype.type
5519-
raise TypeError(
5520-
f"'<' not supported between instances of {kind} and 'tuple'"
5521-
)
5522-
elif self._is_multi and other._is_multi:
5523-
assert self.nlevels != other.nlevels
5524-
# Python allows comparison between tuples of different lengths,
5525-
# but for our purposes such a comparison is not meaningful.
5526-
raise TypeError("'<' not supported between tuples of different lengths")
55275517
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
55285518

55295519
no_matches = -1 * np.ones(target.shape, dtype=np.intp)
@@ -5653,14 +5643,6 @@ def _should_compare(self, other: Index) -> bool:
56535643

56545644
other = unpack_nested_dtype(other)
56555645
dtype = other.dtype
5656-
if other._is_multi:
5657-
if not self._is_multi:
5658-
# other contains only tuples so unless we are object-dtype,
5659-
# there can never be any matches
5660-
return self._is_comparable_dtype(dtype)
5661-
return self.nlevels == other.nlevels
5662-
# TODO: we can get more specific requiring levels are comparable?
5663-
56645646
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
56655647

56665648
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:

pandas/core/indexes/multi.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -2846,9 +2846,17 @@ def _maybe_to_slice(loc):
28462846
# needs linear search within the slice
28472847
i = self._lexsort_depth
28482848
lead_key, follow_key = key[:i], key[i:]
2849-
start, stop = (
2850-
self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self))
2851-
)
2849+
2850+
if not lead_key:
2851+
start = 0
2852+
stop = len(self)
2853+
else:
2854+
try:
2855+
start, stop = self.slice_locs(lead_key, lead_key)
2856+
except TypeError as err:
2857+
# e.g. test_groupby_example key = ((0, 0, 1, 2), "new_col")
2858+
# when self has 5 integer levels
2859+
raise KeyError(key) from err
28522860

28532861
if start == stop:
28542862
raise KeyError(key)

pandas/core/indexing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -646,8 +646,8 @@ def _get_setitem_indexer(self, key):
646646

647647
ax = self.obj._get_axis(0)
648648

649-
if isinstance(ax, MultiIndex) and self.name != "iloc":
650-
with suppress(TypeError, KeyError, InvalidIndexError):
649+
if isinstance(ax, MultiIndex) and self.name != "iloc" and is_hashable(key):
650+
with suppress(KeyError, InvalidIndexError):
651651
# TypeError e.g. passed a bool
652652
return ax.get_loc(key)
653653

pandas/core/reshape/concat.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -717,8 +717,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
717717
new_levels.extend(new_index.levels)
718718
new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
719719
else:
720-
new_levels.append(new_index)
721-
new_codes.append(np.tile(np.arange(n), kpieces))
720+
new_levels.append(new_index.unique())
721+
single_codes = new_index.unique().get_indexer(new_index)
722+
new_codes.append(np.tile(single_codes, kpieces))
722723

723724
if len(new_names) < len(new_levels):
724725
new_names.extend(new_index.names)

pandas/tests/extension/base/groupby.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ def test_grouping_grouper(self, data_for_grouping):
2222
def test_groupby_extension_agg(self, as_index, data_for_grouping):
2323
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
2424
result = df.groupby("B", as_index=as_index).A.mean()
25-
_, index = pd.factorize(data_for_grouping, sort=True)
25+
_, uniques = pd.factorize(data_for_grouping, sort=True)
2626

27-
index = pd.Index(index, name="B")
28-
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
2927
if as_index:
28+
index = pd.Index(uniques, name="B")
29+
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
3030
self.assert_series_equal(result, expected)
3131
else:
32-
expected = expected.reset_index()
32+
expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
3333
self.assert_frame_equal(result, expected)
3434

3535
def test_groupby_agg_extension(self, data_for_grouping):

pandas/tests/extension/json/test_json.py

-4
Original file line numberDiff line numberDiff line change
@@ -312,10 +312,6 @@ def test_groupby_extension_apply(self):
312312
we'll be able to dispatch unique.
313313
"""
314314

315-
@pytest.mark.parametrize("as_index", [True, False])
316-
def test_groupby_extension_agg(self, as_index, data_for_grouping):
317-
super().test_groupby_extension_agg(as_index, data_for_grouping)
318-
319315
@pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
320316
def test_groupby_agg_extension(self, data_for_grouping):
321317
super().test_groupby_agg_extension(data_for_grouping)

pandas/tests/extension/test_boolean.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -269,14 +269,14 @@ def test_grouping_grouper(self, data_for_grouping):
269269
def test_groupby_extension_agg(self, as_index, data_for_grouping):
270270
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
271271
result = df.groupby("B", as_index=as_index).A.mean()
272-
_, index = pd.factorize(data_for_grouping, sort=True)
272+
_, uniques = pd.factorize(data_for_grouping, sort=True)
273273

274-
index = pd.Index(index, name="B")
275-
expected = pd.Series([3.0, 1.0], index=index, name="A")
276274
if as_index:
275+
index = pd.Index(uniques, name="B")
276+
expected = pd.Series([3.0, 1.0], index=index, name="A")
277277
self.assert_series_equal(result, expected)
278278
else:
279-
expected = expected.reset_index()
279+
expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]})
280280
self.assert_frame_equal(result, expected)
281281

282282
def test_groupby_agg_extension(self, data_for_grouping):

pandas/tests/groupby/test_groupby.py

+4
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,10 @@ def test_ops_not_as_index(reduction_func):
717717
expected = expected.rename("size")
718718
expected = expected.reset_index()
719719

720+
if reduction_func != "size":
721+
# 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
722+
expected["a"] = expected["a"].astype(df["a"].dtype)
723+
720724
g = df.groupby("a", as_index=False)
721725

722726
result = getattr(g, reduction_func)()

pandas/tests/indexes/multi/test_indexing.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import timedelta
2+
import re
23

34
import numpy as np
45
import pytest
@@ -457,15 +458,6 @@ def test_get_indexer_kwarg_validation(self):
457458
with pytest.raises(ValueError, match=msg):
458459
mi.get_indexer(mi[:-1], tolerance="piano")
459460

460-
def test_get_indexer_mismatched_nlevels(self):
461-
mi = MultiIndex.from_product([range(3), ["A", "B"]])
462-
463-
other = MultiIndex.from_product([range(3), ["A", "B"], range(2)])
464-
465-
msg = "tuples of different lengths"
466-
with pytest.raises(TypeError, match=msg):
467-
mi.get_indexer(other, method="pad")
468-
469461

470462
def test_getitem(idx):
471463
# scalar
@@ -698,6 +690,14 @@ def test_multiindex_get_loc_list_raises(self):
698690
with pytest.raises(TypeError, match=msg):
699691
idx.get_loc([])
700692

693+
def test_get_loc_nested_tuple_raises_keyerror(self):
694+
# raise KeyError, not TypeError
695+
mi = MultiIndex.from_product([range(3), range(4), range(5), range(6)])
696+
key = ((2, 3, 4), "foo")
697+
698+
with pytest.raises(KeyError, match=re.escape(str(key))):
699+
mi.get_loc(key)
700+
701701

702702
class TestWhere:
703703
def test_where(self):

pandas/tests/reshape/concat/test_dataframe.py

+12
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,15 @@ def test_concat_bool_with_int(self):
180180
result = concat([df1, df2])
181181
expected = concat([df1.astype("int64"), df2])
182182
tm.assert_frame_equal(result, expected)
183+
184+
def test_concat_duplicates_in_index_with_keys(self):
185+
# GH#42651
186+
index = [1, 1, 3]
187+
data = [1, 2, 3]
188+
189+
df = DataFrame(data=data, index=index)
190+
result = concat([df], keys=["A"], names=["ID", "date"])
191+
mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"])
192+
expected = DataFrame(data=data, index=mi)
193+
tm.assert_frame_equal(result, expected)
194+
tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))

pandas/tests/tools/test_to_datetime.py

+12
Original file line numberDiff line numberDiff line change
@@ -2530,3 +2530,15 @@ def test_empty_string_datetime_coerce__unit():
25302530
# verify that no exception is raised even when errors='raise' is set
25312531
result = to_datetime([1, ""], unit="s", errors="raise")
25322532
tm.assert_index_equal(expected, result)
2533+
2534+
2535+
@pytest.mark.parametrize("cache", [True, False])
2536+
def test_to_datetime_monotonic_increasing_index(cache):
2537+
# GH28238
2538+
cstart = start_caching_at
2539+
times = date_range(Timestamp("1980"), periods=cstart, freq="YS")
2540+
times = times.to_frame(index=False, name="DT").sample(n=cstart, random_state=1)
2541+
times.index = times.index.to_series().astype(float) / 1000
2542+
result = to_datetime(times.iloc[:, 0], cache=cache)
2543+
expected = times.iloc[:, 0]
2544+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)