Skip to content

Commit e8ce5e8

Browse files
committed
down to 322 fails
1 parent 7f73a89 commit e8ce5e8

File tree

7 files changed

+58
-34
lines changed

7 files changed

+58
-34
lines changed

pandas/core/frame.py

+4 -9
Original file line number | Diff line number | Diff line change
@@ -4760,7 +4760,8 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
47604760
and not is_bool_dtype(dtype)
47614761
)
47624762

4763-
def predicate(dtype: DtypeObj) -> bool:
4763+
def predicate(arr: ArrayLike) -> bool:
4764+
dtype = arr.dtype
47644765
if include:
47654766
if not dtype_predicate(dtype, include):
47664767
return False
@@ -4771,14 +4772,8 @@ def predicate(dtype: DtypeObj) -> bool:
47714772

47724773
return True
47734774

4774-
def arr_predicate(arr: ArrayLike) -> bool:
4775-
dtype = arr.dtype
4776-
return predicate(dtype)
4777-
4778-
mgr, taker = self._mgr._get_data_subset(arr_predicate).copy(deep=None)
4779-
# FIXME: get axes without mgr.axes
4780-
# FIXME: return taker from _get_data_subset, this is really slow
4781-
#taker = self.dtypes.apply(predicate).values.nonzero()[0]
4775+
mgr, taker = self._mgr._get_data_subset(predicate)
4776+
mgr = mgr.copy(deep=None)
47824777
columns = self.columns.take(taker)
47834778
return type(self)(mgr, columns=columns, index=self.index).__finalize__(self)
47844779

pandas/core/generic.py

+9 -4
Original file line number | Diff line number | Diff line change
@@ -840,7 +840,7 @@ def _validate_set_axis(self, axis: int, new_labels: Index) -> None:
840840
old_len = self.shape[axis]
841841
new_len = len(new_labels)
842842

843-
if axis == 1 and len(self.columns) == 0:
843+
if self.ndim > 1 and axis == 0 and len(self.columns) == 0:
844844
# If we are setting the index on a DataFrame with no columns,
845845
# it is OK to change the length.
846846
pass
@@ -3933,6 +3933,14 @@ def _take(
39333933
convert_indices=convert_indices,
39343934
)
39353935

3936+
# We have 6 tests that get here with a slice; TODO: maybe avoid?
3937+
# TODO: de-duplicate with similar inside BlockManager.take
3938+
indices = (
3939+
np.arange(indices.start, indices.stop, indices.step, dtype=np.intp)
3940+
if isinstance(indices, slice)
3941+
else np.asanyarray(indices, dtype=np.intp) # <- converts some cases with empty float64
3942+
)
3943+
39363944
axes_dict = self._construct_axes_dict()
39373945
if convert_indices and isinstance(indices, np.ndarray):
39383946
# i.e. exclude slice, which in principle shouldn't be in a _take
@@ -5490,9 +5498,6 @@ def _reindex_with_indexers(
54905498
if copy and new_data is self._mgr:
54915499
new_data = new_data.copy()
54925500

5493-
# FIXME: get axes without mgr.axes
5494-
#axes_dict = self._get_axes_from_mgr(new_data)
5495-
54965501
return self._constructor(new_data, **axes_dict).__finalize__(self)
54975502

54985503
def filter(

pandas/core/groupby/generic.py

+16 -3
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series:
175175
else:
176176
mgr = cast(Manager2D, mgr)
177177
single = mgr.iget(0)
178+
#breakpoint()
178179
# FIXME: get axes without mgr.axes
179180
index = single.axes[0]
180181
ser = self.obj._constructor(single, index=index, name=self.obj.name)
@@ -1329,14 +1330,26 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
13291330

13301331
# We could use `mgr.apply` here and not have to set_axis, but
13311332
# we would have to do shape gymnastics for ArrayManager compat
1332-
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
1333+
res_mgr, taker = mgr.grouped_reduce(arr_func, ignore_failures=True)
13331334
res_mgr.set_axis(1, mgr.axes[1])
13341335

13351336
if len(res_mgr) < orig_mgr_len:
13361337
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
13371338

1338-
# FIXME: get axes without mgr.axes
1339-
res_df = self.obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
1339+
columns = mgr.axes[0]
1340+
index = res_mgr.axes[1] # FIXME: get index without res_mgr.axes
1341+
if self.axis == 0:
1342+
1343+
pass#index = self._obj_with_exclusions.index
1344+
#columns = columns[taker]
1345+
#breakpoint()
1346+
else:
1347+
#columns = self._obj_with_exclusions.index
1348+
pass#index = self._obj_with_exclusions.columns
1349+
#breakpoint()
1350+
1351+
columns = columns[taker]
1352+
res_df = self.obj._constructor(res_mgr, index=index, columns=columns)
13401353
if self.axis == 1:
13411354
res_df = res_df.T
13421355
return res_df

pandas/core/groupby/groupby.py

+12 -6
Original file line number | Diff line number | Diff line change
@@ -1780,7 +1780,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
17801780

17811781
# TypeError -> we may have an exception in trying to aggregate
17821782
# continue and exclude the block
1783-
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
1783+
new_mgr, taker = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
17841784

17851785
if not is_ser and len(new_mgr) < orig_len:
17861786
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
@@ -2055,7 +2055,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
20552055
return counted[0]
20562056
return counted
20572057

2058-
new_mgr = data.grouped_reduce(hfunc)
2058+
new_mgr, taker = data.grouped_reduce(hfunc)
20592059

20602060
# If we are grouping on categoricals we want unobserved categories to
20612061
# return zero, rather than the default of NaN which the reindexing in
@@ -3374,7 +3374,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
33743374
mgr = self._get_data_to_aggregate()
33753375
data = mgr.get_numeric_data()[0] if numeric_only_bool else mgr
33763376
ignore_failures = numeric_only_bool
3377-
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
3377+
res_mgr, taker = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
33783378

33793379
if (
33803380
numeric_only is lib.no_default
@@ -3401,6 +3401,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
34013401
axes_dict["index"] = res_mgr.axes[-1]
34023402
if res_mgr.ndim == 2:
34033403
axes_dict["columns"] = res_mgr.axes[0]
3404+
#breakpoint()
34043405
res = obj._constructor(res_mgr, **axes_dict)
34053406

34063407
if orig_scalar:
@@ -3693,7 +3694,7 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
36933694
skipna = kwargs.get("skipna", True)
36943695
if axis != 0:
36953696
f = lambda x: np.minimum.accumulate(x, axis)
3696-
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
3697+
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) # TODO: "cummin"?
36973698
obj = self._selected_obj
36983699
if numeric_only_bool:
36993700
obj = obj._get_numeric_data()
@@ -3853,7 +3854,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
38533854
if numeric_only_bool:
38543855
mgr = mgr.get_numeric_data()[0]
38553856

3856-
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
3857+
res_mgr, taker = mgr.grouped_reduce(blk_func, ignore_failures=True)
38573858

38583859
if not is_ser and len(res_mgr.items) != orig_mgr_len:
38593860
howstr = how.replace("group_", "")
@@ -3871,7 +3872,12 @@ def blk_func(values: ArrayLike) -> ArrayLike:
38713872
out = self._wrap_agged_manager(res_mgr)
38723873
else:
38733874
# FIXME: get axes without mgr.axes
3874-
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=res_mgr.axes[0])
3875+
if self.axis == 0 and not numeric_only_bool:
3876+
columns = self._obj_with_exclusions.columns[taker]
3877+
else:
3878+
#breakpoint()
3879+
columns = res_mgr.axes[0]
3880+
out = obj._constructor(res_mgr, index=res_mgr.axes[1], columns=columns)
38753881

38763882
return self._wrap_aggregated_output(out)
38773883

pandas/core/internals/array_manager.py

+10 -8
Original file line number | Diff line number | Diff line change
@@ -464,7 +464,7 @@ def is_view(self) -> bool:
464464
def is_single_block(self) -> bool:
465465
return len(self.arrays) == 1
466466

467-
def _get_data_subset(self: T, predicate: Callable) -> T:
467+
def _get_data_subset(self: T, predicate: Callable) -> tuple[T, npt.NDArray[np.intp]]:
468468
indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
469469
arrays = [self.arrays[i] for i in indices]
470470
# TODO copy?
@@ -473,9 +473,9 @@ def _get_data_subset(self: T, predicate: Callable) -> T:
473473
taker = np.array(indices, dtype="intp")
474474
new_cols = self._axes[1].take(taker)
475475
new_axes = [self._axes[0], new_cols]
476-
return type(self)(arrays, new_axes, verify_integrity=False)
476+
return type(self)(arrays, new_axes, verify_integrity=False), taker
477477

478-
def get_bool_data(self: T, copy: bool = False) -> T:
478+
def get_bool_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
479479
"""
480480
Select columns that are bool-dtype and object-dtype columns that are all-bool.
481481
@@ -485,9 +485,8 @@ def get_bool_data(self: T, copy: bool = False) -> T:
485485
Whether to copy the blocks
486486
"""
487487
return self._get_data_subset(is_inferred_bool_dtype)
488-
# FIXME: return indexer
489488

490-
def get_numeric_data(self: T, copy: bool = False) -> T:
489+
def get_numeric_data(self: T, copy: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
491490
"""
492491
Select columns that have a numeric dtype.
493492
@@ -935,7 +934,7 @@ def idelete(self, indexer) -> ArrayManager:
935934
# --------------------------------------------------------------------
936935
# Array-wise Operation
937936

938-
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
937+
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
939938
"""
940939
Apply grouped reduction function columnwise, returning a new ArrayManager.
941940
@@ -948,6 +947,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
948947
Returns
949948
-------
950949
ArrayManager
950+
np.ndarray[intp]
951951
"""
952952
result_arrays: list[np.ndarray] = []
953953
result_indices: list[int] = []
@@ -975,14 +975,16 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
975975
else:
976976
index = Index(range(result_arrays[0].shape[0]))
977977

978+
taker = None
978979
if ignore_failures:
979-
columns = self.items[np.array(result_indices, dtype="int64")]
980+
taker = np.array(result_indices, dtype=np.intp)
981+
columns = self.items[taker]
980982
else:
981983
columns = self.items
982984

983985
# error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
984986
# expected "List[Union[ndarray, ExtensionArray]]"
985-
return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
987+
return type(self)(result_arrays, [index, columns]), taker # type: ignore[arg-type]
986988

987989
def reduce(
988990
self: T, func: Callable, ignore_failures: bool = False

pandas/core/internals/base.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -203,7 +203,7 @@ def grouped_reduce(self, func, ignore_failures: bool = False):
203203
index = default_index(len(res))
204204

205205
mgr = type(self).from_array(res, index)
206-
return mgr
206+
return mgr, np.arange(len(res), dtype=np.intp) # TODO: is taker meaningful here?
207207

208208
@classmethod
209209
def from_array(cls, arr: ArrayLike, index: Index):

pandas/core/internals/managers.py

+6 -3
Original file line number | Diff line number | Diff line change
@@ -1475,7 +1475,7 @@ def idelete(self, indexer) -> BlockManager:
14751475
# ----------------------------------------------------------------
14761476
# Block-wise Operation
14771477

1478-
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
1478+
def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> tuple[T, npt.NDArray[np.intp]]:
14791479
"""
14801480
Apply grouped reduction function blockwise, returning a new BlockManager.
14811481
@@ -1488,6 +1488,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
14881488
Returns
14891489
-------
14901490
BlockManager
1491+
np.ndarray[intp]
14911492
"""
14921493
result_blocks: list[Block] = []
14931494
dropped_any = False
@@ -1522,9 +1523,10 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
15221523

15231524
if dropped_any:
15241525
# faster to skip _combine if we haven't dropped any blocks
1525-
return self._combine(result_blocks, copy=False, index=index)[0]
1526+
return self._combine(result_blocks, copy=False, index=index)
15261527

1527-
return type(self).from_blocks(result_blocks, [self.axes[0], index])
1528+
taker = np.arange(len(self), dtype=np.intp)
1529+
return type(self).from_blocks(result_blocks, [self.axes[0], index]), taker
15281530

15291531
def reduce(
15301532
self: T, func: Callable, ignore_failures: bool = False
@@ -2055,6 +2057,7 @@ def array_values(self):
20552057

20562058
def get_numeric_data(self, copy: bool = False):
20572059
if self._block.is_numeric:
2060+
taker = np.arange(len(self.items), dtype=np.intp)
20582061
return self.copy(deep=copy), taker
20592062
taker = np.array([], dtype=np.intp)
20602063
return self.make_empty(), taker

0 commit comments

Comments
 (0)