Skip to content

Commit d515b5b

Browse files
author
Mateusz Górski
committed
BUG: fix DataFrame.apply returning wrong result when dealing with dtype (pandas-dev#28773)
DataFrame.apply sometimes returned a wrong result when the passed function depended on dtypes. This was caused by retrieving DataFrame.values for the whole DataFrame and applying the function to it: the values are represented by a NumPy array, which has a single dtype for all the data inside, so objects in the DataFrame were sometimes treated as if they all had one common type. It is worth mentioning that the problem only occurred when the function was applied on columns. The implemented solution "cuts" the DataFrame by columns and applies the function to each part as if it were a whole DataFrame. After that, all results are concatenated into the final result for the whole DataFrame. The "cuts" are made in the following way: the first column is taken, and then we iterate through the following columns and add them to the first cut while their dtype is identical to that of the first column. The process is then repeated for the rest of the DataFrame.
1 parent 555e6e1 commit d515b5b

File tree

3 files changed

+65
-11
lines changed

3 files changed

+65
-11
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,7 @@ Other
559559
- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
560560
- Fix :class:`AbstractHolidayCalendar` to return correct results for
561561
years after 2030 (now goes up to 2200) (:issue:`27790`)
562-
562+
- Bug in :meth:`DataFrame.apply` returning a wrong result in some cases when the passed function depended on the dtype of the data (:issue:`28773`)
563563

564564
.. _whatsnew_1000.contributors:
565565

pandas/core/frame.py

+55-10
Original file line numberDiff line numberDiff line change
@@ -6603,16 +6603,61 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
66036603
"""
66046604
from pandas.core.apply import frame_apply
66056605

6606-
op = frame_apply(
6607-
self,
6608-
func=func,
6609-
axis=axis,
6610-
raw=raw,
6611-
result_type=result_type,
6612-
args=args,
6613-
kwds=kwds,
6614-
)
6615-
return op.get_result()
6606+
# The original apply logic, which is used for each part of the DataFrame
6607+
def partial_apply(dataframe):
6608+
op = frame_apply(
6609+
dataframe,
6610+
func=func,
6611+
axis=axis,
6612+
raw=raw,
6613+
result_type=result_type,
6614+
args=args,
6615+
kwds=kwds,
6616+
)
6617+
return op.get_result()
6618+
6619+
def get_dtype(dataframe, column):
6620+
return dataframe.dtypes.values[column]
6621+
6622+
if axis == 0 or axis == 'index':
6623+
if self.shape[1] == 0:
6624+
return partial_apply(self)
6625+
6626+
frame = self.iloc[:, [0]]
6627+
result = partial_apply(frame)
6628+
if isinstance(result, Series):
6629+
results = result.values
6630+
else:
6631+
results = result
6632+
6633+
i = 1
6634+
while i < self.shape[1]:
6635+
type = get_dtype(self, i)
6636+
j = i + 1
6637+
6638+
# While the dtype of a column is the same as that of the previous ones, they are handled together
6639+
while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)):
6640+
j += 1
6641+
frame = self.iloc[:, i: j]
6642+
i = j
6643+
result = partial_apply(frame)
6644+
6645+
if isinstance(result, Series):
6646+
results = np.append(results, result.values)
6647+
else:
6648+
for i in range(result.shape[0], results.shape[0]):
6649+
result.loc[i, :] = np.nan
6650+
for i in range(results.shape[0], result.shape[0]):
6651+
results.loc[i, :] = np.nan
6652+
results = pandas.concat([results, result], axis=1)
6653+
6654+
if isinstance(result, Series):
6655+
return Series(results, index=self.columns)
6656+
else:
6657+
return results
6658+
else:
6659+
return partial_apply(self)
6660+
66166661

66176662
def applymap(self, func):
66186663
"""

pandas/tests/frame/test_apply.py

+9
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,15 @@ def test_apply_dup_names_multi_agg(self):
689689

690690
tm.assert_frame_equal(result, expected)
691691

692+
def test_apply_get_dtype(self):
693+
# GH 28773
694+
df = DataFrame({
695+
"col_1": [1, 2, 3],
696+
"col_2": ["hi", "there", "friend"]
697+
})
698+
expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2'])
699+
tm.assert_series_equal(df.apply(lambda x: x.dtype), expected)
700+
692701

693702
class TestInferOutputShape:
694703
# the user has supplied an opaque UDF where

0 commit comments

Comments
 (0)