BUG: fix DataFrame.apply returning wrong result when dealing with dtype (pandas-dev#28773)

Mateusz Górski · Mateusz Górski · commit d515b5be54c5 · 2019-12-17T13:13:11.000+01:00
DataFrame.apply sometimes returned a wrong result when the passed
function depended on dtypes. The cause was that apply retrieved
DataFrame.values for the whole DataFrame and applied the function
to it: the values are represented by a single NumPy array, which has
one dtype for all the data inside. As a result, objects in the DataFrame
were sometimes treated as if they had one common dtype. It is worth noting
that the problem only occurred when the function was applied to columns.

The implemented solution "cuts" the DataFrame by columns and applies
the function to each part as if it were a whole DataFrame. After that,
all partial results are concatenated into the final result for the whole
DataFrame. The "cuts" are made as follows: the first column is taken, and
then we iterate through the following columns, adding them to the same cut
for as long as their dtype is identical to that of the first column. The
process is then repeated for the rest of the DataFrame.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -559,7 +559,7 @@ Other
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
   years after 2030 (now goes up to 2200) (:issue:`27790`)
-
+- Bug in :meth:`DataFrame.apply` that returned a wrong result in some cases when the passed function depended on the dtype of the data (:issue:`28773`)
 
 .. _whatsnew_1000.contributors:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6603,16 +6603,61 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         """
         from pandas.core.apply import frame_apply
 
-        op = frame_apply(
-            self,
-            func=func,
-            axis=axis,
-            raw=raw,
-            result_type=result_type,
-            args=args,
-            kwds=kwds,
-        )
-        return op.get_result()
+        # Original apply logic; it is run separately on each part of the DataFrame
+        def partial_apply(dataframe):
+            op = frame_apply(
+                dataframe,
+                func=func,
+                axis=axis,
+                raw=raw,
+                result_type=result_type,
+                args=args,
+                kwds=kwds,
+            )
+            return op.get_result()
+
+        def get_dtype(dataframe, column):
+            return dataframe.dtypes.values[column]
+
+        if axis == 0 or axis == 'index':
+            if self.shape[1] == 0:
+                return partial_apply(self)
+
+            frame = self.iloc[:, [0]]
+            result = partial_apply(frame)
+            if isinstance(result, Series):
+                results = result.values
+            else:
+                results = result
+
+            i = 1
+            while i < self.shape[1]:
+                type = get_dtype(self, i)
+                j = i + 1
+
+                # Extend the group while the following columns have the same dtype
+                while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)):
+                    j += 1
+                frame = self.iloc[:, i: j]
+                i = j
+                result = partial_apply(frame)
+
+                if isinstance(result, Series):
+                    results = np.append(results, result.values)
+                else:
+                    for i in range(result.shape[0], results.shape[0]):
+                        result.loc[i, :] = np.nan
+                    for i in range(results.shape[0], result.shape[0]):
+                        results.loc[i, :] = np.nan
+                    results = pandas.concat([results, result], axis=1)
+
+            if isinstance(result, Series):
+                return Series(results, index=self.columns)
+            else:
+                return results
+        else:
+            return partial_apply(self)
+
 
     def applymap(self, func):
         """
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -689,6 +689,15 @@ def test_apply_dup_names_multi_agg(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_apply_get_dtype(self):
+        # GH 28773: func must see each column's own dtype
+        df = DataFrame({
+            "col_1": [1, 2, 3],
+            "col_2": ["hi", "there", "friend"]
+        })
+        expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2'])
+        tm.assert_series_equal(df.apply(lambda x: x.dtype), expected)
+
 
 class TestInferOutputShape:
     # the user has supplied an opaque UDF where