Skip to content

Commit 44ce988

Browse files
authored
BUG: df.diff axis=1 mixed dtypes (pandas-dev#36710)
1 parent ff55413 commit 44ce988

File tree

5 files changed, with 65 additions and 25 deletions (+65 -25 lines changed)

5 files changed, with 65 additions and 25 deletions (+65 -25 lines changed)

pandas/core/frame.py

+41 -10
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@
100100
is_dict_like,
101101
is_dtype_equal,
102102
is_extension_array_dtype,
103+
is_float,
103104
is_float_dtype,
104105
is_hashable,
105106
is_integer,
@@ -4465,7 +4466,34 @@ def _replace_columnwise(
44654466
return res.__finalize__(self)
44664467

44674468
@doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
4468-
def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame:
4469+
def shift(
4470+
self, periods=1, freq=None, axis=0, fill_value=lib.no_default
4471+
) -> DataFrame:
4472+
axis = self._get_axis_number(axis)
4473+
4474+
ncols = len(self.columns)
4475+
if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
4476+
# We will infer fill_value to match the closest column
4477+
4478+
if periods > 0:
4479+
result = self.iloc[:, :-periods]
4480+
for col in range(min(ncols, abs(periods))):
4481+
# TODO(EA2D): doing this in a loop unnecessary with 2D EAs
4482+
# Define filler inside loop so we get a copy
4483+
filler = self.iloc[:, 0].shift(len(self))
4484+
result.insert(0, col, filler, allow_duplicates=True)
4485+
else:
4486+
result = self.iloc[:, -periods:]
4487+
for col in range(min(ncols, abs(periods))):
4488+
# Define filler inside loop so we get a copy
4489+
filler = self.iloc[:, -1].shift(len(self))
4490+
result.insert(
4491+
len(result.columns), col, filler, allow_duplicates=True
4492+
)
4493+
4494+
result.columns = self.columns.copy()
4495+
return result
4496+
44694497
return super().shift(
44704498
periods=periods, freq=freq, axis=axis, fill_value=fill_value
44714499
)
@@ -7215,13 +7243,13 @@ def melt(
72157243
Difference with previous column
72167244
72177245
>>> df.diff(axis=1)
7218-
a b c
7219-
0 NaN 0.0 0.0
7220-
1 NaN -1.0 3.0
7221-
2 NaN -1.0 7.0
7222-
3 NaN -1.0 13.0
7223-
4 NaN 0.0 20.0
7224-
5 NaN 2.0 28.0
7246+
a b c
7247+
0 NaN 0 0
7248+
1 NaN -1 3
7249+
2 NaN -1 7
7250+
3 NaN -1 13
7251+
4 NaN 0 20
7252+
5 NaN 2 28
72257253
72267254
Difference with 3rd previous row
72277255
@@ -7255,12 +7283,15 @@ def melt(
72557283
),
72567284
)
72577285
def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
7286+
if not isinstance(periods, int):
7287+
if not (is_float(periods) and periods.is_integer()):
7288+
raise ValueError("periods must be an integer")
7289+
periods = int(periods)
72587290

72597291
bm_axis = self._get_block_manager_axis(axis)
7260-
self._consolidate_inplace()
72617292

72627293
if bm_axis == 0 and periods != 0:
7263-
return self.T.diff(periods, axis=0).T
7294+
return self - self.shift(periods, axis=axis) # type: ignore[operator]
72647295

72657296
new_data = self._mgr.diff(n=periods, axis=bm_axis)
72667297
return self._constructor(new_data)

pandas/core/generic.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -9245,11 +9245,11 @@ def shift(
92459245
92469246
>>> df.shift(periods=1, axis="columns")
92479247
Col1 Col2 Col3
9248-
2020-01-01 NaN 10.0 13.0
9249-
2020-01-02 NaN 20.0 23.0
9250-
2020-01-03 NaN 15.0 18.0
9251-
2020-01-04 NaN 30.0 33.0
9252-
2020-01-05 NaN 45.0 48.0
9248+
2020-01-01 NaN 10 13
9249+
2020-01-02 NaN 20 23
9250+
2020-01-03 NaN 15 18
9251+
2020-01-04 NaN 30 33
9252+
2020-01-05 NaN 45 48
92539253
92549254
>>> df.shift(periods=3, fill_value=0)
92559255
Col1 Col2 Col3

pandas/core/internals/managers.py

+4
Original file line number | Diff line number | Diff line change
@@ -557,8 +557,12 @@ def interpolate(self, **kwargs) -> "BlockManager":
557557
return self.apply("interpolate", **kwargs)
558558

559559
def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
560+
if fill_value is lib.no_default:
561+
fill_value = None
562+
560563
if axis == 0 and self.ndim == 2 and self.nblocks > 1:
561564
# GH#35488 we need to watch out for multi-block cases
565+
# We only get here with fill_value not-lib.no_default
562566
ncols = self.shape[0]
563567
if periods > 0:
564568
indexer = [-1] * periods + list(range(ncols - periods))

pandas/core/reshape/melt.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -444,7 +444,7 @@ def wide_to_long(
444444
8 3 3 2.1 2.9
445445
446446
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
447-
... sep='_', suffix='\w+')
447+
... sep='_', suffix=r'\w+')
448448
>>> l
449449
... # doctest: +NORMALIZE_WHITESPACE
450450
ht

pandas/tests/frame/methods/test_diff.py

+14 -9
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,11 @@
77

88

99
class TestDataFrameDiff:
10+
def test_diff_requires_integer(self):
11+
df = pd.DataFrame(np.random.randn(2, 2))
12+
with pytest.raises(ValueError, match="periods must be an integer"):
13+
df.diff(1.5)
14+
1015
def test_diff(self, datetime_frame):
1116
the_diff = datetime_frame.diff(1)
1217

@@ -31,9 +36,7 @@ def test_diff(self, datetime_frame):
3136
df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])})
3237
df.insert(0, "x", 1)
3338
result = df.diff(axis=1)
34-
expected = pd.DataFrame(
35-
{"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}
36-
).astype("float64")
39+
expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)})
3740
tm.assert_frame_equal(result, expected)
3841

3942
@pytest.mark.parametrize("tz", [None, "UTC"])
@@ -116,19 +119,13 @@ def test_diff_axis(self):
116119
df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
117120
)
118121

119-
@pytest.mark.xfail(
120-
reason="GH#32995 needs to operate column-wise or do inference",
121-
raises=AssertionError,
122-
)
123122
def test_diff_period(self):
124123
# GH#32995 Don't pass an incorrect axis
125-
# TODO(EA2D): this bug wouldn't have happened with 2D EA
126124
pi = pd.date_range("2016-01-01", periods=3).to_period("D")
127125
df = pd.DataFrame({"A": pi})
128126

129127
result = df.diff(1, axis=1)
130128

131-
# TODO: should we make Block.diff do type inference? or maybe algos.diff?
132129
expected = (df - pd.NaT).astype(object)
133130
tm.assert_frame_equal(result, expected)
134131

@@ -141,6 +138,14 @@ def test_diff_axis1_mixed_dtypes(self):
141138
result = df.diff(axis=1)
142139
tm.assert_frame_equal(result, expected)
143140

141+
# GH#21437 mixed-float-dtypes
142+
df = pd.DataFrame(
143+
{"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
144+
)
145+
result = df.diff(axis=1)
146+
expected = pd.DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
147+
tm.assert_frame_equal(result, expected)
148+
144149
def test_diff_axis1_mixed_dtypes_large_periods(self):
145150
# GH#32995 operate column-wise when we have mixed dtypes and axis=1
146151
df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})

0 commit comments

Comments
 (0)