Skip to content

BUG: df.diff axis=1 mixed dtypes #36710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 7, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_float,
is_float_dtype,
is_hashable,
is_integer,
Expand Down Expand Up @@ -4458,7 +4459,34 @@ def _replace_columnwise(
return res.__finalize__(self)

@doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame:
def shift(
self, periods=1, freq=None, axis=0, fill_value=lib.no_default
) -> DataFrame:
axis = self._get_axis_number(axis)

ncols = len(self.columns)
if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
# We will infer fill_value to match the closest column

if periods > 0:
result = self.iloc[:, :-periods]
for col in range(min(ncols, abs(periods))):
# TODO(EA2D): doing this in a loop unnecessary with 2D EAs
# Define filler inside loop so we get a copy
filler = self.iloc[:, 0].shift(len(self))
result.insert(0, col, filler, allow_duplicates=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't these inserts in-place? Do we really want that? (Or does .iloc make an implicit copy the way you have it? I don't remember exactly.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is in-place on `result`, which we haven't returned to the user yet.

else:
result = self.iloc[:, -periods:]
for col in range(min(ncols, abs(periods))):
# Define filler inside loop so we get a copy
filler = self.iloc[:, -1].shift(len(self))
result.insert(
len(result.columns), col, filler, allow_duplicates=True
)

result.columns = self.columns.copy()
return result

return super().shift(
periods=periods, freq=freq, axis=axis, fill_value=fill_value
)
Expand Down Expand Up @@ -7208,13 +7236,13 @@ def melt(
Difference with previous column

>>> df.diff(axis=1)
a b c
0 NaN 0.0 0.0
1 NaN -1.0 3.0
2 NaN -1.0 7.0
3 NaN -1.0 13.0
4 NaN 0.0 20.0
5 NaN 2.0 28.0
a b c
0 NaN 0 0
1 NaN -1 3
2 NaN -1 7
3 NaN -1 13
4 NaN 0 20
5 NaN 2 28

Difference with 3rd previous row

Expand Down Expand Up @@ -7248,12 +7276,15 @@ def melt(
),
)
def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
if not isinstance(periods, int):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use is_integer

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do

if not (is_float(periods) and periods.is_integer()):
raise ValueError("periods must be an integer")
periods = int(periods)

bm_axis = self._get_block_manager_axis(axis)
self._consolidate_inplace()

if bm_axis == 0 and periods != 0:
return self.T.diff(periods, axis=0).T
return self - self.shift(periods, axis=axis) # type: ignore[operator]

new_data = self._mgr.diff(n=periods, axis=bm_axis)
return self._constructor(new_data)
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9244,11 +9244,11 @@ def shift(

>>> df.shift(periods=1, axis="columns")
Col1 Col2 Col3
2020-01-01 NaN 10.0 13.0
2020-01-02 NaN 20.0 23.0
2020-01-03 NaN 15.0 18.0
2020-01-04 NaN 30.0 33.0
2020-01-05 NaN 45.0 48.0
2020-01-01 NaN 10 13
2020-01-02 NaN 20 23
2020-01-03 NaN 15 18
2020-01-04 NaN 30 33
2020-01-05 NaN 45 48

>>> df.shift(periods=3, fill_value=0)
Col1 Col2 Col3
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,8 +557,12 @@ def interpolate(self, **kwargs) -> "BlockManager":
return self.apply("interpolate", **kwargs)

def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
if fill_value is lib.no_default:
fill_value = None

if axis == 0 and self.ndim == 2 and self.nblocks > 1:
# GH#35488 we need to watch out for multi-block cases
# We only get here with fill_value not-lib.no_default
ncols = self.shape[0]
if periods > 0:
indexer = [-1] * periods + list(range(ncols - periods))
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ def wide_to_long(
8 3 3 2.1 2.9

>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
... sep='_', suffix='\w+')
... sep='_', suffix=r'\w+')
>>> l
... # doctest: +NORMALIZE_WHITESPACE
ht
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def str_count(arr, pat, flags=0):
Escape ``'$'`` to find the literal dollar sign.

>>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
>>> s.str.count('\\$')
>>> s.str.count(r'\\$')
0 1
1 0
2 1
Expand Down Expand Up @@ -410,7 +410,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):

Returning any digit using regular expression.

>>> s1.str.contains('\\d', regex=True)
>>> s1.str.contains(r'\\d', regex=True)
0 False
1 False
2 False
Expand Down
23 changes: 14 additions & 9 deletions pandas/tests/frame/methods/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@


class TestDataFrameDiff:
def test_diff_requires_integer(self):
df = pd.DataFrame(np.random.randn(2, 2))
with pytest.raises(ValueError, match="periods must be an integer"):
df.diff(1.5)

def test_diff(self, datetime_frame):
the_diff = datetime_frame.diff(1)

Expand All @@ -31,9 +36,7 @@ def test_diff(self, datetime_frame):
df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])})
df.insert(0, "x", 1)
result = df.diff(axis=1)
expected = pd.DataFrame(
{"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}
).astype("float64")
expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)})
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("tz", [None, "UTC"])
Expand Down Expand Up @@ -116,19 +119,13 @@ def test_diff_axis(self):
df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
)

@pytest.mark.xfail(
reason="GH#32995 needs to operate column-wise or do inference",
raises=AssertionError,
)
def test_diff_period(self):
# GH#32995 Don't pass an incorrect axis
# TODO(EA2D): this bug wouldn't have happened with 2D EA
pi = pd.date_range("2016-01-01", periods=3).to_period("D")
df = pd.DataFrame({"A": pi})

result = df.diff(1, axis=1)

# TODO: should we make Block.diff do type inference? or maybe algos.diff?
expected = (df - pd.NaT).astype(object)
tm.assert_frame_equal(result, expected)

Expand All @@ -141,6 +138,14 @@ def test_diff_axis1_mixed_dtypes(self):
result = df.diff(axis=1)
tm.assert_frame_equal(result, expected)

# GH#21437 mixed-float-dtypes
df = pd.DataFrame(
{"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
)
result = df.diff(axis=1)
expected = pd.DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
tm.assert_frame_equal(result, expected)

def test_diff_axis1_mixed_dtypes_large_periods(self):
# GH#32995 operate column-wise when we have mixed dtypes and axis=1
df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
Expand Down