diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9b2540a1ce043..79a5d6cb458e0 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -100,6 +100,7 @@
     is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
+    is_float,
     is_float_dtype,
     is_hashable,
     is_integer,
@@ -4458,7 +4459,50 @@ def _replace_columnwise(
         return res.__finalize__(self)
 
     @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
-    def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame:
+    def shift(
+        self, periods=1, freq=None, axis=0, fill_value=lib.no_default
+    ) -> DataFrame:
+        axis = self._get_axis_number(axis)
+
+        ncols = len(self.columns)
+        if (
+            axis == 1
+            and periods != 0
+            and freq is None
+            and fill_value is lib.no_default
+            and ncols > 0
+        ):
+            # No fill_value was given, so fill the vacated columns with
+            # all-NA copies of the nearest edge column; each column then
+            # keeps its own dtype instead of going through a single
+            # frame-wide fill. A non-None freq is left to the parent
+            # implementation below.
+
+            # Temporary label for the inserted columns: an existing label is
+            # always valid for this columns Index (an integer position may
+            # not be), and every label is overwritten once filling is done.
+            label = self.columns[0]
+            nfill = min(ncols, abs(periods))
+
+            if periods > 0:
+                result = self.iloc[:, :-periods]
+                for _ in range(nfill):
+                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
+                    # Define filler inside loop so we get a copy
+                    filler = self.iloc[:, 0].shift(len(self))
+                    result.insert(0, label, filler, allow_duplicates=True)
+            else:
+                result = self.iloc[:, -periods:]
+                for _ in range(nfill):
+                    # Define filler inside loop so we get a copy
+                    filler = self.iloc[:, -1].shift(len(self))
+                    result.insert(
+                        len(result.columns), label, filler, allow_duplicates=True
+                    )
+
+            result.columns = self.columns.copy()
+            return result
+
         return super().shift(
             periods=periods, freq=freq, axis=axis, fill_value=fill_value
         )
@@ -7208,13 +7252,13 @@ def melt(
         Difference with previous column
 
         >>> df.diff(axis=1)
-            a    b     c
-        0 NaN  0.0   0.0
-        1 NaN -1.0   3.0
-        2 NaN -1.0   7.0
-        3 NaN -1.0  13.0
-        4 NaN  0.0  20.0
-        5 NaN  2.0  28.0
+            a  b   c
+        0 NaN  0   0
+        1 NaN -1   3
+        2 NaN -1   7
+        3 NaN -1  13
+        4 NaN  0  20
+        5 NaN  2  28
 
         Difference with 3rd previous row
 
@@ -7248,12 +7292,20 @@ def melt(
         ),
     )
     def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
+        # Accept builtin ints, NumPy integer scalars (e.g. np.int64) and
+        # integral floats such as 2.0; anything else is rejected.
+        if not (isinstance(periods, int) or is_integer(periods)):
+            if not (is_float(periods) and periods.is_integer()):
+                raise ValueError("periods must be an integer")
+        # Normalize to a builtin int before shifting/differencing.
+        periods = int(periods)
 
         bm_axis = self._get_block_manager_axis(axis)
-        self._consolidate_inplace()
 
         if bm_axis == 0 and periods != 0:
-            return self.T.diff(periods, axis=0).T
+            # Column-wise diff: subtract the column-shifted frame so that
+            # mixed-dtype frames are not transposed into a single dtype.
+            return self - self.shift(periods, axis=axis)  # type: ignore[operator]
 
         new_data = self._mgr.diff(n=periods, axis=bm_axis)
         return self._constructor(new_data)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6f0aa70625c1d..60cf2a8998fc6 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9244,11 +9244,11 @@ def shift(
 
         >>> df.shift(periods=1, axis="columns")
                     Col1  Col2  Col3
-        2020-01-01   NaN  10.0  13.0
-        2020-01-02   NaN  20.0  23.0
-        2020-01-03   NaN  15.0  18.0
-        2020-01-04   NaN  30.0  33.0
-        2020-01-05   NaN  45.0  48.0
+        2020-01-01   NaN    10    13
+        2020-01-02   NaN    20    23
+        2020-01-03   NaN    15    18
+        2020-01-04   NaN    30    33
+        2020-01-05   NaN    45    48
 
         >>> df.shift(periods=3, fill_value=0)
                     Col1  Col2  Col3
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index f2480adce89b4..a7a9a77bab3bc 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -557,8 +557,12 @@ def interpolate(self, **kwargs) -> "BlockManager":
         return self.apply("interpolate", **kwargs)
 
     def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
+        if fill_value is lib.no_default:
+            fill_value = None
+
         if axis == 0 and self.ndim == 2 and self.nblocks > 1:
             # GH#35488 we need to watch out for multi-block cases
+            # We only get here when fill_value was not lib.no_default
             ncols = self.shape[0]
             if periods > 0:
                 indexer = [-1] * periods + list(range(ncols - periods))
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index 83a5f43c2a340..e2161013c0166 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -444,7 +444,7 @@ def wide_to_long(
     8      3      3     2.1     2.9
 
     >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
-    ...                     sep='_', suffix='\w+')
+    ...                     sep='_', suffix=r'\w+')
     >>> l
     ... # doctest: +NORMALIZE_WHITESPACE
                       ht
diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py
index 0486fb2d588b6..42586c14092f2 100644
--- a/pandas/tests/frame/methods/test_diff.py
+++ b/pandas/tests/frame/methods/test_diff.py
@@ -7,6 +7,18 @@
 
 
 class TestDataFrameDiff:
+    def test_diff_requires_integer(self):
+        df = pd.DataFrame(np.random.randn(2, 2))
+        with pytest.raises(ValueError, match="periods must be an integer"):
+            df.diff(1.5)
+
+    def test_diff_numpy_integer_periods(self):
+        # NumPy integer scalars are valid periods, same as builtin ints
+        df = pd.DataFrame(np.arange(6, dtype="float64").reshape(3, 2))
+        result = df.diff(np.int64(1))
+        expected = df.diff(1)
+        tm.assert_frame_equal(result, expected)
+
     def test_diff(self, datetime_frame):
         the_diff = datetime_frame.diff(1)
 
@@ -31,9 +43,7 @@ def test_diff(self, datetime_frame):
         df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])})
         df.insert(0, "x", 1)
         result = df.diff(axis=1)
-        expected = pd.DataFrame(
-            {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}
-        ).astype("float64")
+        expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)})
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("tz", [None, "UTC"])
@@ -116,19 +126,13 @@ def test_diff_axis(self):
             df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
         )
 
-    @pytest.mark.xfail(
-        reason="GH#32995 needs to operate column-wise or do inference",
-        raises=AssertionError,
-    )
     def test_diff_period(self):
         # GH#32995 Don't pass an incorrect axis
-        # TODO(EA2D): this bug wouldn't have happened with 2D EA
         pi = pd.date_range("2016-01-01", periods=3).to_period("D")
         df = pd.DataFrame({"A": pi})
 
         result = df.diff(1, axis=1)
 
-        # TODO: should we make Block.diff do type inference? or maybe algos.diff?
         expected = (df - pd.NaT).astype(object)
         tm.assert_frame_equal(result, expected)
 
@@ -141,6 +145,14 @@ def test_diff_axis1_mixed_dtypes(self):
         result = df.diff(axis=1)
         tm.assert_frame_equal(result, expected)
 
+        # GH#21437 mixed-float-dtypes
+        df = pd.DataFrame(
+            {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
+        )
+        result = df.diff(axis=1)
+        expected = pd.DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
+        tm.assert_frame_equal(result, expected)
+
     def test_diff_axis1_mixed_dtypes_large_periods(self):
         # GH#32995 operate column-wise when we have mixed dtypes and axis=1
         df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})