From 3c2e34bc7bc692d1ffd3550db0c3ff568d850e81 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:54:19 +0100 Subject: [PATCH 1/3] CoW: Avoid warnings in stata code --- pandas/io/stata.py | 22 +++++++++------------- pandas/tests/io/test_common.py | 2 -- pandas/tests/io/test_stata.py | 17 +++++------------ 3 files changed, 14 insertions(+), 27 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1eb8f531dc62a..481826bd1a4ba 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -345,10 +345,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: has_bad_values = False if bad_locs.any(): has_bad_values = True - # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the - # `dates` Series is used to overwrite itself in the DataFramae) - dates._reset_cacher() - dates[bad_locs] = 1.0 # Replace with NaT + dates._values[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) if fmt.startswith(("%tc", "tc")): # Delta ms relative to base @@ -467,9 +464,9 @@ def g(x: datetime) -> int: if bad_loc.any(): dates = Series(dates) if lib.is_np_dtype(dates.dtype, "M"): - dates[bad_loc] = to_datetime(stata_epoch) + dates._values[bad_loc] = to_datetime(stata_epoch) else: - dates[bad_loc] = stata_epoch + dates._values[bad_loc] = stata_epoch if fmt in ["%tc", "tc"]: d = parse_dates_safe(dates, delta=True) @@ -599,9 +596,8 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) - orig = data[col] # We need to find orig_missing before altering data below - orig_missing = orig.isna() + orig_missing = data[col].isna() if is_nullable_int: missing_loc = data[col].isna() if missing_loc.any(): @@ -1783,15 +1779,15 @@ def read( for idx in valid_dtypes: dtype = data.iloc[:, idx].dtype if dtype not in (object_type, self._dtyplist[idx]): - data.iloc[:, idx] = data.iloc[:, idx].astype(dtype) + data.isetitem(idx, data.iloc[:, idx].astype(dtype)) data = self._do_convert_missing(data, convert_missing) if convert_dates: for i, fmt in enumerate(self._fmtlist): if any(fmt.startswith(date_fmt) for date_fmt in _date_formats): - data.iloc[:, i] = _stata_elapsed_date_to_datetime_vec( - data.iloc[:, i], fmt + data.isetitem( + i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) ) if convert_categoricals and self._format_version > 108: @@ -1866,7 +1862,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra replacements[i] = replacement if replacements: for idx, value in replacements.items(): - data.iloc[:, idx] = value + data.isetitem(idx, value) return data def _insert_strls(self, data: DataFrame) -> DataFrame: @@ -1876,7 +1872,7 @@ def _insert_strls(self, data: DataFrame) -> DataFrame: if typ != "Q": continue # Wrap v_o in a string to allow uint64 values as keys on 32bit OS - data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] + data.isetitem(i, [self.GSO[str(k)] for k in data.iloc[:, i]]) return data def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame: diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 718f967f2f3d8..074033868635a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -289,8 +289,6 @@ def test_read_expands_user_home_dir( ): reader(path) - # TODO(CoW-warn) avoid warnings in the stata reader code - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize( "reader, module, path", [ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 19d81d50f5774..47ab862242146 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -32,11 +32,6 @@ read_stata, ) -# TODO(CoW-warn) avoid warnings in the stata reader code -pytestmark = pytest.mark.filterwarnings( - "ignore:Setting a value on a view:FutureWarning" -) - @pytest.fixture def mixed_frame(): @@ -140,8 +135,8 @@ def test_read_dta1(self, file, datapath): tm.assert_frame_equal(parsed, expected) - @pytest.mark.filterwarnings("always") - def test_read_dta2(self, datapath): + # @pytest.mark.filterwarnings("always") + def test_read_dta2(self, datapath, warn_copy_on_write): expected = DataFrame.from_records( [ ( @@ -183,13 +178,11 @@ def test_read_dta2(self, datapath): path2 = datapath("io", "data", "stata", "stata2_115.dta") path3 = datapath("io", "data", "stata", "stata2_117.dta") - # TODO(CoW-warn) avoid warnings in the stata reader code - # once fixed -> remove `raise_on_extra_warnings=False` again - with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): + with tm.assert_produces_warning(UserWarning): parsed_114 = self.read_dta(path1) - with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): + with tm.assert_produces_warning(UserWarning): parsed_115 = self.read_dta(path2) - with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): + with tm.assert_produces_warning(UserWarning): parsed_117 = self.read_dta(path3) # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata From 22fc1048a909d0c3eb610dfc132f063754d2a49d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:55:51 +0100 Subject: [PATCH 2/3] CoW: Avoid warnings in stata code --- pandas/tests/io/test_stata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 47ab862242146..a337e73494e03 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -135,7 +135,6 @@ def test_read_dta1(self, file, datapath): tm.assert_frame_equal(parsed, expected) - # @pytest.mark.filterwarnings("always") def test_read_dta2(self, datapath, warn_copy_on_write): expected = DataFrame.from_records( [ From a9fcd91fb6718d20e06269de77e0c00a530497fc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 10:42:08 +0100 Subject: [PATCH 3/3] Update --- pandas/io/stata.py | 1 - pandas/tests/io/test_stata.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 481826bd1a4ba..218afb734d629 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -462,7 +462,6 @@ def g(x: datetime) -> int: bad_loc = isna(dates) index = dates.index if bad_loc.any(): - dates = Series(dates) if lib.is_np_dtype(dates.dtype, "M"): dates._values[bad_loc] = to_datetime(stata_epoch) else: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a337e73494e03..6e76c8fef6bc3 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -135,7 +135,7 @@ def test_read_dta1(self, file, datapath): tm.assert_frame_equal(parsed, expected) - def test_read_dta2(self, datapath, warn_copy_on_write): + def test_read_dta2(self, datapath): expected = DataFrame.from_records( [ (