Skip to content

Commit feddb06

Browse files
refactor code to work with pandas 2.0 (#660)
* Transform positional argument into keyword argument: from pandas 2.0, `any` only accepts keyword arguments (ref pandas-dev/pandas#44896) * Change how reciprocal is computed: I have not fully understood why this solves the problem, but splitting the operation into 2 lines does not seem to work * Catch warnings from pandas.to_datetime: pandas.to_datetime now raises a warning when the column cannot be converted * check_dtype=False in datetime features tests: pandas dataframes created from Python integers get `int64` columns, but the operation tested returns `int32`, which caused issues * Use droplevel before merging: merging dfs with different column levels has been disallowed (ref pandas-dev/pandas#34862) * Change expected values for months: I am not sure why this caused an issue, maybe due to type casting? * run black * run black on tests * isort _variable_type_checks.py * Fix datetime_subtraction --------- Co-authored-by: Claudio Salvatore Arcidiacono <[email protected]>
1 parent e73772d commit feddb06

File tree

6 files changed

+34
-14
lines changed

6 files changed

+34
-14
lines changed

feature_engine/datetime/datetime_subtraction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ def _sub(self, dt_df: pd.DataFrame):
318318
new_df[new_varnames] = (
319319
dt_df[self.variables_]
320320
.sub(dt_df[reference], axis=0)
321-
.apply(lambda s: s / np.timedelta64(1, self.output_unit))
321+
.div(np.timedelta64(1, self.output_unit).astype("timedelta64[ns]"))
322322
)
323323

324324
if self.new_variables_names is not None:

feature_engine/imputation/drop_missing_data.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame:
205205
idx = pd.isnull(X[self.variables_]).mean(axis=1) >= self.threshold
206206
idx = idx[idx]
207207
else:
208-
idx = pd.isnull(X[self.variables_]).any(1)
208+
idx = pd.isnull(X[self.variables_]).any(axis=1)
209209
idx = idx[idx]
210210

211211
return X.loc[idx.index, :]

feature_engine/transformation/reciprocal.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ class ReciprocalTransformer(BaseNumericalTransformer):
9696
def __init__(
9797
self, variables: Union[None, int, str, List[Union[str, int]]] = None
9898
) -> None:
99-
10099
self.variables = _check_init_parameter_variables(variables)
101100

102101
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
@@ -152,8 +151,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
152151

153152
# transform
154153
# for some reason reciprocal does not work with integers
155-
X.loc[:, self.variables_] = X.loc[:, self.variables_].astype("float")
156-
X.loc[:, self.variables_] = np.reciprocal(X.loc[:, self.variables_])
154+
X.loc[:, self.variables_] = np.reciprocal(
155+
X.loc[:, self.variables_].astype("float")
156+
)
157157

158158
return X
159159

feature_engine/variable_handling/_variable_type_checks.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import warnings
2+
13
import pandas as pd
24
from pandas.core.dtypes.common import is_categorical_dtype as is_categorical
35
from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
@@ -6,7 +8,6 @@
68

79

810
def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
9-
1011
# check for datetime only if object cannot be cast as numeric because
1112
# if it could pd.to_datetime would convert it to datetime regardless
1213
if is_object(column):
@@ -25,15 +26,16 @@ def _is_categories_num(column: pd.Series) -> bool:
2526

2627

2728
def _is_convertible_to_dt(column: pd.Series) -> bool:
28-
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
29+
with warnings.catch_warnings():
30+
warnings.simplefilter("ignore")
31+
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
2932

3033

3134
def _is_convertible_to_num(column: pd.Series) -> bool:
3235
return is_numeric(pd.to_numeric(column, errors="ignore"))
3336

3437

3538
def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
36-
3739
# check for datetime only if object cannot be cast as numeric because
3840
# if it could pd.to_datetime would convert it to datetime regardless
3941
if is_object(column):

tests/test_datetime/test_datetime_features.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ def test_extract_datetime_features_with_default_options(
183183
df_datetime_transformed[
184184
vars_non_dt + [var + feat for var in vars_dt for feat in feat_names_default]
185185
],
186+
check_dtype=False,
186187
)
187188

188189

@@ -198,6 +199,7 @@ def test_extract_datetime_features_from_specified_variables(
198199
+ ["datetime_range", "date_obj2", "time_obj"]
199200
+ ["date_obj1" + feat for feat in feat_names_default]
200201
],
202+
check_dtype=False,
201203
)
202204

203205
# multiple datetime variables
@@ -215,6 +217,7 @@ def test_extract_datetime_features_from_specified_variables(
215217
for feat in feat_names_default
216218
]
217219
],
220+
check_dtype=False,
218221
)
219222

220223
# multiple datetime variables in different order than they appear in the df
@@ -232,6 +235,7 @@ def test_extract_datetime_features_from_specified_variables(
232235
for feat in feat_names_default
233236
]
234237
],
238+
check_dtype=False,
235239
)
236240

237241
# datetime variable is index
@@ -251,12 +255,15 @@ def test_extract_datetime_features_from_specified_variables(
251255
],
252256
axis=1,
253257
),
258+
check_dtype=False,
254259
)
255260

256261

257262
def test_extract_all_datetime_features(df_datetime, df_datetime_transformed):
258263
X = DatetimeFeatures(features_to_extract="all").fit_transform(df_datetime)
259-
pd.testing.assert_frame_equal(X, df_datetime_transformed.drop(vars_dt, axis=1))
264+
pd.testing.assert_frame_equal(
265+
X, df_datetime_transformed.drop(vars_dt, axis=1), check_dtype=False
266+
)
260267

261268

262269
def test_extract_specified_datetime_features(df_datetime, df_datetime_transformed):
@@ -269,6 +276,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
269276
vars_non_dt
270277
+ [var + "_" + feat for var in vars_dt for feat in ["semester", "week"]]
271278
],
279+
check_dtype=False,
272280
)
273281

274282
# different order than they appear in the glossary
@@ -281,6 +289,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
281289
vars_non_dt
282290
+ [var + "_" + feat for var in vars_dt for feat in ["hour", "day_of_week"]]
283291
],
292+
check_dtype=False,
284293
)
285294

286295

@@ -290,7 +299,9 @@ def test_extract_features_from_categorical_variable(
290299
cat_date = pd.DataFrame({"date_obj1": df_datetime["date_obj1"].astype("category")})
291300
X = DatetimeFeatures(variables="date_obj1").fit_transform(cat_date)
292301
pd.testing.assert_frame_equal(
293-
X, df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]]
302+
X,
303+
df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]],
304+
check_dtype=False,
294305
)
295306

296307

@@ -311,6 +322,7 @@ def test_extract_features_from_different_timezones(
311322
df_datetime_transformed[["time_obj_hour"]].apply(
312323
lambda x: x.subtract(time_zones)
313324
),
325+
check_dtype=False,
314326
)
315327
exp_err_msg = (
316328
"ValueError: variable(s) time_obj "
@@ -356,7 +368,7 @@ def test_extract_features_from_localized_tz_variables():
356368
# transform
357369
X = transformer.transform(tz_df)
358370
df_expected = pd.DataFrame({"date_var_hour": [1, 2, 2, 2, 2, 3, 3]})
359-
pd.testing.assert_frame_equal(X, df_expected)
371+
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)
360372

361373
# when utc is True
362374
transformer = DatetimeFeatures(features_to_extract=["hour"], utc=True).fit(tz_df)
@@ -372,7 +384,7 @@ def test_extract_features_from_localized_tz_variables():
372384
# transform
373385
X = transformer.transform(tz_df)
374386
df_expected = pd.DataFrame({"date_var_hour": [5, 6, 6, 6, 6, 7, 7]})
375-
pd.testing.assert_frame_equal(X, df_expected)
387+
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)
376388

377389

378390
def test_extract_features_without_dropping_original_variables(
@@ -399,6 +411,7 @@ def test_extract_features_without_dropping_original_variables(
399411
],
400412
axis=1,
401413
),
414+
check_dtype=False,
402415
)
403416

404417

@@ -435,6 +448,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
435448
pd.testing.assert_frame_equal(
436449
X,
437450
pd.DataFrame({"date_obj2_day_of_month": [10, 31, 30, 17]}),
451+
check_dtype=False,
438452
)
439453

440454
X = DatetimeFeatures(features_to_extract=["year"], yearfirst=True).fit_transform(
@@ -443,6 +457,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
443457
pd.testing.assert_frame_equal(
444458
X,
445459
pd.DataFrame({"date_obj2_year": [2010, 2009, 1995, 2004]}),
460+
check_dtype=False,
446461
)
447462

448463

@@ -457,8 +472,7 @@ def test_get_feature_names_out(df_datetime, df_datetime_transformed):
457472
transformer.get_feature_names_out(input_features=vars_dt)
458473

459474
with pytest.raises(ValueError):
460-
transformer.get_feature_names_out(input_features=["date_obj1"])\
461-
475+
transformer.get_feature_names_out(input_features=["date_obj1"])
462476
# default features from 1 variable
463477
transformer = DatetimeFeatures(variables="date_obj1")
464478
X = transformer.fit_transform(df_datetime)

tests/test_time_series/test_forecasting/test_window_features.py

+4
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,10 @@ def test_multiple_windows(df_time):
380380
X = df_time.copy()
381381
num_vars = ["ambient_temp", "module_temp", "irradiation"]
382382
tmp = X[num_vars].rolling(2).agg(["sum", "mean"]).shift(periods=15, freq="min")
383+
tmp.columns = tmp.columns.droplevel()
383384
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
384385
tmp = X[num_vars].rolling(3).agg(["sum", "mean"]).shift(periods=15, freq="min")
386+
tmp.columns = tmp.columns.droplevel()
385387
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
386388
X_tr.columns = transformer.get_feature_names_out()
387389

@@ -404,13 +406,15 @@ def test_multiple_windows(df_time):
404406
.agg(["sum", "mean"])
405407
.shift(freq="30min")
406408
)
409+
tmp.columns = tmp.columns.droplevel()
407410
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
408411
tmp = (
409412
X[["ambient_temp", "irradiation"]]
410413
.rolling(3)
411414
.agg(["sum", "mean"])
412415
.shift(freq="30min")
413416
)
417+
tmp.columns = tmp.columns.droplevel()
414418
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
415419
X_tr.columns = transformer.get_feature_names_out()
416420

0 commit comments

Comments
 (0)