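Fix pandas 2.2.0 incompatibilities (chained .loc assignment, explicit sort in pd.concat, "H" -> "h" frequency alias in tests) and remove the pandas<2.2.0 pin; resolves #128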

enzbus · enzbus · commit 761ee3aa79d7 · 2024-01-22T20:23:28.000+04:00
diff --git a/cvxportfolio/data.py b/cvxportfolio/data.py
@@ -306,12 +306,12 @@ def _clean(data):
         # print(data.isnull().sum())
 
         # if low is not the lowest, set it to nan
-        data['low'].loc[
-            data['low'] > data[['open', 'high', 'close']].min(1)] = np.nan
+        data.loc[data['low'] > data[['open', 'high', 'close']].min(1),
+            'low']  = np.nan
 
         # if high is not the highest, set it to nan
-        data['high'].loc[
-            data['high'] < data[['open', 'high', 'close']].max(1)] = np.nan
+        data.loc[data['high'] < data[['open', 'high', 'close']].max(1),
+            'high'] = np.nan
 
         # print(data)
         # print(data.isnull().sum())
@@ -959,7 +959,8 @@ def _add_cash_column(self, cash_key, grace_period):
         # be misaligned (e.g., with tz-aware timestamps)
         cash_returns_per_period.name = self.cash_key
         original_returns_index = self.returns.index
-        tmp = pd.concat([self.returns, cash_returns_per_period], axis=1)
+        tmp = pd.concat(
+            [self.returns, cash_returns_per_period], sort=True, axis=1)
         tmp[cash_key] = tmp[cash_key].ffill()
         self.returns = tmp.loc[original_returns_index]
 
@@ -1082,9 +1083,8 @@ def _downsample(self, interval):
 
         # we nan-out the first non-nan element of every col
         for col in self.returns.columns[:-1]:
-            self.returns[col].loc[
-                    (~(self.returns[col].isnull())).idxmax()
-                ] = np.nan
+            self.returns.loc[
+                    (~(self.returns[col].isnull())).idxmax(), col] = np.nan
 
         # and we drop the first row, which is mostly NaNs anyway
         self.returns = self.returns.iloc[1:]
@@ -1107,9 +1107,8 @@ def _downsample(self, interval):
 
             # we nan-out the first non-nan element of every col
             for col in self.volumes.columns:
-                self.volumes[col].loc[
-                        (~(self.volumes[col].isnull())).idxmax()
-                    ] = np.nan
+                self.volumes.loc[
+                    (~(self.volumes[col].isnull())).idxmax(), col] = np.nan
 
             # and we drop the first row, which is mostly NaNs anyway
             self.volumes = self.volumes.iloc[1:]
@@ -1129,9 +1128,8 @@ def _downsample(self, interval):
 
             # we nan-out the first non-nan element of every col
             for col in self.prices.columns:
-                self.prices[col].loc[
-                        (~(self.prices[col].isnull())).idxmax()
-                    ] = np.nan
+                self.prices.loc[
+                    (~(self.prices[col].isnull())).idxmax(), col] = np.nan
 
             # and we drop the first row, which is mostly NaNs anyway
             self.prices = self.prices.iloc[1:]
diff --git a/cvxportfolio/tests/test_data.py b/cvxportfolio/tests/test_data.py
@@ -203,17 +203,17 @@ def _base_test_series(self, loader, storer):
                 name="test2"),
             pd.Series("hello",
                 pd.date_range("2020-01-01", "2020-01-02",  tz='UTC-05:00',
-                    freq="H"),
+                    freq="h"),
                 name="test3"),
             # test overwrite
             pd.Series("hello",
-                pd.date_range("2020-01-01", "2020-01-02",  tz='UTC', freq="H"),
+                pd.date_range("2020-01-01", "2020-01-02",  tz='UTC', freq="h"),
                 name="test3"),
             # test datetime conversion
             pd.Series(
                 pd.date_range("2022-01-01", "2022-01-02",  tz='UTC',
-                    freq="H"),
-                pd.date_range("2020-01-01", "2020-01-02",  tz='UTC', freq="H"),
+                    freq="h"),
+                pd.date_range("2020-01-01", "2020-01-02",  tz='UTC', freq="h"),
                 name="test4"),
             ]:
 
@@ -248,7 +248,7 @@ def _base_test_series(self, loader, storer):
     def _base_test_dataframe(self, loader, storer):
         """Test storing and retrieving of a DataFrame with datetime index."""
 
-        index = pd.date_range("2020-01-01", "2020-01-02", freq="H", tz='UTC')
+        index = pd.date_range("2020-01-01", "2020-01-02", freq="h", tz='UTC')
         data = {
             "one": range(len(index)),
             "two": np.arange(len(index)) / 19.0,
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ license = {text = "Apache License (2.0)"}
 authors = [{name = "Enzo Busseti"}, {name = "Stephen Boyd"},
     {name = "Steven Diamond"}, {name = "BlackRock Inc."}]
 maintainers = [{name = "Enzo Busseti", email = "enzo.busseti@gmail.com"}]
-dependencies = ["pandas<2.2.0", "numpy", "matplotlib", "requests", "cvxpy",
+dependencies = ["pandas", "numpy", "matplotlib", "requests", "cvxpy",
     "multiprocess"]
 
 [project.optional-dependencies]