diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index bc3c4fbde2712..4ed0de692f9fd 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -39,7 +39,6 @@ dependencies: - numexpr - openpyxl - odfpy - - pandas-gbq - psycopg2 - pyarrow<10 - pymysql @@ -68,5 +67,6 @@ dependencies: - statsmodels - coverage - pandas-datareader + - pandas-gbq - pyyaml - py diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 786192e8c3ebd..3c7d84bb866f1 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -827,20 +827,54 @@ In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``: -.. ipython:: python - :okwarning: - - import statsmodels.formula.api as sm - - bb = pd.read_csv("data/baseball.csv", index_col="id") +.. code-block:: ipython - ( - bb.query("h > 0") - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") - .fit() - .summary() - ) + In [147]: import statsmodels.formula.api as sm + + In [148]: bb = pd.read_csv("data/baseball.csv", index_col="id") + + In [149]: ( + .....: bb.query("h > 0") + .....: .assign(ln_h=lambda df: np.log(df.h)) + .....: .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .....: .fit() + .....: .summary() + .....: ) + .....: + Out[149]: + + """ + OLS Regression Results + ============================================================================== + Dep. Variable: hr R-squared: 0.685 + Model: OLS Adj. R-squared: 0.665 + Method: Least Squares F-statistic: 34.28 + Date: Tue, 22 Nov 2022 Prob (F-statistic): 3.48e-15 + Time: 05:34:17 Log-Likelihood: -205.92 + No. Observations: 68 AIC: 421.8 + Df Residuals: 63 BIC: 432.9 + Df Model: 4 + Covariance Type: nonrobust + =============================================================================== + coef std err t P>|t| [0.025 0.975] + ------------------------------------------------------------------------------- + Intercept -8484.7720 4664.146 -1.819 0.074 -1.78e+04 835.780 + C(lg)[T.NL] -2.2736 1.325 -1.716 0.091 -4.922 0.375 + ln_h -1.3542 0.875 -1.547 0.127 -3.103 0.395 + year 4.2277 2.324 1.819 0.074 -0.417 8.872 + g 0.1841 0.029 6.258 0.000 0.125 0.243 + ============================================================================== + Omnibus: 10.875 Durbin-Watson: 1.999 + Prob(Omnibus): 0.004 Jarque-Bera (JB): 17.298 + Skew: 0.537 Prob(JB): 0.000175 + Kurtosis: 5.225 Cond. No. 1.49e+07 + ============================================================================== + + Notes: + [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. + [2] The condition number is large, 1.49e+07. This might indicate that there are + strong multicollinearity or other numerical problems. + """ The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index c6c134a383e11..ef73c4b092fc1 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -61,21 +61,55 @@ In the example above, the functions ``f``, ``g``, and ``h`` each expected the Da When the function you wish to apply takes its data anywhere other than the first argument, pass a tuple of ``(function, keyword)`` indicating where the DataFrame should flow. For example: -.. ipython:: python - :okwarning: - - import statsmodels.formula.api as sm - - bb = pd.read_csv("data/baseball.csv", index_col="id") - - # sm.ols takes (formula, data) - ( - bb.query("h > 0") - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") - .fit() - .summary() - ) +.. code-block:: ipython + + In [1]: import statsmodels.formula.api as sm + + In [2]: bb = pd.read_csv("data/baseball.csv", index_col="id") + + # sm.ols takes (formula, data) + In [3]: ( + ...: bb.query("h > 0") + ...: .assign(ln_h=lambda df: np.log(df.h)) + ...: .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + ...: .fit() + ...: .summary() + ...: ) + ...: + Out[3]: + + """ + OLS Regression Results + ============================================================================== + Dep. Variable: hr R-squared: 0.685 + Model: OLS Adj. R-squared: 0.665 + Method: Least Squares F-statistic: 34.28 + Date: Tue, 22 Nov 2022 Prob (F-statistic): 3.48e-15 + Time: 05:35:23 Log-Likelihood: -205.92 + No. Observations: 68 AIC: 421.8 + Df Residuals: 63 BIC: 432.9 + Df Model: 4 + Covariance Type: nonrobust + =============================================================================== + coef std err t P>|t| [0.025 0.975] + ------------------------------------------------------------------------------- + Intercept -8484.7720 4664.146 -1.819 0.074 -1.78e+04 835.780 + C(lg)[T.NL] -2.2736 1.325 -1.716 0.091 -4.922 0.375 + ln_h -1.3542 0.875 -1.547 0.127 -3.103 0.395 + year 4.2277 2.324 1.819 0.074 -0.417 8.872 + g 0.1841 0.029 6.258 0.000 0.125 0.243 + ============================================================================== + Omnibus: 10.875 Durbin-Watson: 1.999 + Prob(Omnibus): 0.004 Jarque-Bera (JB): 17.298 + Skew: 0.537 Prob(JB): 0.000175 + Kurtosis: 5.225 Cond. No. 1.49e+07 + ============================================================================== + + Notes: + [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. + [2] The condition number is large, 1.49e+07. This might indicate that there are + strong multicollinearity or other numerical problems. + """ The pipe method is inspired by unix pipes, which stream text through processes. More recently dplyr_ and magrittr_ have introduced the diff --git a/environment.yml b/environment.yml index d52f5d19dd750..c8deb2e96a52c 100644 --- a/environment.yml +++ b/environment.yml @@ -17,6 +17,7 @@ dependencies: - psutil - pytest-asyncio>=0.17 - boto3 + - coverage # required dependencies - python-dateutil @@ -27,12 +28,14 @@ dependencies: - beautifulsoup4 - blosc - brotlipy + - botocore - bottleneck - fastparquet - fsspec - html5lib - hypothesis - gcsfs + - ipython - jinja2 - lxml - matplotlib>=3.6.1 @@ -40,7 +43,7 @@ dependencies: - numexpr>=2.8.0 # pin for "Run checks on imported code" job - openpyxl - odfpy - - pandas-gbq + - py - psycopg2 - pyarrow<10 - pymysql @@ -60,17 +63,8 @@ dependencies: # downstream packages - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - - botocore - - cftime - - dask - - ipython - - seaborn - - scikit-learn - - statsmodels - - coverage - - pandas-datareader - - pyyaml - - py + - dask-core + - seaborn-base # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 73e5520d26bae..583474e52efb9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,18 +10,21 @@ pytest-xdist>=1.31 psutil pytest-asyncio>=0.17 boto3 +coverage python-dateutil numpy pytz beautifulsoup4 blosc brotlipy +botocore bottleneck fastparquet fsspec html5lib hypothesis gcsfs +ipython jinja2 lxml matplotlib>=3.6.1 @@ -29,7 +32,7 @@ numba>=0.53.1 numexpr>=2.8.0 openpyxl odfpy -pandas-gbq +py psycopg2-binary pyarrow<10 pymysql @@ -47,17 +50,8 @@ xlrd xlsxwriter zstandard aiobotocore<2.0.0 -botocore -cftime dask -ipython seaborn -scikit-learn -statsmodels -coverage -pandas-datareader -pyyaml -py moto flask asv diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index f25ac9a24b98b..8190104428724 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -24,8 +24,9 @@ REMAP_VERSION = {"tzdata": "2022.1"} RENAME = { "pytables": "tables", - "geopandas-base": "geopandas", "psycopg2": "psycopg2-binary", + "dask-core": "dask", + "seaborn-base": "seaborn", }