diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6744f578560c..ca0c75f9de94f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,7 +65,7 @@ jobs: if: always() - name: Testing docstring validation script - run: pytest --capture=no --strict-markers scripts + run: pytest scripts if: always() - name: Running benchmarks diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index ba5a0a1fd0909..a5aef7825c770 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -78,7 +78,7 @@ jobs: uses: ./.github/actions/build_pandas - name: Test - run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml -s --cov=pandas --cov-report=xml pandas/tests/io + run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile --cov=pandas --cov-report=xml pandas/tests/io if: always() - name: Build Version diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml new file mode 100644 index 0000000000000..2643dc5ec656e --- /dev/null +++ b/.github/workflows/python-dev.yml @@ -0,0 +1,70 @@ +name: Python Dev + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + name: actions-310-dev + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v2 + with: + python-version: '3.10-dev' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install git+https://github.com/numpy/numpy.git + pip install git+https://github.com/pytest-dev/pytest.git + pip install git+https://github.com/nedbat/coveragepy.git + pip install cython python-dateutil pytz hypothesis pytest-xdist + pip list + + - name: Build Pandas + run: | + python setup.py build_ext -q -j2 + python -m pip install -e . 
--no-build-isolation --no-use-pep517 + + - name: Build Version + run: | + python -c "import pandas; pandas.show_versions();" + + - name: Test with pytest + run: | + coverage run -m pytest -m 'not slow and not network and not clipboard' pandas + continue-on-error: true + + - name: Publish test results + uses: actions/upload-artifact@master + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Print skipped tests + run: | + python ci/print_skipped.py + + - name: Report Coverage + run: | + coverage report -m + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + flags: unittests + name: codecov-pandas + fail_ci_if_error: true diff --git a/.gitignore b/.gitignore index b682d93efbd04..2c337be60e94e 100644 --- a/.gitignore +++ b/.gitignore @@ -104,6 +104,7 @@ asv_bench/env/ asv_bench/html/ asv_bench/results/ asv_bench/pandas/ +test-data.xml # Documentation generated files # ################################# diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2f46190ef5eb7..5b11490479088 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: exclude: ^pandas/_libs/src/(klib|headers)/ args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.1 + rev: 3.9.0 hooks: - id: flake8 additional_dependencies: @@ -75,7 +75,7 @@ repos: hooks: - id: yesqa additional_dependencies: - - flake8==3.9.1 + - flake8==3.9.0 - flake8-comprehensions==3.1.0 - flake8-bugbear==21.3.2 - pandas-dev-flaker==0.2.0 diff --git a/LICENSE b/LICENSE index 76954a5a339ab..a0cc369f725b8 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2020, Open source contributors. +Copyright (c) 2011-2021, Open source contributors. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 65167e6467fd5..760da36a30075 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -563,6 +563,14 @@ def time_frame_nunique(self): self.df.nunique() +class SeriesNuniqueWithNan: + def setup(self): + self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + + def time_series_nunique_nan(self): + self.ser.nunique() + + class Duplicated: def setup(self): n = 1 << 20 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b4b20553ec460..27761ccd0d917 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -505,6 +505,34 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) +class CumminMax: + param_names = ["dtype", "method"] + params = [ + ["float64", "int64", "Float64", "Int64"], + ["cummin", "cummax"], + ] + + def setup(self, dtype, method): + N = 500_000 + vals = np.random.randint(-10, 10, (N, 5)) + null_vals = vals.astype(float, copy=True) + null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan + df = DataFrame(vals, columns=list("abcde"), dtype=dtype) + null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + keys = np.random.randint(0, 100, size=N) + df["key"] = keys + null_df["key"] = keys + self.df = df + self.null_df = null_df + + def time_frame_transform(self, dtype, method): + self.df.groupby("key").transform(method) + + def time_frame_transform_many_nulls(self, dtype, method): + self.null_df.groupby("key").transform(method) + + class RankWithTies: # GH 21237 param_names = ["dtype", "tie_method"] diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 6c0ca6fac6ec3..e4369d67ca67e 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -17,19 +17,19 @@ def setup(self, cols, rows): def time_apply_render(self, cols, rows): self._style_apply() - self.st.render() + self.st._render_html() def peakmem_apply_render(self, cols, rows): self._style_apply() - self.st.render() + self.st._render_html() def time_classes_render(self, cols, rows): self._style_classes() - self.st.render() + self.st._render_html() def peakmem_classes_render(self, cols, rows): self._style_classes() - self.st.render() + self.st._render_html() def _style_apply(self): def _apply_func(s): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 76257e1b40f1a..45a9053954569 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -50,91 +50,126 @@ def peakmem_cat_frame_construction(self, dtype): class Methods: - def setup(self): - self.s = Series(tm.makeStringIndex(10 ** 5)) + params = ["str", "string", "arrow_string"] + param_names = ["dtype"] + + def setup(self, dtype): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - def time_center(self): + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError + + def time_center(self, dtype): self.s.str.center(100) - def time_count(self): + def time_count(self, dtype): self.s.str.count("A") - def time_endswith(self): + def time_endswith(self, dtype): self.s.str.endswith("A") - def time_extract(self): + def time_extract(self, dtype): with warnings.catch_warnings(record=True): 
self.s.str.extract("(\\w*)A(\\w*)") - def time_findall(self): + def time_findall(self, dtype): self.s.str.findall("[A-Z]+") - def time_find(self): + def time_find(self, dtype): self.s.str.find("[A-Z]+") - def time_rfind(self): + def time_rfind(self, dtype): self.s.str.rfind("[A-Z]+") - def time_get(self): + def time_get(self, dtype): self.s.str.get(0) - def time_len(self): + def time_len(self, dtype): self.s.str.len() - def time_join(self): + def time_join(self, dtype): self.s.str.join(" ") - def time_match(self): + def time_match(self, dtype): self.s.str.match("A") - def time_normalize(self): + def time_normalize(self, dtype): self.s.str.normalize("NFC") - def time_pad(self): + def time_pad(self, dtype): self.s.str.pad(100, side="both") - def time_partition(self): + def time_partition(self, dtype): self.s.str.partition("A") - def time_rpartition(self): + def time_rpartition(self, dtype): self.s.str.rpartition("A") - def time_replace(self): + def time_replace(self, dtype): self.s.str.replace("A", "\x01\x01") - def time_translate(self): + def time_translate(self, dtype): self.s.str.translate({"A": "\x01\x01"}) - def time_slice(self): + def time_slice(self, dtype): self.s.str.slice(5, 15, 2) - def time_startswith(self): + def time_startswith(self, dtype): self.s.str.startswith("A") - def time_strip(self): + def time_strip(self, dtype): self.s.str.strip("A") - def time_rstrip(self): + def time_rstrip(self, dtype): self.s.str.rstrip("A") - def time_lstrip(self): + def time_lstrip(self, dtype): self.s.str.lstrip("A") - def time_title(self): + def time_title(self, dtype): self.s.str.title() - def time_upper(self): + def time_upper(self, dtype): self.s.str.upper() - def time_lower(self): + def time_lower(self, dtype): self.s.str.lower() - def time_wrap(self): + def time_wrap(self, dtype): self.s.str.wrap(10) - def time_zfill(self): + def time_zfill(self, dtype): self.s.str.zfill(10) + def time_isalnum(self, dtype): + self.s.str.isalnum() + + def time_isalpha(self, dtype): + self.s.str.isalpha() + + def time_isdecimal(self, dtype): + self.s.str.isdecimal() + + def time_isdigit(self, dtype): + self.s.str.isdigit() + + def time_islower(self, dtype): + self.s.str.islower() + + def time_isnumeric(self, dtype): + self.s.str.isnumeric() + + def time_isspace(self, dtype): + self.s.str.isspace() + + def time_istitle(self, dtype): + self.s.str.istitle() + + def time_isupper(self, dtype): + self.s.str.isupper() + class Repeat: @@ -178,13 +213,18 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains: - params = [True, False] - param_names = ["regex"] + params = (["str", "string", "arrow_string"], [True, False]) + param_names = ["dtype", "regex"] + + def setup(self, dtype, regex): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - def setup(self, regex): - self.s = Series(tm.makeStringIndex(10 ** 5)) + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError - def time_contains(self, regex): + def time_contains(self, dtype, regex): self.s.str.contains("A", regex=regex) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d4b6c0d6ff09d..149e10b48933d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -106,84 +106,43 @@ fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - # Individual files - - MSG='Doctests accessor.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/accessor.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests aggregation.py' ; echo $MSG - 
pytest -q --doctest-modules pandas/core/aggregation.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests base.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/base.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests construction.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/construction.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests frame.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/frame.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests generic.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/generic.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests series.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/series.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests strings.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/strings/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests sql.py' ; echo $MSG - pytest -q --doctest-modules pandas/io/sql.py - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Directories - - MSG='Doctests arrays'; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests computation' ; echo $MSG - pytest -q --doctest-modules pandas/core/computation/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests dtypes'; echo $MSG - pytest -q --doctest-modules pandas/core/dtypes/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests groupby' ; echo $MSG - pytest -q --doctest-modules pandas/core/groupby/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests indexes' ; echo $MSG - pytest -q --doctest-modules pandas/core/indexes/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests ops' ; echo $MSG - pytest -q --doctest-modules pandas/core/ops/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests reshape' ; echo $MSG - pytest -q --doctest-modules pandas/core/reshape/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests tools' ; echo $MSG - pytest -q --doctest-modules pandas/core/tools/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests window' ; echo $MSG - pytest -q --doctest-modules pandas/core/window/ - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Doctests tseries' ; echo $MSG - pytest -q --doctest-modules pandas/tseries/ + MSG='Doctests for individual files' ; echo $MSG + pytest -q --doctest-modules \ + pandas/core/accessor.py \ + pandas/core/aggregation.py \ + pandas/core/algorithms.py \ + pandas/core/base.py \ + pandas/core/construction.py \ + pandas/core/frame.py \ + pandas/core/generic.py \ + pandas/core/indexers.py \ + pandas/core/nanops.py \ + pandas/core/series.py \ + pandas/io/sql.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests for directories' ; echo $MSG + pytest -q --doctest-modules \ + pandas/_libs/ \ + pandas/api/ \ + pandas/arrays/ \ + pandas/compat/ \ + pandas/core/array_algos/ \ + pandas/core/arrays/ \ + pandas/core/computation/ \ + pandas/core/dtypes/ \ + pandas/core/groupby/ \ + pandas/core/indexes/ \ + pandas/core/ops/ \ + pandas/core/reshape/ \ + pandas/core/strings/ \ + pandas/core/tools/ \ + pandas/core/window/ \ + pandas/errors/ \ + pandas/io/clipboard/ \ + pandas/io/parsers/ \ + pandas/io/sas/ \ + pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml index 8052156858a32..3237cf9770220 100644 --- a/ci/deps/actions-37-minimum_versions.yaml +++ b/ci/deps/actions-37-minimum_versions.yaml @@ -6,7 +6,7 @@ dependencies: # tools - 
cython=0.29.21 - - pytest=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 61f431256dd4a..f29830e9b3e79 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - botocore>=1.11 - fsspec>=0.7.4 - - numpy + - numpy=1.19 - python-dateutil - nomkl - pyarrow=0.15.1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index f5e3420b8c9b3..261d6364cb5e1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -19,7 +19,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" +PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then # GH#37455 windows py38 build appears to be running out of memory @@ -30,7 +30,7 @@ fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" -PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" +PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" echo $PYTEST_AM_CMD sh -c "$PYTEST_AM_CMD" diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b9afbe387799e..f4a09e0daa750 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -110,8 +110,8 @@ version control to allow many people to work together on the project. Some great resources for learning Git: * the `GitHub help pages `_. -* the `NumPy's documentation `_. -* Matthew Brett's `Pydagogue `_. +* the `NumPy documentation `_. +* Matthew Brett's `Pydagogue `_. Getting started with Git ------------------------ diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 56aa734deddd6..d53d0556dca04 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -164,6 +164,21 @@ A good implementation for Python users is `has2k1/plotnine `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Lux `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Lux `__ is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas: + +.. code:: python + + import lux + import pandas as pd + + df = pd.read_csv("data.csv") + df # discover interesting insights! + +By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. 
Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allow users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code. + `Qtpandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b6351ac2232ff..16beb00d201b7 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -195,7 +195,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 5.0.1 and `Hypothesis +`__ >= 6.0 and `Hypothesis `__ >= 3.58, then run: :: @@ -362,6 +362,21 @@ pyarrow 0.15.0 Parquet, ORC, and feather reading / pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= +.. _install.warn_orc: + +.. warning:: + + * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda. + The following is a summary of the environment in which :func:`~pandas.read_orc` can work. + + ========================= ================== ============================================================= + System Conda PyPI + ========================= ================== ============================================================= + Linux Successful Failed(pyarrow==3.0 Successful) + macOS Successful Failed + Windows Failed Failed + ========================= ================== ============================================================= + Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 180f833a2753d..1de978b195382 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -178,7 +178,7 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin` For ``DataFrames``, likewise, ``in`` applies to the column axis, testing for membership in the list of column names. -.. _udf-mutation: +.. _gotchas.udf-mutation: Mutating with User Defined Function (UDF) methods ------------------------------------------------- diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index afb2e72cbff07..3f596388ca226 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -739,6 +739,26 @@ optimized Cython implementations: Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). +.. _groupby.aggregate.udfs: + +Aggregations with User-Defined Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Users can also provide their own functions for custom aggregations. When aggregating +with a User-Defined Function (UDF), the UDF should not mutate the provided ``Series``, see +:ref:`gotchas.udf-mutation` for more information. + +.. ipython:: python + + animals.groupby("kind")[["height"]].agg(lambda x: set(x)) + +The resulting dtype will reflect that of the aggregating function. 
If the results from different groups have +different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. + +.. ipython:: python + + animals.groupby("kind")[["height"]].agg(lambda x: x.astype(int).sum()) + .. _groupby.transform: Transformation @@ -759,7 +779,11 @@ as the one being grouped. The transform function must: * (Optionally) operates on the entire group chunk. If this is supported, a fast path is used starting from the *second* chunk. -For example, suppose we wished to standardize the data within each group: +Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the +transformation function. If the results from different groups have different dtypes, then +a common dtype will be determined in the same way as ``DataFrame`` construction. + +Suppose we wished to standardize the data within each group: .. ipython:: python @@ -1065,13 +1089,16 @@ that is itself a series, and possibly upcast the result to a DataFrame: s s.apply(f) - .. note:: ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. +Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the +apply function. If the results from different groups have different dtypes, then +a common dtype will be determined in the same way as ``DataFrame`` construction. + Numba Accelerated Routines -------------------------- diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b7a6037a9715..5148bb87b0eb0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5443,6 +5443,11 @@ Similar to the :ref:`parquet ` format, the `ORC Format `__ library. +.. warning:: + + * It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. + * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies `. + .. _io.sql: SQL queries diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 1fcaac1a91d09..278eb907102ed 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -456,6 +456,10 @@ io.hdf.dropna_table True drop ALL nan rows when appe io.parquet.engine None The engine to use as a default for parquet reading and writing. If None then try 'pyarrow' and 'fastparquet' +io.sql.engine None The engine to use as a default for + sql reading and writing, with SQLAlchemy + as a higher level interface. If None + then try 'sqlalchemy' mode.chained_assignment warn Controls ``SettingWithCopyWarning``: 'raise', 'warn', or None. Raise an exception, warn, or no action if diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 765b2929d3014..86696cc909764 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1006,7 +1006,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We expect certain styling functions to be common enough that we've included a few \"built-in\" to the `Styler`, so you don't have to write them yourself." + "Some styling functions are common enough that we've \"built them in\" to the `Styler`, so you don't have to write them and apply them yourself. 
The current list of such functions is:\n", + "\n", + " - [.highlight_null][nullfunc]: for use with identifying missing data. \n", + " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremeties in data.\n", + " - [.highlight_between][betweenfunc] and [.highlight_quantile][quantilefunc]: for use with identifying classes within data.\n", + " - [.background_gradient][bgfunc]: a flexible method for highlighting cells based or their, or other, values on a numeric scale.\n", + " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n", + " \n", + "The individual documentation on each function often gives more examples of their arguments.\n", + "\n", + "[nullfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_null.rst\n", + "[minfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_min.rst\n", + "[maxfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_max.rst\n", + "[betweenfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_between.rst\n", + "[quantilefunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_quantile.rst\n", + "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n", + "[barfunc]: ../reference/api/pandas.io.formats.style.Styler.bar.rst" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Highlight Null" ] }, { @@ -1017,14 +1040,14 @@ "source": [ "df2.iloc[0,2] = np.nan\n", "df2.iloc[4,3] = np.nan\n", - "df2.loc[:4].style.highlight_null(null_color='red')" + "df2.loc[:4].style.highlight_null(null_color='yellow')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." + "### Highlight Min or Max" ] }, { @@ -1033,17 +1056,15 @@ "metadata": {}, "outputs": [], "source": [ - "import seaborn as sns\n", - "cm = sns.light_palette(\"green\", as_cmap=True)\n", - "\n", - "df2.style.background_gradient(cmap=cm)" + "df2.loc[:4].style.highlight_max(axis=1, props='color:white; font-weight:bold; background-color:darkblue;')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`Styler.background_gradient` takes the keyword arguments `low` and `high`. Roughly speaking these extend the range of your data by `low` and `high` percent so that when we convert the colors, the colormap's entire range isn't used. This is useful so that you can actually read the text still." + "### Highlight Between\n", + "This method accepts ranges as float, or NumPy arrays or Series provided the indexes match." 
] }, { @@ -1052,8 +1073,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Uses the full color range\n", - "df2.loc[:4].style.background_gradient(cmap='viridis')" + "left = pd.Series([1.0, 0.0, 1.0], index=[\"A\", \"B\", \"D\"])\n", + "df2.loc[:4].style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:purple;')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Highlight Quantile\n", + "Useful for detecting the highest or lowest percentile values" ] }, { @@ -1062,17 +1091,21 @@ "metadata": {}, "outputs": [], "source": [ - "# Compress the color range\n", - "df2.loc[:4].style\\\n", - " .background_gradient(cmap='viridis', low=.5, high=0)\\\n", - " .highlight_null('red')" + "df2.loc[:4].style.highlight_quantile(q_left=0.85, axis=None, color='yellow')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Background Gradient" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "There's also `.highlight_min` and `.highlight_max`, which is almost identical to the user defined version we created above, and also a `.highlight_null` method. " + "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." ] }, { @@ -1081,7 +1114,19 @@ "metadata": {}, "outputs": [], "source": [ - "df2.loc[:4].style.highlight_max(axis=0)" + "import seaborn as sns\n", + "cm = sns.light_palette(\"green\", as_cmap=True)\n", + "\n", + "df2.style.background_gradient(cmap=cm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[.background_gradient][bgfunc] has a number of keyword arguments to customise the gradients and colors. See its documentation.\n", + "\n", + "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst" ] }, { diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 16f9284802407..60e146b2212eb 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) -- +- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) - .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a286d152f03c3..bf63a51204f5c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -801,10 +801,13 @@ Plotting - Bug in :func:`scatter_matrix` raising when 2d ``ax`` argument passed (:issue:`16253`) - Prevent warnings when matplotlib's ``constrained_layout`` is enabled (:issue:`25261`) +- Bug in :meth:`BoxPlot._validate_color_args` In box plot when 'dark_background' theme was selected caps or min/max markers for the plot was not visible (:issue:`40769`) - Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``yerr`` while others didn't (partial fix of :issue:`39522`) - Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``secondary_y`` and others use ``legend=False`` (:issue:`40044`) + + Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` with :class:`PeriodDtype` columns incorrectly casting results too aggressively (:issue:`38254`) @@ -839,6 +842,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`) - Bug in :meth:`.GroupBy.rank` with nullable dtypes incorrectly raising ``TypeError`` (:issue:`41010`) + Reshaping ^^^^^^^^^ - Bug in :func:`merge` raising error when performing an inner join with partial index and ``right_index`` when no overlap between indices (:issue:`33814`) diff --git a/environment.yml b/environment.yml index 0d03ad8e0a46a..146bf6db08d8b 100644 --- a/environment.yml +++ b/environment.yml @@ -20,7 +20,7 @@ dependencies: # code checks - black=20.8b1 - cpplint - - flake8=3.9.1 + - flake8=3.9.0 - flake8-bugbear=21.3.2 # used by flake8, find likely bugs - flake8-comprehensions=3.1.0 # used by flake8, linting of unnecessary comprehensions - isort>=5.2.1 # check that imports are in the right order diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8fb307150a48f..3fa92ce2229c3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1277,6 +1277,7 @@ def group_min(groupby_t[:, ::1] out, @cython.wraparound(False) cdef group_cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, const intp_t[:] labels, int ngroups, bint is_datetimelike, @@ -1290,6 +1291,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Array to store cummin/max in. values : np.ndarray[groupby_t, ndim=2] Values to take cummin/max of. + mask : np.ndarray[bool] or None + If not None, indices represent missing values, + otherwise the mask will not be used labels : np.ndarray[np.intp] Labels to group by. 
ngroups : int @@ -1307,11 +1311,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval - ndarray[groupby_t, ndim=2] accum + groupby_t[:, ::1] accum intp_t lab + bint val_is_nan, use_mask + + use_mask = mask is not None N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + accum = np.empty((ngroups, K), dtype=values.dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: @@ -1326,11 +1333,29 @@ cdef group_cummin_max(groupby_t[:, ::1] out, if lab < 0: continue for j in range(K): - val = values[i, j] + val_is_nan = False + + if use_mask: + if mask[i, j]: + + # `out` does not need to be set since it + # will be masked anyway + val_is_nan = True + else: + + # If using the mask, we can avoid grabbing the + # value unless necessary + val = values[i, j] - if _treat_as_na(val, is_datetimelike): - out[i, j] = val + # Otherwise, `out` must be set accordingly if the + # value is missing else: + val = values[i, j] + if _treat_as_na(val, is_datetimelike): + val_is_nan = True + out[i, j] = val + + if not val_is_nan: mval = accum[lab, j] if compute_max: if val > mval: @@ -1347,9 +1372,18 @@ def group_cummin(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike) -> None: + bint is_datetimelike, + uint8_t[:, ::1] mask=None) -> None: """See group_cummin_max.__doc__""" - group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False) + group_cummin_max( + out, + values, + mask, + labels, + ngroups, + is_datetimelike, + compute_max=False + ) @cython.boundscheck(False) @@ -1358,6 +1392,15 @@ def group_cummax(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike) -> None: + bint is_datetimelike, + uint8_t[:, ::1] mask=None) -> None: """See group_cummin_max.__doc__""" - group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True) + group_cummin_max( + out, + values, + mask, + labels, + ngroups, + is_datetimelike, + compute_max=True + ) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1e2a336f12444..4566f22be2c36 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -163,7 +163,7 @@ cdef class Int64Factorizer: @cython.wraparound(False) @cython.boundscheck(False) -def unique_label_indices(const int64_t[:] labels): +def unique_label_indices(const int64_t[:] labels) -> ndarray: """ Indices of the first occurrences of the unique labels *excluding* -1. equivalent to: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4dc5e7516db7e..a25867c4a3b0c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -687,7 +687,10 @@ cdef class {{name}}HashTable(HashTable): {{if dtype == 'int64'}} @cython.boundscheck(False) - def get_labels_groupby(self, const {{dtype}}_t[:] values): + def get_labels_groupby( + self, const {{dtype}}_t[:] values + ) -> tuple[ndarray, ndarray]: + # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]] cdef: Py_ssize_t i, n = len(values) intp_t[:] labels diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 7966fe8d4f045..bdbaa05138072 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -3,9 +3,10 @@ cdef class NDFrameIndexerBase: A base class for _NDFrameIndexer for fast instantiation and attribute access. 
""" cdef public: - object obj, name, _ndim + str name + object obj, _ndim - def __init__(self, name, obj): + def __init__(self, name: str, obj): self.obj = obj self.name = name self._ndim = None diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index f3436e9c7afba..a46a1747d1d8d 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -79,5 +79,3 @@ class BlockManager: _blklocs: np.ndarray def __init__(self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=True): ... - - def get_slice(self: T, slobj: slice, axis: int=...) -> T: ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index f3bc70ad8a26b..3fd580684a6a2 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -515,7 +515,7 @@ cdef class NumpyBlock(SharedBlock): self.values = values # @final # not useful in cython, but we _would_ annotate with @final - cpdef NumpyBlock getitem_block_index(self, slice slicer): + def getitem_block_index(self, slicer: slice) -> NumpyBlock: """ Perform __getitem__-like specialized to slicing along index. @@ -610,30 +610,3 @@ cdef class BlockManager: self._rebuild_blknos_and_blklocs() # ------------------------------------------------------------------- - # Indexing - - cdef BlockManager _get_index_slice(self, slobj): - cdef: - SharedBlock blk, nb - - nbs = [] - for blk in self.blocks: - nb = blk.getitem_block_index(slobj) - nbs.append(nb) - - new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - return type(self)(tuple(nbs), new_axes, verify_integrity=False) - - def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: - - if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) - elif axis == 1: - return self._get_index_slice(slobj) - else: - raise IndexError("Requested axis not found in manager") - - new_axes = list(self.axes) - new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - - return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index bbcee479aeb5a..31b43cdb28d9d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -83,7 +83,6 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; @@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) { if (!mgr) { return 0; } - int ret = (get_attr_length(mgr, "blocks") <= 1); + int ret; + if (PyObject_HasAttrString(mgr, "blocks")) { + ret = (get_attr_length(mgr, "blocks") <= 1); + } else { + ret = 0; + } Py_DECREF(mgr); return ret; @@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *blocks, *block, *values, *tmp; - PyArrayObject *locs; + PyObject *obj, *values, *arrays, *array; PdBlockContext *blkCtxt; NpyArrContext *npyarr; Py_ssize_t i; - NpyIter *iter; - NpyIter_IterNextFunc *iternext; - npy_int64 **dataptr; - npy_int64 colIdx; - npy_intp idx; obj = (PyObject *)_obj; @@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { if (blkCtxt->ncols == 0) { blkCtxt->npyCtxts = NULL; - blkCtxt->cindices = NULL; GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; @@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { return; } - blkCtxt->cindices = 
PyObject_Malloc(sizeof(int) * blkCtxt->ncols); - if (!blkCtxt->cindices) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - blocks = get_sub_attr(obj, "_mgr", "blocks"); - if (!blocks) { + arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + if (!arrays) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; - } else if (!PyTuple_Check(blocks)) { - PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); - goto BLKRET; } - // force transpose so each NpyArrContext strides down its column - GET_TC(tc)->transpose = 1; - - for (i = 0; i < PyObject_Length(blocks); i++) { - block = PyTuple_GET_ITEM(blocks, i); - if (!block) { + for (i = 0; i < PyObject_Length(arrays); i++) { + array = PyList_GET_ITEM(arrays, i); + if (!array) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); - if (!tmp) { + // ensure we have a numpy array (i.e. np.asarray) + values = PyObject_CallMethod(array, "__array__", NULL); + if ((!values) || (!PyArray_CheckExact(values))) { + // Didn't get a numpy array ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - values = PyArray_Transpose((PyArrayObject *)tmp, NULL); - Py_DECREF(tmp); - if (!values) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - - locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); - if (!locs) { - Py_DECREF(values); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + goto ARR_RET; } - iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, NULL); - if (!iter) { - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter); - do { - colIdx = **dataptr; - idx = NpyIter_GetIterIndex(iter); + GET_TC(tc)->newObj = values; - blkCtxt->cindices[colIdx] = idx; + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); + npyarr = GET_TC(tc)->npyarr; - // Reference freed in Pdblock_iterend - Py_INCREF(values); - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; - - // set the dataptr to our desired column and initialise - if (npyarr != NULL) { - npyarr->dataptr += npyarr->stride * idx; - NpyArr_iterNext(obj, tc); - } - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[colIdx] = npyarr; - GET_TC(tc)->newObj = NULL; - } while (iternext(iter)); + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - NpyIter_Deallocate(iter); - Py_DECREF(values); - Py_DECREF(locs); + blkCtxt->npyCtxts[i] = npyarr; + GET_TC(tc)->newObj = NULL; } GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + goto ARR_RET; -BLKRET: - Py_DECREF(blocks); +ARR_RET: + Py_DECREF(arrays); } void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { @@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { if (blkCtxt->npyCtxts) { PyObject_Free(blkCtxt->npyCtxts); } - if (blkCtxt->cindices) { - PyObject_Free(blkCtxt->cindices); - } PyObject_Free(blkCtxt); } } diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 
0c598beb6ad16..60bfaa38b495f 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -277,6 +277,17 @@ cdef class _NaT(datetime): See Also -------- DatetimeIndex.to_numpy : Similar method for DatetimeIndex. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.to_numpy() + numpy.datetime64('2020-03-14T15:32:52.192548651') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.to_numpy() + numpy.datetime64('NaT') """ return self.to_datetime64() @@ -414,6 +425,17 @@ class NaTType(_NaT): Returns ------- str + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.month_name() + 'March' + + Analogous for ``pd.NaT``: + + >>> pd.NaT.month_name() + nan """, ) day_name = _make_nan_func( @@ -429,6 +451,17 @@ class NaTType(_NaT): Returns ------- str + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.day_name() + 'Saturday' + + Analogous for ``pd.NaT``: + + >>> pd.NaT.day_name() + nan """, ) # _nat_methods @@ -467,6 +500,12 @@ class NaTType(_NaT): Format string to convert Timestamp to string. See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.strftime('%Y-%m-%d %X') + '2020-03-14 15:32:52' """, ) @@ -485,6 +524,11 @@ class NaTType(_NaT): Timestamp.utcfromtimestamp(ts) Construct a naive UTC datetime from a POSIX timestamp. + + Examples + -------- + >>> pd.Timestamp.fromtimestamp(1584199972) + Timestamp('2020-03-14 15:32:52') """, ) fromtimestamp = _make_error_func( @@ -493,6 +537,13 @@ class NaTType(_NaT): Timestamp.fromtimestamp(ts) Transform timestamp[, tz] to tz's local time from POSIX timestamp. + + Examples + -------- + >>> pd.Timestamp.utcfromtimestamp(1584199972) + Timestamp('2020-03-14 15:32:52') + + Note that the output may change depending on your local time. """, ) combine = _make_error_func( @@ -501,6 +552,12 @@ class NaTType(_NaT): Timestamp.combine(date, time) Combine date, time into datetime with same date and time fields. + + Examples + -------- + >>> from datetime import date, time + >>> pd.Timestamp.combine(date(2020, 3, 14), time(15, 30, 15)) + Timestamp('2020-03-14 15:30:15') """, ) utcnow = _make_error_func( @@ -509,10 +566,26 @@ class NaTType(_NaT): Timestamp.utcnow() Return a new Timestamp representing UTC day and time. + + Examples + -------- + >>> pd.Timestamp.utcnow() + Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC') """, ) - timestamp = _make_error_func("timestamp", "Return POSIX timestamp as float.") + timestamp = _make_error_func( + "timestamp", + """ + Return POSIX timestamp as float. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') + >>> ts.timestamp() + 1584199972.192548 + """ + ) # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or # return NaT create functions that raise, for binding to NaTType @@ -535,6 +608,29 @@ class NaTType(_NaT): ------ TypeError If Timestamp is tz-naive. 
+ + Examples + -------- + Create a timestamp object with UTC timezone: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC') + + Change to Tokyo timezone: + + >>> ts.tz_convert(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Can also use ``astimezone``: + + >>> ts.astimezone(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.tz_convert(tz='Asia/Tokyo') + NaT """, ) fromordinal = _make_error_func( @@ -553,6 +649,11 @@ class NaTType(_NaT): Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + + Examples + -------- + >>> pd.Timestamp.fromordinal(737425) + Timestamp('2020-01-01 00:00:00') """, ) @@ -563,6 +664,17 @@ class NaTType(_NaT): Convert a Timestamp object to a native Python datetime object. If warn=True, issue a warning if nanoseconds is nonzero. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') + >>> ts.to_pydatetime() + datetime.datetime(2020, 3, 14, 15, 32, 52, 192548) + + Analogous for ``pd.NaT``: + + >>> pd.NaT.to_pydatetime() + NaT """, ) @@ -578,6 +690,16 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. + + Examples + -------- + >>> pd.Timestamp.now() + Timestamp('2020-11-16 22:06:16.378782') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.now() + NaT """, ) today = _make_nat_func( @@ -593,6 +715,16 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. + + Examples + -------- + >>> pd.Timestamp.today() + Timestamp('2020-11-16 22:37:39.969883') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.today() + NaT """, ) round = _make_nat_func( @@ -636,6 +768,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be rounded using multiple frequency units: + + >>> ts.round(freq='H') # hour + Timestamp('2020-03-14 16:00:00') + + >>> ts.round(freq='T') # minute + Timestamp('2020-03-14 15:33:00') + + >>> ts.round(freq='S') # seconds + Timestamp('2020-03-14 15:32:52') + + >>> ts.round(freq='L') # milliseconds + Timestamp('2020-03-14 15:32:52.193000') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + + >>> ts.round(freq='5T') + Timestamp('2020-03-14 15:35:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.round(freq='1H30T') + Timestamp('2020-03-14 15:00:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.round() + NaT """, ) floor = _make_nat_func( @@ -675,6 +842,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be floored using multiple frequency units: + + >>> ts.floor(freq='H') # hour + Timestamp('2020-03-14 15:00:00') + + >>> ts.floor(freq='T') # minute + Timestamp('2020-03-14 15:32:00') + + >>> ts.floor(freq='S') # seconds + Timestamp('2020-03-14 15:32:52') + + >>> ts.floor(freq='N') # nanoseconds + Timestamp('2020-03-14 15:32:52.192548651') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 
5 minutes): + + >>> ts.floor(freq='5T') + Timestamp('2020-03-14 15:30:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.floor(freq='1H30T') + Timestamp('2020-03-14 15:00:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.floor() + NaT """, ) ceil = _make_nat_func( @@ -714,6 +916,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be ceiled using multiple frequency units: + + >>> ts.ceil(freq='H') # hour + Timestamp('2020-03-14 16:00:00') + + >>> ts.ceil(freq='T') # minute + Timestamp('2020-03-14 15:33:00') + + >>> ts.ceil(freq='S') # seconds + Timestamp('2020-03-14 15:32:53') + + >>> ts.ceil(freq='U') # microseconds + Timestamp('2020-03-14 15:32:52.192549') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + + >>> ts.ceil(freq='5T') + Timestamp('2020-03-14 15:35:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.ceil(freq='1H30T') + Timestamp('2020-03-14 16:30:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.ceil() + NaT """, ) @@ -736,6 +973,29 @@ timedelta}, default 'raise' ------ TypeError If Timestamp is tz-naive. + + Examples + -------- + Create a timestamp object with UTC timezone: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC') + + Change to Tokyo timezone: + + >>> ts.tz_convert(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Can also use ``astimezone``: + + >>> ts.astimezone(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.tz_convert(tz='Asia/Tokyo') + NaT """, ) tz_localize = _make_nat_func( @@ -791,6 +1051,24 @@ default 'raise' ------ TypeError If the Timestamp is tz-aware and tz is not None. 
+ + Examples + -------- + Create a naive timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651') + + Add 'Europe/Stockholm' as timezone: + + >>> ts.tz_localize(tz='Europe/Stockholm') + Timestamp('2020-03-14 15:32:52.192548651+0100', tz='Europe/Stockholm') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.tz_localize() + NaT """, ) replace = _make_nat_func( @@ -814,6 +1092,30 @@ default 'raise' Returns ------- Timestamp with fields replaced + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC') + + Replace year and the hour: + + >>> ts.replace(year=1999, hour=10) + Timestamp('1999-03-14 10:32:52.192548651+0000', tz='UTC') + + Replace timezone (not a conversion): + + >>> import pytz + >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + NaT """, ) diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi new file mode 100644 index 0000000000000..8728b700a1f6d --- /dev/null +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -0,0 +1,205 @@ +from datetime import ( + date as _date, + datetime, + time as _time, + timedelta, + tzinfo as _tzinfo, +) +import sys +from time import struct_time +from typing import ( + ClassVar, + Optional, + Type, + TypeVar, + overload, +) + +import numpy as np + +from pandas._libs.tslibs import ( + NaT, + NaTType, + Period, + Timedelta, +) + +_S = TypeVar("_S") + + +def integer_op_not_supported(obj) -> None: ... + + +class Timestamp(datetime): + min: ClassVar[Timestamp] + max: ClassVar[Timestamp] + + resolution: ClassVar[Timedelta] + value: int # np.int64 + + # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") + def __new__( # type: ignore[misc] + cls: Type[_S], + ts_input: int | np.integer | float | str | _date | datetime | np.datetime64 = ..., + freq=..., + tz: str | _tzinfo | None | int= ..., + unit=..., + year: int | None = ..., + month: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + microsecond: int | None = ..., + nanosecond: int | None = ..., + tzinfo: _tzinfo | None = ..., + *, + fold: int | None= ..., + ) -> _S | NaTType: ... + + @property + def year(self) -> int: ... + @property + def month(self) -> int: ... + @property + def day(self) -> int: ... + @property + def hour(self) -> int: ... + @property + def minute(self) -> int: ... + @property + def second(self) -> int: ... + @property + def microsecond(self) -> int: ... + @property + def tzinfo(self) -> Optional[_tzinfo]: ... + @property + def tz(self) -> Optional[_tzinfo]: ... + + @property + def fold(self) -> int: ... + + @classmethod + def fromtimestamp(cls: Type[_S], t: float, tz: Optional[_tzinfo] = ...) -> _S: ... + @classmethod + def utcfromtimestamp(cls: Type[_S], t: float) -> _S: ... + @classmethod + def today(cls: Type[_S]) -> _S: ... + @classmethod + def fromordinal(cls: Type[_S], n: int) -> _S: ... + + if sys.version_info >= (3, 8): + @classmethod + def now(cls: Type[_S], tz: _tzinfo | str | None = ...) -> _S: ... + else: + @overload + @classmethod + def now(cls: Type[_S], tz: None = ...) -> _S: ... + @overload + @classmethod + def now(cls, tz: _tzinfo) -> datetime: ... 
+ + @classmethod + def utcnow(cls: Type[_S]) -> _S: ... + @classmethod + def combine(cls, date: _date, time: _time, tzinfo: Optional[_tzinfo] = ...) -> datetime: ... + + @classmethod + def fromisoformat(cls: Type[_S], date_string: str) -> _S: ... + + def strftime(self, fmt: str) -> str: ... + def __format__(self, fmt: str) -> str: ... + + def toordinal(self) -> int: ... + def timetuple(self) -> struct_time: ... + + def timestamp(self) -> float: ... + + def utctimetuple(self) -> struct_time: ... + def date(self) -> _date: ... + def time(self) -> _time: ... + def timetz(self) -> _time: ... + + def replace( + self, + year: int = ..., + month: int = ..., + day: int = ..., + hour: int = ..., + minute: int = ..., + second: int = ..., + microsecond: int = ..., + tzinfo: Optional[_tzinfo] = ..., + *, + fold: int = ..., + ) -> datetime: ... + + if sys.version_info >= (3, 8): + def astimezone(self: _S, tz: Optional[_tzinfo] = ...) -> _S: ... + else: + def astimezone(self, tz: Optional[_tzinfo] = ...) -> datetime: ... + + def ctime(self) -> str: ... + def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... + + @classmethod + def strptime(cls, date_string: str, format: str) -> datetime: ... + + def utcoffset(self) -> Optional[timedelta]: ... + def tzname(self) -> Optional[str]: ... + def dst(self) -> Optional[timedelta]: ... + + def __le__(self, other: datetime) -> bool: ... # type: ignore + def __lt__(self, other: datetime) -> bool: ... # type: ignore + def __ge__(self, other: datetime) -> bool: ... # type: ignore + def __gt__(self, other: datetime) -> bool: ... # type: ignore + if sys.version_info >= (3, 8): + def __add__(self: _S, other: timedelta) -> _S: ... + def __radd__(self: _S, other: timedelta) -> _S: ... + else: + def __add__(self, other: timedelta) -> datetime: ... + def __radd__(self, other: timedelta) -> datetime: ... + @overload # type: ignore + def __sub__(self, other: datetime) -> timedelta: ... + @overload + def __sub__(self, other: timedelta) -> datetime: ... + + def __hash__(self) -> int: ... + def weekday(self) -> int: ... + def isoweekday(self) -> int: ... + def isocalendar(self) -> tuple[int, int, int]: ... + + @property + def is_leap_year(self) -> bool: ... + @property + def is_month_start(self) -> bool: ... + @property + def is_quarter_start(self) -> bool: ... + @property + def is_year_start(self) -> bool: ... + @property + def is_month_end(self) -> bool: ... + @property + def is_quarter_end(self) -> bool: ... + @property + def is_year_end(self) -> bool: ... + + def to_pydatetime(self, warn: bool = ...) -> datetime: ... + def to_datetime64(self) -> np.datetime64: ... + def to_period(self, freq) -> Period: ... + def to_julian_date(self) -> np.float64: ... + + @property + def asm8(self) -> np.datetime64: ... + + def tz_convert(self: _S, tz) -> _S: ... + + # TODO: could return NaT? + def tz_localize(self: _S, tz, ambiguous: str = ..., nonexistent: str = ...) -> _S: ... + + def normalize(self: _S) -> _S: ... + + # TODO: round/floor/ceil could return NaT? + def round(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... + def floor(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... + def ceil(self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...) -> _S: ... 
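The new `timestamps.pyi` stub above annotates `Timestamp.__new__` as returning `_S | NaTType`; a minimal doctest-style sketch of why that union is needed, assuming standard pandas runtime behavior (the reprs shown are illustrative, not part of the patch):

>>> import pandas as pd
>>> pd.Timestamp("2020-03-14")   # regular input returns a Timestamp instance (_S)
Timestamp('2020-03-14 00:00:00')
>>> pd.Timestamp(None)           # missing input returns NaT, hence the NaTType branch
NaT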
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 60ffa3dd46989..a4f764878d19e 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -386,6 +386,16 @@ cdef class _Timestamp(ABCTimestamp): def is_month_start(self) -> bool: """ Return True if date is first day of month. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_month_start + False + + >>> ts = pd.Timestamp(2020, 1, 1) + >>> ts.is_month_start + True """ if self.freq is None: # fast-path for non-business frequencies @@ -396,6 +406,16 @@ cdef class _Timestamp(ABCTimestamp): def is_month_end(self) -> bool: """ Return True if date is last day of month. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_month_end + False + + >>> ts = pd.Timestamp(2020, 12, 31) + >>> ts.is_month_end + True """ if self.freq is None: # fast-path for non-business frequencies @@ -406,6 +426,16 @@ cdef class _Timestamp(ABCTimestamp): def is_quarter_start(self) -> bool: """ Return True if date is first day of the quarter. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_quarter_start + False + + >>> ts = pd.Timestamp(2020, 4, 1) + >>> ts.is_quarter_start + True """ if self.freq is None: # fast-path for non-business frequencies @@ -416,6 +446,16 @@ cdef class _Timestamp(ABCTimestamp): def is_quarter_end(self) -> bool: """ Return True if date is last day of the quarter. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_quarter_end + False + + >>> ts = pd.Timestamp(2020, 3, 31) + >>> ts.is_quarter_end + True """ if self.freq is None: # fast-path for non-business frequencies @@ -426,6 +466,16 @@ cdef class _Timestamp(ABCTimestamp): def is_year_start(self) -> bool: """ Return True if date is first day of the year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_year_start + False + + >>> ts = pd.Timestamp(2020, 1, 1) + >>> ts.is_year_start + True """ if self.freq is None: # fast-path for non-business frequencies @@ -436,6 +486,16 @@ cdef class _Timestamp(ABCTimestamp): def is_year_end(self) -> bool: """ Return True if date is last day of the year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_year_end + False + + >>> ts = pd.Timestamp(2020, 12, 31) + >>> ts.is_year_end + True """ if self.freq is None: # fast-path for non-business frequencies @@ -464,6 +524,17 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- str + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.day_name() + 'Saturday' + + Analogous for ``pd.NaT``: + + >>> pd.NaT.day_name() + nan """ return self._get_date_name_field("day_name", locale) @@ -479,6 +550,17 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- str + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.month_name() + 'March' + + Analogous for ``pd.NaT``: + + >>> pd.NaT.month_name() + nan """ return self._get_date_name_field("month_name", locale) @@ -486,6 +568,12 @@ cdef class _Timestamp(ABCTimestamp): def is_leap_year(self) -> bool: """ Return True if year is a leap year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.is_leap_year + True """ return bool(ccalendar.is_leapyear(self.year)) @@ -493,6 +581,12 @@ cdef class _Timestamp(ABCTimestamp): def day_of_week(self) -> int: """ Return day of the week. 
+ + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.day_of_week + 5 """ return self.weekday() @@ -500,6 +594,12 @@ cdef class _Timestamp(ABCTimestamp): def day_of_year(self) -> int: """ Return the day of the year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.day_of_year + 74 """ return ccalendar.get_day_of_year(self.year, self.month, self.day) @@ -507,6 +607,12 @@ cdef class _Timestamp(ABCTimestamp): def quarter(self) -> int: """ Return the quarter of the year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.quarter + 1 """ return ((self.month - 1) // 3) + 1 @@ -514,6 +620,12 @@ cdef class _Timestamp(ABCTimestamp): def week(self) -> int: """ Return the week number of the year. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.week + 11 """ return ccalendar.get_week_of_year(self.year, self.month, self.day) @@ -521,6 +633,12 @@ cdef class _Timestamp(ABCTimestamp): def days_in_month(self) -> int: """ Return the number of days in the month. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14) + >>> ts.days_in_month + 31 """ return ccalendar.get_days_in_month(self.year, self.month) @@ -530,6 +648,12 @@ cdef class _Timestamp(ABCTimestamp): def normalize(self) -> "Timestamp": """ Normalize Timestamp to midnight, preserving tz information. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14, 15, 30) + >>> ts.normalize() + Timestamp('2020-03-14 00:00:00') """ cdef: local_val = self._maybe_convert_value_to_local() @@ -639,11 +763,25 @@ cdef class _Timestamp(ABCTimestamp): def asm8(self) -> np.datetime64: """ Return numpy datetime64 format in nanoseconds. + + Examples + -------- + >>> ts = pd.Timestamp(2020, 3, 14, 15) + >>> ts.asm8 + numpy.datetime64('2020-03-14T15:00:00.000000000') """ return np.datetime64(self.value, 'ns') def timestamp(self): - """Return POSIX timestamp as float.""" + """ + Return POSIX timestamp as float. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') + >>> ts.timestamp() + 1584199972.192548 + """ # GH 17329 # Note: Naive timestamps will not match datetime.stdlib return round(self.value / 1e9, 6) @@ -653,6 +791,17 @@ cdef class _Timestamp(ABCTimestamp): Convert a Timestamp object to a native Python datetime object. If warn=True, issue a warning if nanoseconds is nonzero. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') + >>> ts.to_pydatetime() + datetime.datetime(2020, 3, 14, 15, 32, 52, 192548) + + Analogous for ``pd.NaT``: + + >>> pd.NaT.to_pydatetime() + NaT """ if self.nanosecond != 0 and warn: warnings.warn("Discarding nonzero nanoseconds in conversion", @@ -685,12 +834,38 @@ cdef class _Timestamp(ABCTimestamp): See Also -------- DatetimeIndex.to_numpy : Similar method for DatetimeIndex. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.to_numpy() + numpy.datetime64('2020-03-14T15:32:52.192548651') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.to_numpy() + numpy.datetime64('NaT') """ return self.to_datetime64() def to_period(self, freq=None): """ Return an period of which this timestamp is an observation. 
+ + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.to_period(freq='Y') # Year end frequency + Period('2020', 'A-DEC') + + >>> ts.to_period(freq='M') # Month end frequency + Period('2020-03', 'M') + + >>> ts.to_period(freq='W') # Weekly frequency + Period('2020-03-09/2020-03-15', 'W-SUN') + + >>> ts.to_period(freq='Q') # Quarter end frequency + Period('2020Q1', 'Q-DEC') """ from pandas import Period @@ -800,6 +975,11 @@ class Timestamp(_Timestamp): Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + + Examples + -------- + >>> pd.Timestamp.fromordinal(737425) + Timestamp('2020-01-01 00:00:00') """ return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz) @@ -816,6 +996,16 @@ class Timestamp(_Timestamp): ---------- tz : str or timezone object, default None Timezone to localize to. + + Examples + -------- + >>> pd.Timestamp.now() + Timestamp('2020-11-16 22:06:16.378782') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.now() + NaT """ if isinstance(tz, str): tz = maybe_get_tz(tz) @@ -834,6 +1024,16 @@ class Timestamp(_Timestamp): ---------- tz : str or timezone object, default None Timezone to localize to. + + Examples + -------- + >>> pd.Timestamp.today() + Timestamp('2020-11-16 22:37:39.969883') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.today() + NaT """ return cls.now(tz) @@ -843,6 +1043,11 @@ class Timestamp(_Timestamp): Timestamp.utcnow() Return a new Timestamp representing UTC day and time. + + Examples + -------- + >>> pd.Timestamp.utcnow() + Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC') """ return cls.now(UTC) @@ -852,6 +1057,11 @@ class Timestamp(_Timestamp): Timestamp.utcfromtimestamp(ts) Construct a naive UTC datetime from a POSIX timestamp. + + Examples + -------- + >>> pd.Timestamp.utcfromtimestamp(1584199972) + Timestamp('2020-03-14 15:32:52') """ return cls(datetime.utcfromtimestamp(ts)) @@ -861,6 +1071,13 @@ class Timestamp(_Timestamp): Timestamp.fromtimestamp(ts) Transform timestamp[, tz] to tz's local time from POSIX timestamp. + + Examples + -------- + >>> pd.Timestamp.fromtimestamp(1584199972) + Timestamp('2020-03-14 15:32:52') + + Note that the output may change depending on your local time. """ return cls(datetime.fromtimestamp(ts)) @@ -877,6 +1094,12 @@ class Timestamp(_Timestamp): Format string to convert Timestamp to string. See strftime documentation for more information on the format string: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts.strftime('%Y-%m-%d %X') + '2020-03-14 15:32:52' """ return datetime.strftime(self, format) @@ -899,6 +1122,12 @@ class Timestamp(_Timestamp): Timestamp.combine(date, time) Combine date, time into datetime with same date and time fields. 
+ + Examples + -------- + >>> from datetime import date, time + >>> pd.Timestamp.combine(date(2020, 3, 14), time(15, 30, 15)) + Timestamp('2020-03-14 15:30:15') """ return cls(datetime.combine(date, time)) @@ -1113,6 +1342,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be rounded using multiple frequency units: + + >>> ts.round(freq='H') # hour + Timestamp('2020-03-14 16:00:00') + + >>> ts.round(freq='T') # minute + Timestamp('2020-03-14 15:33:00') + + >>> ts.round(freq='S') # seconds + Timestamp('2020-03-14 15:32:52') + + >>> ts.round(freq='L') # milliseconds + Timestamp('2020-03-14 15:32:52.193000') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + + >>> ts.round(freq='5T') + Timestamp('2020-03-14 15:35:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.round(freq='1H30T') + Timestamp('2020-03-14 15:00:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.round() + NaT """ return self._round( freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent @@ -1154,6 +1418,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be floored using multiple frequency units: + + >>> ts.floor(freq='H') # hour + Timestamp('2020-03-14 15:00:00') + + >>> ts.floor(freq='T') # minute + Timestamp('2020-03-14 15:32:00') + + >>> ts.floor(freq='S') # seconds + Timestamp('2020-03-14 15:32:52') + + >>> ts.floor(freq='N') # nanoseconds + Timestamp('2020-03-14 15:32:52.192548651') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + + >>> ts.floor(freq='5T') + Timestamp('2020-03-14 15:30:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.floor(freq='1H30T') + Timestamp('2020-03-14 15:00:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.floor() + NaT """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -1193,6 +1492,41 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + + A timestamp can be ceiled using multiple frequency units: + + >>> ts.ceil(freq='H') # hour + Timestamp('2020-03-14 16:00:00') + + >>> ts.ceil(freq='T') # minute + Timestamp('2020-03-14 15:33:00') + + >>> ts.ceil(freq='S') # seconds + Timestamp('2020-03-14 15:32:53') + + >>> ts.ceil(freq='U') # microseconds + Timestamp('2020-03-14 15:32:52.192549') + + ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + + >>> ts.ceil(freq='5T') + Timestamp('2020-03-14 15:35:00') + + or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + + >>> ts.ceil(freq='1H30T') + Timestamp('2020-03-14 16:30:00') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.ceil() + NaT """ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) @@ -1200,6 +1534,12 @@ timedelta}, default 'raise' def tz(self): """ Alias for tzinfo. + + Examples + -------- + >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') + >>> ts.tz + """ return self.tzinfo @@ -1270,6 +1610,24 @@ default 'raise' ------ TypeError If the Timestamp is tz-aware and tz is not None. 
+ + Examples + -------- + Create a naive timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651') + + Add 'Europe/Stockholm' as timezone: + + >>> ts.tz_localize(tz='Europe/Stockholm') + Timestamp('2020-03-14 15:32:52.192548651+0100', tz='Europe/Stockholm') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.tz_localize() + NaT """ if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') @@ -1318,6 +1676,29 @@ default 'raise' ------ TypeError If Timestamp is tz-naive. + + Examples + -------- + Create a timestamp object with UTC timezone: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC') + + Change to Tokyo timezone: + + >>> ts.tz_convert(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Can also use ``astimezone``: + + >>> ts.astimezone(tz='Asia/Tokyo') + Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.tz_convert(tz='Asia/Tokyo') + NaT """ if self.tzinfo is None: # tz naive, use tz_localize @@ -1362,6 +1743,30 @@ default 'raise' Returns ------- Timestamp with fields replaced + + Examples + -------- + Create a timestamp object: + + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC') + >>> ts + Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC') + + Replace year and the hour: + + >>> ts.replace(year=1999, hour=10) + Timestamp('1999-03-14 10:32:52.192548651+0000', tz='UTC') + + Replace timezone (not a conversion): + + >>> import pytz + >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') + + Analogous for ``pd.NaT``: + + >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + NaT """ cdef: @@ -1441,6 +1846,12 @@ default 'raise' """ Convert TimeStamp to a Julian Date. 0 Julian date is noon January 1, 4713 BC. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52') + >>> ts.to_julian_date() + 2458923.147824074 """ year = self.year month = self.month diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi new file mode 100644 index 0000000000000..3391edac84224 --- /dev/null +++ b/pandas/_libs/window/aggregations.pyi @@ -0,0 +1,126 @@ +from typing import ( + Any, + Callable, + Literal, +) + +import numpy as np + +def roll_sum( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_mean( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_var( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + ddof: int = ..., +) -> np.ndarray: ... # np.ndarray[float] + +def roll_skew( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_kurt( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... 
# np.ndarray[float] + +def roll_median_c( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_max( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_min( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] + +def roll_quantile( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + quantile: float, # float64_t + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], +) -> np.ndarray: ... # np.ndarray[float] + +def roll_apply( + obj: object, + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + function: Callable[..., Any], + raw: bool, + args: tuple[Any, ...], + kwargs: dict[str, Any], +) -> np.ndarray: ... # np.ndarray[float] # FIXME: could also be type(obj) if n==0 + +def roll_weighted_sum( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, +) -> np.ndarray: ... # np.ndarray[np.float64] + +def roll_weighted_mean( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, +) -> np.ndarray: ... # np.ndarray[np.float64] + +def roll_weighted_var( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, # int64_t + ddof: int, # unsigned int +) -> np.ndarray: ... # np.ndarray[np.float64] + +def ewma( + vals: np.ndarray, # const float64_t[:] + start: np.ndarray, # const int64_t[:] + end: np.ndarray, # const int64_t[:] + minp: int, + com: float, # float64_t + adjust: bool, + ignore_na: bool, + deltas: np.ndarray, # const float64_t[:] +) -> np.ndarray: ... # np.ndarray[np.float64] + +def ewmcov( + input_x: np.ndarray, # const float64_t[:] + start: np.ndarray, # const int64_t[:] + end: np.ndarray, # const int64_t[:] + minp: int, + input_y: np.ndarray, # const float64_t[:] + com: float, # float64_t + adjust: bool, + ignore_na: bool, + bias: bool, +) -> np.ndarray: ... 
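The signatures in this aggregations stub are normally reached through the public rolling/ewm API rather than called directly; a small, illustrative sketch of the path that ends in roll_sum (made-up data):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0])

    # Series.rolling(...).sum() builds the start/end window bounds and then
    # dispatches to the Cython roll_sum(values, start, end, minp) declared above.
    print(s.rolling(window=3, min_periods=1).sum())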
# np.ndarray[np.float64] diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 027a53edb1810..5153118e9b142 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -142,6 +142,14 @@ def _assert_caught_no_extra_warnings( for actual_warning in caught_warnings: if _is_unexpected_warning(actual_warning, expected_warning): + unclosed = "unclosed transport = (3, 8) PY39 = sys.version_info >= (3, 9) +PY310 = sys.version_info >= (3, 10) PYPY = platform.python_implementation() == "PyPy" IS64 = sys.maxsize > 2 ** 32 diff --git a/pandas/conftest.py b/pandas/conftest.py index 35affa62ccf68..7b29c41ef70f5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,28 +66,10 @@ MultiIndex, ) - # ---------------------------------------------------------------- # Configuration / Settings # ---------------------------------------------------------------- # pytest -def pytest_configure(config): - # Register marks to avoid warnings in pandas.test() - # sync with setup.cfg - config.addinivalue_line("markers", "single: mark a test as single cpu only") - config.addinivalue_line("markers", "slow: mark a test as slow") - config.addinivalue_line("markers", "network: mark a test as network") - config.addinivalue_line( - "markers", "db: tests requiring a database (mysql or postgres)" - ) - config.addinivalue_line("markers", "high_memory: mark a test as a high-memory only") - config.addinivalue_line("markers", "clipboard: mark a pd.read_clipboard test") - config.addinivalue_line( - "markers", "arm_slow: mark a test as slow for arm64 architecture" - ) - config.addinivalue_line( - "markers", "arraymanager: mark a test to run with ArrayManager enabled" - ) def pytest_addoption(parser): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6f906cf8879ff..2c4477056a112 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -37,19 +37,17 @@ from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, + sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( ensure_float64, - ensure_int64, ensure_object, ensure_platform_int, - ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, is_extension_array_dtype, is_float_dtype, is_integer, @@ -57,11 +55,8 @@ is_list_like, is_numeric_dtype, is_object_dtype, - is_period_dtype, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, - is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) @@ -134,71 +129,49 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: values = extract_array(values, extract_numpy=True) # we check some simple dtypes first - if is_object_dtype(values): + if is_object_dtype(values.dtype): return ensure_object(np.asarray(values)), np.dtype("object") - try: - if is_bool_dtype(values): - # we are actually coercing to uint64 - # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype("uint64"), np.dtype("bool") - elif is_signed_integer_dtype(values): - return ensure_int64(values), np.dtype("int64") - elif is_unsigned_integer_dtype(values): - return ensure_uint64(values), np.dtype("uint64") - elif is_float_dtype(values): - return ensure_float64(values), np.dtype("float64") - elif is_complex_dtype(values): - - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(): - simplefilter("ignore", np.ComplexWarning) - values = ensure_float64(values) - 
return values, np.dtype("float64") - - except (TypeError, ValueError, OverflowError): - # if we are trying to coerce to a dtype - # and it is incompatible this will fall through to here - return ensure_object(values), np.dtype("object") + elif is_bool_dtype(values.dtype): + if isinstance(values, np.ndarray): + # i.e. actually dtype == np.dtype("bool") + return np.asarray(values).view("uint8"), values.dtype + else: + # i.e. all-bool Categorical, BooleanArray + return np.asarray(values).astype("uint8", copy=False), values.dtype + + elif is_integer_dtype(values.dtype): + return np.asarray(values), values.dtype + + elif is_float_dtype(values.dtype): + # Note: checking `values.dtype == "float128"` raises on Windows and 32bit + # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]" + # has no attribute "itemsize" + if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr] + # we dont (yet) have float128 hashtable support + return ensure_float64(values), values.dtype + return np.asarray(values), values.dtype + + elif is_complex_dtype(values.dtype): + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(): + simplefilter("ignore", np.ComplexWarning) + values = ensure_float64(values) + return values, np.dtype("float64") # datetimelike - if needs_i8_conversion(values.dtype): - if is_period_dtype(values.dtype): - from pandas import PeriodIndex - - values = PeriodIndex(values)._data - elif is_timedelta64_dtype(values.dtype): - from pandas import TimedeltaIndex - - values = TimedeltaIndex(values)._data - else: - # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): - # Avoid calling the DatetimeIndex constructor as it is 1D only - # Note: this is reached by DataFrame.rank calls GH#27027 - # TODO(EA2D): special case not needed with 2D EAs - asi8 = values.view("i8") - dtype = values.dtype - # error: Incompatible return value type (got "Tuple[Any, - # Union[dtype, ExtensionDtype, None]]", expected - # "Tuple[ndarray, Union[dtype, ExtensionDtype]]") - return asi8, dtype # type: ignore[return-value] - - from pandas import DatetimeIndex - - values = DatetimeIndex(values)._data - dtype = values.dtype - return values.asi8, dtype + elif needs_i8_conversion(values.dtype): + if isinstance(values, np.ndarray): + values = sanitize_to_nanoseconds(values) + npvalues = values.view("i8") + npvalues = cast(np.ndarray, npvalues) + return npvalues, values.dtype elif is_categorical_dtype(values.dtype): values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") - - # we are actually coercing to int64 - # until our algos support int* directly (not all do) - values = ensure_int64(values) return values, dtype # we have failed, return object @@ -268,8 +241,15 @@ def _ensure_arraylike(values) -> ArrayLike: _hashtables = { "float64": htable.Float64HashTable, + "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, + "uint32": htable.UInt32HashTable, + "uint16": htable.UInt16HashTable, + "uint8": htable.UInt8HashTable, "int64": htable.Int64HashTable, + "int32": htable.Int32HashTable, + "int16": htable.Int16HashTable, + "int8": htable.Int8HashTable, "string": htable.StringHashTable, "object": htable.PyObjectHashTable, } @@ -298,6 +278,10 @@ def _get_values_for_rank(values: ArrayLike) -> np.ndarray: values = cast("Categorical", values)._values_for_rank() values, _ = _ensure_data(values) + if values.dtype.kind in ["i", "u", "f"]: + # rank_t includes only object, int64, uint64, float64 + 
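The widened _hashtables mapping above means _ensure_data can hand 8/16/32-bit integer and float32 data to the hashtable layer as-is instead of copying to 64-bit first; a rough sketch of the user-visible call path, as I read the change (made-up data):

    import numpy as np
    import pandas as pd

    arr = np.array([3, 1, 3, 2], dtype="int32")

    # unique/factorize now look up htable.Int32HashTable for this dtype
    # rather than upcasting the values to int64 before hashing.
    uniques = pd.unique(arr)
    print(uniques, uniques.dtype)  # [3 1 2] int32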
dtype = values.dtype.kind + "8" + values = values.astype(dtype, copy=False) return values @@ -375,46 +359,60 @@ def unique(values): >>> pd.unique(pd.Series([2] + [1] * 5)) array([2, 1]) - >>> pd.unique(pd.Series([pd.Timestamp('20160101'), - ... pd.Timestamp('20160101')])) + >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') - >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), - ... pd.Timestamp('20160101', tz='US/Eastern')])) - array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], - dtype=object) - - >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), - ... pd.Timestamp('20160101', tz='US/Eastern')])) + >>> pd.unique( + ... pd.Series( + ... [ + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... ] + ... ) + ... ) + <DatetimeArray> + ['2016-01-01 00:00:00-05:00'] + Length: 1, dtype: datetime64[ns, US/Eastern] + + >>> pd.unique( + ... pd.Index( + ... [ + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... pd.Timestamp("20160101", tz="US/Eastern"), + ... ] + ... ) + ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], - ... dtype='datetime64[ns, US/Eastern]', freq=None) + dtype='datetime64[ns, US/Eastern]', + freq=None) - >>> pd.unique(list('baabc')) + >>> pd.unique(list("baabc")) array(['b', 'a', 'c'], dtype=object) An unordered Categorical will return categories in the order of appearance. - >>> pd.unique(pd.Series(pd.Categorical(list('baabc')))) - [b, a, c] - Categories (3, object): [b, a, c] + >>> pd.unique(pd.Series(pd.Categorical(list("baabc")))) + ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] - >>> pd.unique(pd.Series(pd.Categorical(list('baabc'), - ... categories=list('abc')))) - [b, a, c] - Categories (3, object): [b, a, c] + >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc")))) + ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] An ordered Categorical preserves the category ordering. - >>> pd.unique(pd.Series(pd.Categorical(list('baabc'), - ... categories=list('abc'), - ... ordered=True))) - [b, a, c] - Categories (3, object): [a < b < c] + >>> pd.unique( + ... pd.Series( + ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) + ... ) + ... 
) + ['b', 'a', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] An array of tuples - >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')]) + >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ values = _ensure_arraylike(values) @@ -466,7 +464,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # Avoid raising in extract_array values = np.array(values) else: - values = extract_array(values, extract_numpy=True) + values = extract_array(values, extract_numpy=True, extract_range=True) comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 86cde647cc798..693b1832ed3c9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -137,14 +137,6 @@ def f(x): self.orig_f: AggFuncType = func self.f: AggFuncType = f - @property - def index(self) -> Index: - return self.obj.index - - @property - def agg_axis(self) -> Index: - return self.obj._get_agg_axis(self.axis) - @abc.abstractmethod def apply(self) -> FrameOrSeriesUnion: pass @@ -163,9 +155,8 @@ def agg(self) -> FrameOrSeriesUnion | None: args = self.args kwargs = self.kwargs - result = self.maybe_apply_str() - if result is not None: - return result + if isinstance(arg, str): + return self.apply_str() if is_dict_like(arg): return self.agg_dict_like() @@ -369,7 +360,10 @@ def agg_list_like(self) -> FrameOrSeriesUnion: # raised directly in _aggregate_named pass elif "no results" in str(err): - # raised directly in _aggregate_multiple_funcs + # reached in test_frame_apply.test_nuiscance_columns + # where the colg.aggregate(arg) ends up going through + # the selected_obj.ndim == 1 branch above with arg == ["sum"] + # on a datetime64[ns] column pass else: raise @@ -465,27 +459,19 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: return result - def maybe_apply_str(self) -> FrameOrSeriesUnion | None: + def apply_str(self) -> FrameOrSeriesUnion: """ Compute apply in case of a string. Returns ------- - result: Series, DataFrame, or None - Result when self.f is a string, None otherwise. + result: Series or DataFrame """ - f = self.f - if not isinstance(f, str): - return None + # Caller is responsible for checking isinstance(self.f, str) + f = cast(str, self.f) obj = self.obj - # TODO: GH 39993 - Avoid special-casing by replacing with lambda - if f == "size" and isinstance(obj, ABCDataFrame): - # Special-cased because DataFrame.size returns a single scalar - value = obj.shape[self.axis] - return obj._constructor_sliced(value, index=self.agg_axis, name="size") - # Support for `frame.transform('method')` # Some methods (shift, etc.) require the axis argument, others # don't, so inspect and insert if necessary. @@ -498,7 +484,7 @@ def maybe_apply_str(self) -> FrameOrSeriesUnion | None: raise ValueError(f"Operation {f} does not support axis=1") return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs) - def maybe_apply_multiple(self) -> FrameOrSeriesUnion | None: + def apply_multiple(self) -> FrameOrSeriesUnion: """ Compute apply in case of a list-like or dict-like. @@ -507,9 +493,6 @@ def maybe_apply_multiple(self) -> FrameOrSeriesUnion | None: result: Series, DataFrame, or None Result when self.f is a list-like or dict-like, None otherwise. 
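After this refactor, apply() branches explicitly on isinstance(self.f, str) and is_list_like(self.f) instead of the old maybe_* probing; a short illustrative sketch of the two public entry points this serves (made-up frame):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    # isinstance(f, str) -> apply_str(); "size" keeps its DataFrame special case.
    print(df.apply("size"))

    # list-like f -> apply_multiple(), which forwards to DataFrame.aggregate.
    print(df.apply(["sum", "mean"]))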
""" - # Note: dict-likes are list-like - if not is_list_like(self.f): - return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) def normalize_dictlike_arg( @@ -587,7 +570,22 @@ def _try_aggregate_string_function(self, obj, arg: str, *args, **kwargs): ) -class FrameApply(Apply): +class NDFrameApply(Apply): + """ + Methods shared by FrameApply and SeriesApply but + not GroupByApply or ResamplerWindowApply + """ + + @property + def index(self) -> Index: + return self.obj.index + + @property + def agg_axis(self) -> Index: + return self.obj._get_agg_axis(self.axis) + + +class FrameApply(NDFrameApply): obj: DataFrame # --------------------------------------------------------------- @@ -635,18 +633,16 @@ def dtypes(self) -> Series: def apply(self) -> FrameOrSeriesUnion: """ compute the results """ # dispatch to agg - result = self.maybe_apply_multiple() - if result is not None: - return result + if is_list_like(self.f): + return self.apply_multiple() # all empty if len(self.columns) == 0 and len(self.index) == 0: return self.apply_empty_result() # string dispatch - result = self.maybe_apply_str() - if result is not None: - return result + if isinstance(self.f, str): + return self.apply_str() # ufunc elif isinstance(self.f, np.ufunc): @@ -831,6 +827,16 @@ def wrap_results(self, results: ResType, res_index: Index) -> FrameOrSeriesUnion return result + def apply_str(self) -> FrameOrSeriesUnion: + # Caller is responsible for checking isinstance(self.f, str) + # TODO: GH#39993 - Avoid special-casing by replacing with lambda + if self.f == "size": + # Special-cased because DataFrame.size returns a single scalar + obj = self.obj + value = obj.shape[self.axis] + return obj._constructor_sliced(value, index=self.agg_axis, name="size") + return super().apply_str() + class FrameRowApply(FrameApply): axis = 0 @@ -967,7 +973,7 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: return result -class SeriesApply(Apply): +class SeriesApply(NDFrameApply): obj: Series axis = 0 @@ -997,14 +1003,12 @@ def apply(self) -> FrameOrSeriesUnion: return self.apply_empty_result() # dispatch to agg - result = self.maybe_apply_multiple() - if result is not None: - return result + if is_list_like(self.f): + return self.apply_multiple() - # if we are a string, try to dispatch - result = self.maybe_apply_str() - if result is not None: - return result + if isinstance(self.f, str): + # if we are a string, try to dispatch + return self.apply_str() return self.apply_standard() diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 31f6896b12f98..6214693f22975 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion import json import numpy as np @@ -6,14 +5,14 @@ from pandas.core.arrays.interval import VALID_CLOSED -_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") - def pyarrow_array_to_numpy_and_mask(arr, dtype): """ Convert a primitive pyarrow.Array to a numpy array and boolean mask based on the buffers of the Array. + At the moment pyarrow.BooleanArray is not supported. 
+ Parameters ---------- arr : pyarrow.Array @@ -25,8 +24,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): Tuple of two numpy arrays with the raw data (with specified dtype) and a boolean mask (validity mask, so False means missing) """ + dtype = np.dtype(dtype) + buflist = arr.buffers() - data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + # Since Arrow buffers might contain padding and the data might be offset, + # the buffer gets sliced here before handing it to numpy. + # See also https://github.com/pandas-dev/pandas/issues/40896 + offset = arr.offset * dtype.itemsize + length = len(arr) * dtype.itemsize + data_buf = buflist[1][offset : offset + length] + data = np.frombuffer(data_buf, dtype=dtype) bitmask = buflist[0] if bitmask is not None: mask = pyarrow.BooleanArray.from_buffers( @@ -38,97 +45,97 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): return data, mask -if _pyarrow_version_ge_015: - # the pyarrow extension types are only available for pyarrow 0.15+ - - class ArrowPeriodType(pyarrow.ExtensionType): - def __init__(self, freq): - # attributes need to be set first before calling - # super init (as that calls serialize) - self._freq = freq - pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") - - @property - def freq(self): - return self._freq - - def __arrow_ext_serialize__(self): - metadata = {"freq": self.freq} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - return ArrowPeriodType(metadata["freq"]) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return type(self) == type(other) and self.freq == other.freq - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), self.freq)) - - def to_pandas_dtype(self): - import pandas as pd - - return pd.PeriodDtype(freq=self.freq) - - # register the type with a dummy instance - _period_type = ArrowPeriodType("D") - pyarrow.register_extension_type(_period_type) - - class ArrowIntervalType(pyarrow.ExtensionType): - def __init__(self, subtype, closed): - # attributes need to be set first before calling - # super init (as that calls serialize) - assert closed in VALID_CLOSED - self._closed = closed - if not isinstance(subtype, pyarrow.DataType): - subtype = pyarrow.type_for_alias(str(subtype)) - self._subtype = subtype - - storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) - pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") - - @property - def subtype(self): - return self._subtype - - @property - def closed(self): - return self._closed - - def __arrow_ext_serialize__(self): - metadata = {"subtype": str(self.subtype), "closed": self.closed} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - subtype = pyarrow.type_for_alias(metadata["subtype"]) - closed = metadata["closed"] - return ArrowIntervalType(subtype, closed) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return ( - type(self) == type(other) - and self.subtype == other.subtype - and self.closed == other.closed - ) - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), str(self.subtype), self.closed)) - - def to_pandas_dtype(self): - import pandas as pd - - return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) - - # 
register the type with a dummy instance - _interval_type = ArrowIntervalType(pyarrow.int64(), "left") - pyarrow.register_extension_type(_interval_type) +class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + def to_pandas_dtype(self): + import pandas as pd + + return pd.PeriodDtype(freq=self.freq) + + +# register the type with a dummy instance +_period_type = ArrowPeriodType("D") +pyarrow.register_extension_type(_period_type) + + +class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + def to_pandas_dtype(self): + import pandas as pd + + return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) + + +# register the type with a dummy instance +_interval_type = ArrowIntervalType(pyarrow.int64(), "left") +pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 34d5ea6cfb20d..a537951786646 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -41,20 +41,20 @@ def generate_regular_range( ------- ndarray[np.int64] Representing nanoseconds. 
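The pyarrow extension types defined above are now created and registered unconditionally rather than behind the old 0.15 version gate; a minimal sketch of constructing them and mapping back to pandas dtypes (illustrative only, assuming pyarrow is installed):

    import pyarrow as pa
    from pandas.core.arrays._arrow_utils import ArrowIntervalType, ArrowPeriodType

    # Both types are registered with pyarrow at import time via a dummy instance,
    # so pandas-specific dtypes keep their metadata when round-tripped through Arrow.
    period_type = ArrowPeriodType("D")
    print(period_type.freq, period_type.to_pandas_dtype())

    interval_type = ArrowIntervalType(pa.int64(), "left")
    print(interval_type.closed, interval_type.to_pandas_dtype())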
""" - start = start.value if start is not None else None - end = end.value if end is not None else None + istart = start.value if start is not None else None + iend = end.value if end is not None else None stride = freq.nanos if periods is None: - b = start + b = istart # cannot just use e = Timestamp(end) + 1 because arange breaks when # stride is too large, see GH10887 - e = b + (end - b) // stride * stride + stride // 2 + 1 - elif start is not None: - b = start + e = b + (iend - b) // stride * stride + stride // 2 + 1 + elif istart is not None: + b = istart e = _generate_range_overflow_safe(b, periods, stride, side="start") - elif end is not None: - e = end + stride + elif iend is not None: + e = iend + stride b = _generate_range_overflow_safe(e, periods, stride, side="end") else: raise ValueError( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5a2643dd531ed..bd01191719143 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -794,7 +794,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: b = empty return self._concat_same_type([a, b]) - def unique(self): + def unique(self: ExtensionArrayT) -> ExtensionArrayT: """ Compute the ExtensionArray of unique values. @@ -1023,7 +1023,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: @Substitution(klass="ExtensionArray") @Appender(_extension_array_shared_docs["repeat"]) - def repeat(self, repeats, axis=None): + def repeat(self, repeats: int | Sequence[int], axis: int | None = None): nv.validate_repeat((), {"axis": axis}) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..14d059c04b7c0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -114,6 +114,9 @@ def __from_arrow__( """ import pyarrow + if array.type != pyarrow.bool_(): + raise TypeError(f"Expected array of boolean type, got {array.type} instead") + if isinstance(array, pyarrow.Array): chunks = [array] else: @@ -122,11 +125,27 @@ def __from_arrow__( results = [] for arr in chunks: - # TODO should optimize this without going through object array - bool_arr = BooleanArray._from_sequence(np.array(arr)) + buflist = arr.buffers() + data = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[1]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + if arr.null_count != 0: + mask = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[0]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + mask = ~mask + else: + mask = np.zeros(len(arr), dtype=bool) + + bool_arr = BooleanArray(data, mask) results.append(bool_arr) - return BooleanArray._concat_same_type(results) + if not results: + return BooleanArray( + np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) + ) + else: + return BooleanArray._concat_same_type(results) def coerce_to_array( diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c51e25776e1c2..7cddfef3d4292 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -12,7 +12,11 @@ Union, cast, ) -from warnings import warn +from warnings import ( + catch_warnings, + simplefilter, + warn, +) import numpy as np @@ -951,7 +955,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal if not inplace: return cat - def rename_categories(self, new_categories, inplace=False): + def 
rename_categories(self, new_categories, inplace=no_default): """ Rename categories. @@ -976,6 +980,8 @@ def rename_categories(self, new_categories, inplace=False): Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. + .. deprecated:: 1.3.0 + Returns ------- cat : Categorical or None @@ -1015,6 +1021,18 @@ def rename_categories(self, new_categories, inplace=False): ['A', 'A', 'B'] Categories (2, object): ['A', 'B'] """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "rename_categories is deprecated and will be removed in " + "a future version. Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -1027,7 +1045,7 @@ def rename_categories(self, new_categories, inplace=False): if not inplace: return cat - def reorder_categories(self, new_categories, ordered=None, inplace=False): + def reorder_categories(self, new_categories, ordered=None, inplace=no_default): """ Reorder categories as specified in new_categories. @@ -1045,6 +1063,8 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): Whether or not to reorder the categories inplace or return a copy of this categorical with reordered categories. + .. deprecated:: 1.3.0 + Returns ------- cat : Categorical or None @@ -1064,6 +1084,18 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "reorder_categories is deprecated and will be removed in " + "a future version. Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): raise ValueError( @@ -1071,7 +1103,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): ) return self.set_categories(new_categories, ordered=ordered, inplace=inplace) - def add_categories(self, new_categories, inplace=False): + def add_categories(self, new_categories, inplace=no_default): """ Add new categories. @@ -1086,6 +1118,8 @@ def add_categories(self, new_categories, inplace=False): Whether or not to add the categories inplace or return a copy of this categorical with added categories. + .. deprecated:: 1.3.0 + Returns ------- cat : Categorical or None @@ -1105,6 +1139,18 @@ def add_categories(self, new_categories, inplace=False): remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "add_categories is deprecated and will be removed in " + "a future version. 
Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): new_categories = [new_categories] @@ -1122,7 +1168,7 @@ def add_categories(self, new_categories, inplace=False): if not inplace: return cat - def remove_categories(self, removals, inplace=False): + def remove_categories(self, removals, inplace=no_default): """ Remove the specified categories. @@ -1137,6 +1183,8 @@ def remove_categories(self, removals, inplace=False): Whether or not to remove the categories inplace or return a copy of this categorical with removed categories. + .. deprecated:: 1.3.0 + Returns ------- cat : Categorical or None @@ -1155,6 +1203,18 @@ def remove_categories(self, removals, inplace=False): remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "remove_categories is deprecated and will be removed in " + "a future version. Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): removals = [removals] @@ -2355,17 +2415,25 @@ def replace(self, to_replace, value, inplace: bool = False): continue if replace_value in cat.categories: if isna(new_value): - cat.remove_categories(replace_value, inplace=True) + with catch_warnings(): + simplefilter("ignore") + cat.remove_categories(replace_value, inplace=True) continue + categories = cat.categories.tolist() index = categories.index(replace_value) + if new_value in cat.categories: value_index = categories.index(new_value) cat._codes[cat._codes == index] = value_index - cat.remove_categories(replace_value, inplace=True) + with catch_warnings(): + simplefilter("ignore") + cat.remove_categories(replace_value, inplace=True) else: categories[index] = new_value - cat.rename_categories(categories, inplace=True) + with catch_warnings(): + simplefilter("ignore") + cat.rename_categories(categories, inplace=True) if not inplace: return cat diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4a5dca348a8c0..93df88aba2cba 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -262,9 +262,7 @@ def _box_values(self, values) -> np.ndarray: """ apply box func to passed values """ - # error: Incompatible return value type (got - # "Union[ExtensionArray, ndarray]", expected "ndarray") - return lib.map_infer(values, self._box_func) # type: ignore[return-value] + return lib.map_infer(values, self._box_func) def __iter__(self): if self.ndim > 1: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 289ed4948934f..117b267fd49e5 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -742,7 +742,9 @@ def _sub_datetimelike_scalar(self, other): assert isinstance(other, (datetime, np.datetime64)) assert other is not NaT other = Timestamp(other) - if other is NaT: + # error: Non-overlapping identity check (left operand type: "Timestamp", + # right operand type: "NaTType") + if other is NaT: # type: ignore[comparison-overlap] return self - NaT if not self._has_same_tz(other): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 
8d3a8feb89d67..95c95d98bc968 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -829,6 +829,7 @@ def astype(self, dtype, copy: bool = True): """ from pandas import Index from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -851,7 +852,7 @@ def astype(self, dtype, copy: bool = True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self), dtype=dtype) - elif isinstance(dtype, StringDtype): + elif isinstance(dtype, (StringDtype, ArrowStringDtype)): return dtype.construct_array_type()._from_sequence(self, copy=False) # TODO: This try/except will be repeated. @@ -1517,7 +1518,11 @@ def delete(self: IntervalArrayT, loc) -> IntervalArrayT: return self._shallow_copy(left=new_left, right=new_right) @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) - def repeat(self: IntervalArrayT, repeats: int, axis=None) -> IntervalArrayT: + def repeat( + self: IntervalArrayT, + repeats: int | Sequence[int], + axis: int | None = None, + ) -> IntervalArrayT: nv.validate_repeat((), {"axis": axis}) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 4908000a68810..bc467e93c2c2c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -66,7 +66,11 @@ def __from_arrow__( num_arr = array_class(data.copy(), ~mask, copy=False) results.append(num_arr) - if len(results) == 1: + if not results: + return array_class( + np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_) + ) + elif len(results) == 1: # avoid additional copy in _concat_same_type return results[0] else: diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index c9db995319cdf..52900d9b62dc2 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -23,6 +23,7 @@ ) from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -394,7 +395,9 @@ def _cmp_method(self, other, op): if isinstance(other, PandasArray): other = other._ndarray + other = ops.maybe_prepare_scalar_for_op(other, (len(self),)) pd_op = ops.get_array_op(op) + other = ensure_wrapped_if_datetimelike(other) with np.errstate(all="ignore"): result = pd_op(self._ndarray, other) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 37898ce682e4f..4847372f18239 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -550,7 +550,7 @@ def _from_factorized(cls, values, original): # Data # ------------------------------------------------------------------------ @property - def sp_index(self): + def sp_index(self) -> SparseIndex: """ The SparseIndex containing the location of non- ``fill_value`` points. 
""" @@ -570,7 +570,7 @@ def sp_values(self) -> np.ndarray: return self._sparse_values @property - def dtype(self): + def dtype(self) -> SparseDtype: return self._dtype @property @@ -597,7 +597,7 @@ def kind(self) -> str: return "block" @property - def _valid_sp_values(self): + def _valid_sp_values(self) -> np.ndarray: sp_vals = self.sp_values mask = notna(sp_vals) return sp_vals[mask] @@ -620,7 +620,7 @@ def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @property - def density(self): + def density(self) -> float: """ The percent of non- ``fill_value`` points, as decimal. @@ -1392,6 +1392,24 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + def max(self, axis=0, *args, **kwargs): + nv.validate_max(args, kwargs) + + # This condition returns a nan if there are no valid values in the array. + if self.size > 0 and self._valid_sp_values.size == 0: + return np.nan + else: + return np.nanmax(self, axis) + + def min(self, axis=0, *args, **kwargs): + nv.validate_min(args, kwargs) + + # This condition returns a nan if there are no valid values in the array. + if self.size > 0 and self._valid_sp_values.size == 0: + return np.nan + else: + return np.nanmin(self, axis) + # ------------------------------------------------------------------------ # Ufuncs # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 600aacec9c87a..307517eedb2cd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -118,7 +118,10 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + if results: + return StringArray._concat_same_type(results) + else: + return StringArray(np.array([], dtype="object")) class StringArray(PandasArray): @@ -447,9 +450,7 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): if not na_value_is_na: mask[:] = False - # error: Argument 1 to "maybe_convert_objects" has incompatible - # type "Union[ExtensionArray, ndarray]"; expected "ndarray" - return constructor(result, mask) # type: ignore[arg-type] + return constructor(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. 
StringDtype diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index dd09ef4e585ce..fd47597b2191f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -25,17 +25,19 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_integer, - is_integer_dtype, is_object_dtype, - is_scalar, is_string_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) from pandas.core import missing from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray @@ -675,18 +677,13 @@ def value_counts(self, dropna: bool = True) -> Series: vc = self._data.value_counts() - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - + # Index cannot hold ExtensionArrays yet + index = Index(type(self)(vc.field(0)).astype(object)) # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) + counts = np.array(vc.field(1)) - # Index cannot hold ExtensionArrays yet - index = Index(type(self)(values)).astype(object) + if dropna and self._data.null_count > 0: + raise NotImplementedError("yo") return Series(counts, index=index).astype("Int64") @@ -757,9 +754,3 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._data)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._data)) diff --git a/pandas/core/base.py b/pandas/core/base.py index 42f52618eb07b..3270e3dd82f7d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -16,6 +16,7 @@ import pandas._libs.lib as lib from pandas._typing import ( + ArrayLike, Dtype, DtypeObj, IndexLabel, @@ -996,7 +997,7 @@ def unique(self): values = self._values if not isinstance(values, np.ndarray): - result = values.unique() + result: ArrayLike = values.unique() if self.dtype.kind in ["m", "M"] and isinstance(self, ABCSeries): # GH#31182 Series._values returns EA, unpack for backward-compat if getattr(self.dtype, "tz", None) is None: @@ -1040,8 +1041,10 @@ def nunique(self, dropna: bool = True) -> int: >>> s.nunique() 4 """ - obj = remove_na_arraylike(self) if dropna else self - return len(obj.unique()) + uniqs = self.unique() + if dropna: + uniqs = remove_na_arraylike(uniqs) + return len(uniqs) @property def is_unique(self) -> bool: diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 957a493925405..2f87e0bcce70a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -114,6 +114,11 @@ def _evaluate_numexpr(op, op_str, a, b): # numexpr raises eg for array ** array with integers # (https://github.com/pydata/numexpr/issues/379) pass + except NotImplementedError: + if _bool_arith_fallback(op_str, a, b): + pass + else: + raise if is_reversed: # reverse order to original for fallback @@ -137,8 +142,9 @@ def _evaluate_numexpr(op, op_str, a, b): roperator.rsub: "-", operator.truediv: "/", roperator.rtruediv: "/", - operator.floordiv: 
"//", - roperator.rfloordiv: "//", + # floordiv not supported by numexpr 2.x + operator.floordiv: None, + roperator.rfloordiv: None, # we require Python semantics for mod of negative for backwards compatibility # see https://github.com/pydata/numexpr/issues/365 # so sticking with unaccelerated for now @@ -197,26 +203,24 @@ def _has_bool_dtype(x): return isinstance(x, (bool, np.bool_)) -def _bool_arith_check( - op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None -): - if unsupported is None: - unsupported = {"+": "|", "*": "&", "-": "^"} +_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} + +def _bool_arith_fallback(op_str, a, b): + """ + Check if we should fallback to the python `_evaluate_standard` in case + of an unsupported operation by numexpr, which is the case for some + boolean ops. + """ if _has_bool_dtype(a) and _has_bool_dtype(b): - if op_str in unsupported: + if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( f"evaluating in Python space because the {repr(op_str)} " - "operator is not supported by numexpr for " - f"the bool dtype, use {repr(unsupported[op_str])} instead" + "operator is not supported by numexpr for the bool dtype, " + f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead" ) - return False - - if op_str in not_allowed: - raise NotImplementedError( - f"operator {repr(op_str)} not implemented for bool dtypes" - ) - return True + return True + return False def evaluate(op, a, b, use_numexpr: bool = True): @@ -233,7 +237,6 @@ def evaluate(op, a, b, use_numexpr: bool = True): """ op_str = _op_str_mapping[op] if op_str is not None: - use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: # error: "None" not callable return _evaluate(op, op_str, a, b) # type: ignore[misc] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fd49ac0176ce4..baac872a6a466 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -652,6 +652,22 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), ) + +# Set up the io.sql specific configuration. +sql_engine_doc = """ +: string + The default sql reader/writer engine. Available options: + 'auto', 'sqlalchemy', the default is 'auto' +""" + +with cf.config_prefix("io.sql"): + cf.register_option( + "engine", + "auto", + sql_engine_doc, + validator=is_one_of_factory(["auto", "sqlalchemy"]), + ) + # -------- # Plotting # --------- diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 8104b0170fbe2..9671c340a0a92 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Any, + TypeVar, ) import numpy as np @@ -26,6 +27,9 @@ if TYPE_CHECKING: from pandas.core.arrays import ExtensionArray + # To parameterize on same ExtensionDtype + E = TypeVar("E", bound="ExtensionDtype") + class ExtensionDtype: """ @@ -151,7 +155,7 @@ def na_value(self) -> object: return np.nan @property - def type(self) -> type[Any]: + def type(self) -> type_t[Any]: """ The scalar type for the array, e.g. ``int`` @@ -364,7 +368,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None -def register_extension_dtype(cls: type[ExtensionDtype]) -> type[ExtensionDtype]: +def register_extension_dtype(cls: type[E]) -> type[E]: """ Register an ExtensionType with pandas as class decorator. 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e91927d87d318..6726374dbe30e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -55,6 +55,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -78,7 +79,6 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, @@ -359,15 +359,15 @@ def trans(x): return result -def maybe_cast_pointwise_result( +def maybe_cast_result( result: ArrayLike, dtype: DtypeObj, numeric_only: bool = False, + how: str = "", same_dtype: bool = True, ) -> ArrayLike: """ - Try casting result of a pointwise operation back to the original dtype if - appropriate. + Try casting result to a different type if appropriate Parameters ---------- @@ -377,6 +377,8 @@ def maybe_cast_pointwise_result( Input Series from which result was calculated. numeric_only : bool, default False Whether to cast only numerics or datetimes as well. + how : str, default "" + How the result was computed. same_dtype : bool, default True Specify dtype when calling _from_sequence @@ -385,12 +387,12 @@ def maybe_cast_pointwise_result( result : array-like result maybe casted to the dtype. """ + dtype = maybe_cast_result_dtype(dtype, how) assert not is_scalar(result) if isinstance(dtype, ExtensionDtype): - if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)): - # TODO: avoid this special-casing + if not is_categorical_dtype(dtype) and dtype.kind != "M": # We have to special case categorical so as not to upcast # things like counts back to categorical diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e207dac71752e..593e42f7ed749 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1413,6 +1413,33 @@ def is_extension_type(arr) -> bool: return False +def is_1d_only_ea_obj(obj: Any) -> bool: + """ + ExtensionArray that does not support 2D, or more specifically that does + not use HybridBlock. + """ + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, + ) + + return isinstance(obj, ExtensionArray) and not isinstance( + obj, (DatetimeArray, TimedeltaArray) + ) + + +def is_1d_only_ea_dtype(dtype: Optional[DtypeObj]) -> bool: + """ + Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. + """ + # Note: if other EA dtypes are ever held in HybridBlock, exclude those + # here too. + # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype + # to exclude ArrowTimestampUSDtype + return isinstance(dtype, ExtensionDtype) and not isinstance(dtype, DatetimeTZDtype) + + def is_extension_array_dtype(arr_or_dtype) -> bool: """ Check if an object is a pandas extension array type. 
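The two predicates added to dtypes/common.py above draw the line between 1D-only extension dtypes and the ones pandas can hold in 2D blocks. Roughly, using the helper exactly as defined in this hunk (internal API, shown only as an illustration):

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_1d_only_ea_dtype   # added above

is_1d_only_ea_dtype(pd.CategoricalDtype())           # True: held in 1D-only extension blocks
is_1d_only_ea_dtype(pd.DatetimeTZDtype(tz="UTC"))    # False: dt64tz is backed by a 2D-capable DatetimeArray
is_1d_only_ea_dtype(np.dtype("int64"))               # False: not an ExtensionDtype at all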
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index cfadb3e9f45c5..b0d00775bbed1 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -113,11 +113,15 @@ def is_nonempty(x) -> bool: to_concat = non_empties kinds = {obj.dtype.kind for obj in to_concat} + contains_datetime = any(kind in ["m", "M"] for kind in kinds) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat) + if contains_datetime: + return _concat_datetime(to_concat, axis=axis) + if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 @@ -131,9 +135,6 @@ def is_nonempty(x) -> bool: else: return np.concatenate(to_concat) - elif any(kind in ["m", "M"] for kind in kinds): - return _concat_datetime(to_concat, axis=axis) - elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise @@ -349,14 +350,5 @@ def _concat_datetime(to_concat, axis=0): # in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) - if axis == 1: - # TODO(EA2D): kludge not necessary with 2D EAs - to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] - result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) - - if result.ndim == 2 and isinstance(result.dtype, ExtensionDtype): - # TODO(EA2D): kludge not necessary with 2D EAs - assert result.shape[0] == 1 - result = result[0] return result diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 84eede019251b..c5efd8f77495c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -48,9 +48,14 @@ ) if TYPE_CHECKING: + from datetime import tzinfo + import pyarrow - from pandas import Categorical + from pandas import ( + Categorical, + Index, + ) from pandas.core.arrays import ( DatetimeArray, IntervalArray, @@ -445,8 +450,8 @@ def _hash_categories(self) -> int: # assumes if any individual category is a tuple, then all our. ATM # I don't really want to support just some of the categories being # tuples. - categories = list(categories) # breaks if a np.array of categories - cat_array = hash_tuples(categories) + cat_list = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(cat_list) else: if categories.dtype == "O" and len({type(x) for x in categories}) != 1: # TODO: hash_array doesn't handle mixed types. It casts @@ -509,7 +514,7 @@ def validate_ordered(ordered: Ordered) -> None: raise TypeError("'ordered' must either be 'True' or 'False'") @staticmethod - def validate_categories(categories, fastpath: bool = False): + def validate_categories(categories, fastpath: bool = False) -> Index: """ Validates that we have good categories @@ -579,7 +584,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: return CategoricalDtype(new_categories, new_ordered) @property - def categories(self): + def categories(self) -> Index: """ An ``Index`` containing the unique categories allowed. """ @@ -717,7 +722,7 @@ def unit(self) -> str_type: return self._unit @property - def tz(self): + def tz(self) -> tzinfo: """ The timezone. 
""" @@ -882,7 +887,7 @@ def freq(self): return self._freq @classmethod - def _parse_dtype_strict(cls, freq): + def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: if isinstance(freq, str): if freq.startswith("period[") or freq.startswith("Period["): m = cls._match.search(freq) @@ -1005,6 +1010,8 @@ def __from_arrow__( parr[~mask] = NaT results.append(parr) + if not results: + return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False) return PeriodArray._concat_same_type(results) @@ -1136,7 +1143,7 @@ def construct_array_type(cls) -> type[IntervalArray]: return IntervalArray @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type) -> IntervalDtype: """ attempt to construct this type from a string, raise a TypeError if its not possible @@ -1238,6 +1245,12 @@ def __from_arrow__( iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) results.append(iarr) + if not results: + return IntervalArray.from_arrays( + np.array([], dtype=self.subtype), + np.array([], dtype=self.subtype), + closed=array.type.closed, + ) return IntervalArray._concat_same_type(results) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a7ed2bfc18df..8e12a8cb18b68 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -98,6 +98,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, infer_dtype_from_object, + is_1d_only_ea_dtype, is_bool_dtype, is_dataclass, is_datetime64_any_dtype, @@ -845,7 +846,9 @@ def _can_fast_transpose(self) -> bool: if len(blocks) != 1: return False - return not self._mgr.any_extension_types + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_1d_only_ea_dtype(dtype) # ---------------------------------------------------------------------- # Rendering Methods @@ -6216,7 +6219,6 @@ def sort_values( # type: ignore[override] indexer = lexsort_indexer( keys, orders=ascending, na_position=na_position, key=key ) - indexer = ensure_platform_int(indexer) elif len(by): by = by[0] @@ -6801,6 +6803,7 @@ def _arith_method(self, other, op): return ops.frame_arith_method_with_reindex(self, other, op) axis = 1 # only relevant for Series other case + other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) @@ -8549,7 +8552,7 @@ def apply( Notes ----- Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`udf-mutation` + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. Examples @@ -10450,6 +10453,107 @@ def _AXIS_NAMES(self) -> dict[int, str]: boxplot = pandas.plotting.boxplot_frame sparse = CachedAccessor("sparse", SparseFrameAccessor) + # ---------------------------------------------------------------------- + # Internal Interface Methods + + def _to_dict_of_blocks(self, copy: bool = True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. 
+ + Internal ONLY - only works for BlockManager + """ + mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") + mgr = cast(BlockManager, mgr) + return { + k: self._constructor(v).__finalize__(self) + for k, v, in mgr.to_dict(copy=copy).items() + } + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]]) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... 
columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + self._consolidate_inplace() + return self._mgr.as_array(transpose=True) + + @property + def _values(self) -> np.ndarray: + """internal implementation""" + return self.values + DataFrame._add_numeric_operations() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d69e933164118..d225ac6e6881b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -246,14 +246,18 @@ def __init__( @classmethod def _init_mgr( - cls, mgr, axes, dtype: Dtype | None = None, copy: bool_t = False + cls, + mgr: Manager, + axes, + dtype: Dtype | None = None, + copy: bool_t = False, ) -> Manager: """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: axe = ensure_index(axe) bm_axis = cls._get_block_manager_axis(a) - mgr = mgr.reindex_axis(axe, axis=bm_axis, copy=False) + mgr = mgr.reindex_axis(axe, axis=bm_axis) # make a copy if explicitly requested if copy: @@ -286,13 +290,18 @@ def _from_mgr(cls, mgr: Manager): object.__setattr__(obj, "_attrs", {}) return obj - def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries: + def _as_manager( + self: FrameOrSeries, typ: str, copy: bool_t = True + ) -> FrameOrSeries: """ Private helper function to create a DataFrame with specific manager. Parameters ---------- typ : {"block", "array"} + copy : bool, default True + Only controls whether the conversion from Block->ArrayManager + copies the 1D arrays (to ensure proper/contiguous memory layout). Returns ------- @@ -301,7 +310,7 @@ def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries: to be a copy or not. """ new_mgr: Manager - new_mgr = mgr_to_mgr(self._mgr, typ=typ) + new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy) # fastpath of passing a manager doesn't check the option/manager class return self._constructor(new_mgr).__finalize__(self) @@ -5605,85 +5614,12 @@ def _get_bool_data(self): @property def values(self) -> np.ndarray: - """ - Return a Numpy representation of the DataFrame. - - .. warning:: - - We recommend using :meth:`DataFrame.to_numpy` instead. - - Only the values in the DataFrame will be returned, the axes labels - will be removed. - - Returns - ------- - numpy.ndarray - The values of the DataFrame. - - See Also - -------- - DataFrame.to_numpy : Recommended alternative to this method. - DataFrame.index : Retrieve the index labels. - DataFrame.columns : Retrieving the column names. - - Notes - ----- - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcast to - int32. By :func:`numpy.find_common_type` convention, mixing int64 - and uint64 will result in a float64 dtype. - - Examples - -------- - A DataFrame where all columns are the same type (e.g., int64) results - in an array of the same type. - - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 
'weight': [31, 115]}) - >>> df - age height weight - 0 3 94 31 - 1 29 170 115 - >>> df.dtypes - age int64 - height int64 - weight int64 - dtype: object - >>> df.values - array([[ 3, 94, 31], - [ 29, 170, 115]]) - - A DataFrame with mixed type columns(e.g., str/object, int64, float32) - results in an ndarray of the broadest type that accommodates these - mixed types (e.g., object). - - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... columns=('name', 'max_speed', 'rank')) - >>> df2.dtypes - name object - max_speed float64 - rank object - dtype: object - >>> df2.values - array([['parrot', 24.0, 'second'], - ['lion', 80.5, 1], - ['monkey', nan, None]], dtype=object) - """ - self._consolidate_inplace() - return self._mgr.as_array(transpose=self._AXIS_REVERSED) + raise AbstractMethodError(self) @property def _values(self) -> np.ndarray: """internal implementation""" - return self.values + raise AbstractMethodError(self) @property def dtypes(self): @@ -5716,23 +5652,6 @@ def dtypes(self): data = self._mgr.get_dtypes() return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) - @final - def _to_dict_of_blocks(self, copy: bool_t = True): - """ - Return a dict of dtype -> Constructor Types that - each is a homogeneous dtype. - - Internal ONLY - only works for BlockManager - """ - mgr = self._mgr - # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) - return { - k: self._constructor(v).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() - } - def astype( self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" ) -> FrameOrSeries: @@ -6469,13 +6388,53 @@ def fillna( else: return result.__finalize__(self, method="fillna") + @overload + def ffill( + self: FrameOrSeries, + axis: None | Axis = ..., + inplace: Literal[False] = ..., + limit: None | int = ..., + downcast=..., + ) -> FrameOrSeries: + ... + + @overload + def ffill( + self: FrameOrSeries, + axis: None | Axis, + inplace: Literal[True], + limit: None | int = ..., + downcast=..., + ) -> None: + ... + + @overload + def ffill( + self: FrameOrSeries, + *, + inplace: Literal[True], + limit: None | int = ..., + downcast=..., + ) -> None: + ... + + @overload + def ffill( + self: FrameOrSeries, + axis: None | Axis = ..., + inplace: bool_t = ..., + limit: None | int = ..., + downcast=..., + ) -> FrameOrSeries | None: + ... + @final @doc(klass=_shared_doc_kwargs["klass"]) def ffill( self: FrameOrSeries, - axis=None, + axis: None | Axis = None, inplace: bool_t = False, - limit=None, + limit: None | int = None, downcast=None, ) -> FrameOrSeries | None: """ @@ -6492,13 +6451,53 @@ def ffill( pad = ffill + @overload + def bfill( + self: FrameOrSeries, + axis: None | Axis = ..., + inplace: Literal[False] = ..., + limit: None | int = ..., + downcast=..., + ) -> FrameOrSeries: + ... + + @overload + def bfill( + self: FrameOrSeries, + axis: None | Axis, + inplace: Literal[True], + limit: None | int = ..., + downcast=..., + ) -> None: + ... + + @overload + def bfill( + self: FrameOrSeries, + *, + inplace: Literal[True], + limit: None | int = ..., + downcast=..., + ) -> None: + ... + + @overload + def bfill( + self: FrameOrSeries, + axis: None | Axis = ..., + inplace: bool_t = ..., + limit: None | int = ..., + downcast=..., + ) -> FrameOrSeries | None: + ... 
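The ffill/bfill overloads above only spell out the existing runtime contract for type checkers: inplace=False returns a new object while inplace=True mutates and returns None. For example:

import pandas as pd

s = pd.Series([1.0, None, 3.0])
s.ffill()                  # new Series [1.0, 1.0, 3.0]
s.bfill(inplace=True)      # fills s in place and returns None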
+ @final @doc(klass=_shared_doc_kwargs["klass"]) def bfill( self: FrameOrSeries, - axis=None, + axis: None | Axis = None, inplace: bool_t = False, - limit=None, + limit: None | int = None, downcast=None, ) -> FrameOrSeries | None: """ @@ -7315,10 +7314,10 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): with np.errstate(all="ignore"): if upper is not None: - subset = self.to_numpy() <= upper + subset = (self <= upper).to_numpy() result = result.where(subset, upper, axis=None, inplace=False) if lower is not None: - subset = self.to_numpy() >= lower + subset = (self >= lower).to_numpy() result = result.where(subset, lower, axis=None, inplace=False) if np.any(mask): @@ -7341,8 +7340,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): return self._clip_with_scalar(None, threshold, inplace=inplace) return self._clip_with_scalar(threshold, None, inplace=inplace) - subset = method(threshold, axis=axis) | isna(self) - # GH #15390 # In order for where method to work, the threshold must # be transformed to NDFrame from other array like structure. @@ -7351,6 +7348,18 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): threshold = self._constructor(threshold, index=self.index) else: threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] + + # GH 40420 + # Treat missing thresholds as no bounds, not clipping the values + if is_list_like(threshold): + fill_value = np.inf if method.__name__ == "le" else -np.inf + threshold_inf = threshold.fillna(fill_value) + else: + threshold_inf = threshold + + subset = method(threshold_inf, axis=axis) | isna(self) + + # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) @overload @@ -7482,10 +7491,12 @@ def clip( ---------- lower : float or array_like, default None Minimum threshold value. All values below this - threshold will be set to it. + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. upper : float or array_like, default None Maximum threshold value. All values above this - threshold will be set to it. + threshold will be set to it. A missing + threshold (e.g `NA`) will not clip the value. axis : int or str axis name, optional Align object with lower and upper along the given axis. 
inplace : bool, default False @@ -7546,6 +7557,25 @@ def clip( 2 0 3 3 6 8 4 5 3 + + Clips using specific lower threshold per column element, with missing values: + + >>> t = pd.Series([2, -4, np.NaN, 6, 3]) + >>> t + 0 2.0 + 1 -4.0 + 2 NaN + 3 6.0 + 4 3.0 + dtype: float64 + + >>> df.clip(t, axis=0) + col_0 col_1 + 0 9 2 + 1 -3 -4 + 2 0 6 + 3 6 8 + 4 5 3 """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -7558,9 +7588,17 @@ def clip( # so ignore # GH 19992 # numpy doesn't drop a list-like bound containing NaN - if not is_list_like(lower) and np.any(isna(lower)): + isna_lower = isna(lower) + if not is_list_like(lower): + if np.any(isna_lower): + lower = None + elif np.all(isna_lower): lower = None - if not is_list_like(upper) and np.any(isna(upper)): + isna_upper = isna(upper) + if not is_list_like(upper): + if np.any(isna_upper): + upper = None + elif np.all(isna_upper): upper = None # GH 2747 (arguments were reversed) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 297681f1e10f5..2a2671374efc4 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,7 +1,4 @@ -from typing import ( - Optional, - Tuple, -) +from __future__ import annotations import numpy as np @@ -16,7 +13,7 @@ def recode_for_groupby( c: Categorical, sort: bool, observed: bool -) -> Tuple[Categorical, Optional[Categorical]]: +) -> tuple[Categorical, Categorical | None]: """ Code the categories to ensure we can groupby for categoricals. diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 3ba70baec1561..26070fcb5e89c 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -1,11 +1,10 @@ """Common utilities for Numba operations with groupby ops""" +from __future__ import annotations + import inspect from typing import ( Any, Callable, - Dict, - Optional, - Tuple, ) import numpy as np @@ -57,10 +56,10 @@ def f(values, index, ...): def generate_numba_agg_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: """ Generate a numba jitted agg function specified by values from engine_kwargs. @@ -117,10 +116,10 @@ def group_agg( def generate_numba_transform_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., np.ndarray], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: """ Generate a numba jitted transform function specified by values from engine_kwargs. 
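Stepping back to the clip() change in generic.py above (GH 40420): a missing per-element threshold is now treated as "no bound" by substituting +/-inf before the comparison. A simplified standalone sketch of that logic, not the pandas implementation itself:

import numpy as np
import pandas as pd

vals = pd.Series([9.0, -5.0, 0.0, 6.0, 2.0])
lower = pd.Series([2.0, -4.0, np.nan, 6.0, 3.0])

lower_inf = lower.fillna(-np.inf)        # NaN threshold -> never clip this element
vals.where(vals >= lower_inf, lower)     # -> [9.0, -4.0, 0.0, 6.0, 3.0]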
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6eddf8e9e8773..13279028ee6ff 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,7 +36,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import ( - maybe_cast_pointwise_result, + maybe_cast_result, maybe_cast_result_dtype, maybe_downcast_to_dtype, ) @@ -812,7 +812,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): result[label] = res out = lib.maybe_convert_objects(result, try_float=False) - out = maybe_cast_pointwise_result(out, obj.dtype, numeric_only=True) + out = maybe_cast_result(out, obj.dtype, numeric_only=True) return out, counts diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index db28ad710989d..d3756d6252c0a 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -180,7 +180,8 @@ def check_setitem_lengths(indexer, value, values) -> bool: elif isinstance(indexer, slice): if is_list_like(value): - if len(value) != length_of_indexer(indexer, values): + if len(value) != length_of_indexer(indexer, values) and values.ndim == 1: + # In case of two dimensional value is used row-wise and broadcasted raise ValueError( "cannot set using a slice indexer with a " "different length than the value" @@ -209,16 +210,24 @@ def validate_indices(indices: np.ndarray, n: int) -> None: Examples -------- - >>> validate_indices([1, 2], 3) - # OK - >>> validate_indices([1, -2], 3) - ValueError - >>> validate_indices([1, 2, 3], 3) - IndexError - >>> validate_indices([-1, -1], 0) - # OK - >>> validate_indices([0, 1], 0) - IndexError + >>> validate_indices(np.array([1, 2]), 3) # OK + + >>> validate_indices(np.array([1, -2]), 3) + Traceback (most recent call last): + ... + ValueError: negative dimensions are not allowed + + >>> validate_indices(np.array([1, 2, 3]), 3) + Traceback (most recent call last): + ... + IndexError: indices are out-of-bounds + + >>> validate_indices(np.array([-1, -1]), 0) # OK + + >>> validate_indices(np.array([0, 1]), 0) + Traceback (most recent call last): + ... + IndexError: indices are out-of-bounds """ if len(indices): min_idx = indices.min() diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e835990eb8d89..0624a1a64c9f8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -369,6 +369,15 @@ def fillna(self, value, downcast=None): return type(self)._simple_new(cat, name=self.name) + @doc(Index.unique) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = self._values.unique() + # Use _simple_new instead of _shallow_copy to ensure we keep dtype + # of result, not self. + return type(self)._simple_new(result, name=self.name) + def reindex( self, target, method=None, level=None, limit=None, tolerance=None ) -> tuple[Index, np.ndarray | None]: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 28144af36d6ea..28f563764ef10 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,6 +1,8 @@ +from __future__ import annotations + from typing import ( + Callable, Hashable, - Optional, ) import warnings @@ -48,12 +50,14 @@ class NumericIndex(Index): This is an abstract class. 
""" + _values: np.ndarray _default_dtype: np.dtype + _dtype_validation_metadata: tuple[Callable[..., bool], str] _is_numeric_dtype = True _can_hold_strings = False - def __new__(cls, data=None, dtype: Optional[Dtype] = None, copy=False, name=None): + def __new__(cls, data=None, dtype: Dtype | None = None, copy=False, name=None): name = maybe_extract_name(name, data, cls) subarr = cls._ensure_array(data, dtype, copy) @@ -94,17 +98,11 @@ def _ensure_array(cls, data, dtype, copy: bool): return subarr @classmethod - def _validate_dtype(cls, dtype: Dtype) -> None: + def _validate_dtype(cls, dtype: Dtype | None) -> None: if dtype is None: return - validation_metadata = { - "int64index": (is_signed_integer_dtype, "signed integer"), - "uint64index": (is_unsigned_integer_dtype, "unsigned integer"), - "float64index": (is_float_dtype, "float"), - "rangeindex": (is_signed_integer_dtype, "signed integer"), - } - - validation_func, expected = validation_metadata[cls._typ] + + validation_func, expected = cls._dtype_validation_metadata if not validation_func(dtype): raise ValueError( f"Incorrect `dtype` passed: expected {expected}, received {dtype}" @@ -253,9 +251,7 @@ def asi8(self) -> np.ndarray: FutureWarning, stacklevel=2, ) - # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", - # expected "ndarray") - return self._values.view(self._default_dtype) # type: ignore[return-value] + return self._values.view(self._default_dtype) class Int64Index(IntegerIndex): @@ -264,6 +260,7 @@ class Int64Index(IntegerIndex): _typ = "int64index" _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) + _dtype_validation_metadata = (is_signed_integer_dtype, "signed integer") _uint64_descr_args = { @@ -280,6 +277,7 @@ class UInt64Index(IntegerIndex): _typ = "uint64index" _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) + _dtype_validation_metadata = (is_unsigned_integer_dtype, "unsigned integer") # ---------------------------------------------------------------- # Indexing Methods @@ -311,6 +309,7 @@ class Float64Index(NumericIndex): _typ = "float64index" _engine_type = libindex.Float64Engine _default_dtype = np.dtype(np.float64) + _dtype_validation_metadata = (is_float_dtype, "float") @property def inferred_type(self) -> str: @@ -330,10 +329,7 @@ def astype(self, dtype, copy=True): elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype): # TODO(jreback); this can change once we have an EA Index type # GH 13149 - - # error: Argument 1 to "astype_nansafe" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "ndarray" - arr = astype_nansafe(self._values, dtype=dtype) # type: ignore[arg-type] + arr = astype_nansafe(self._values, dtype=dtype) return Int64Index(arr, name=self.name) return super().astype(dtype, copy=copy) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1b68ac9780ee1..b267472eba573 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -53,7 +52,10 @@ is_list_like_indexer, length_of_indexer, ) -from pandas.core.indexes.api import Index +from pandas.core.indexes.api import ( + Index, + MultiIndex, +) if TYPE_CHECKING: from pandas import ( @@ -642,7 +644,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, ABCMultiIndex) and self.name != "iloc": + if 
isinstance(ax, MultiIndex) and self.name != "iloc": with suppress(TypeError, KeyError, InvalidIndexError): # TypeError e.g. passed a bool return ax.get_loc(key) @@ -690,7 +692,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if ( axis == column_axis - and not isinstance(self.obj.columns, ABCMultiIndex) + and not isinstance(self.obj.columns, MultiIndex) and is_list_like_indexer(key) and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) @@ -699,7 +701,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): keys = self.obj.columns.union(key, sort=False) self.obj._mgr = self.obj._mgr.reindex_axis( - keys, axis=0, copy=False, consolidate=False, only_slice=True + keys, axis=0, consolidate=False, only_slice=True ) def __setitem__(self, key, value): @@ -756,7 +758,7 @@ def _is_nested_tuple_indexer(self, tup: tuple) -> bool: ------- bool """ - if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): + if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -817,7 +819,7 @@ def _getitem_lowerdim(self, tup: tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": + if isinstance(ax0, MultiIndex) and self.name != "iloc": with suppress(IndexingError): return self._handle_lowerdim_multi_index_axis0(tup) @@ -996,7 +998,7 @@ def _is_scalar_access(self, key: tuple) -> bool: return False ax = self.obj.axes[i] - if isinstance(ax, ABCMultiIndex): + if isinstance(ax, MultiIndex): return False if isinstance(k, str) and ax._supports_partial_string_indexing: @@ -1142,7 +1144,7 @@ def _getitem_axis(self, key, axis: int): elif is_list_like_indexer(key): # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): + if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): if hasattr(key, "ndim") and key.ndim > 1: raise ValueError("Cannot index with multidimensional key") @@ -1205,20 +1207,20 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): is_int_index = labels.is_integer() is_int_positional = is_integer(key) and not is_int_index - if is_scalar(key) or isinstance(labels, ABCMultiIndex): + if is_scalar(key) or isinstance(labels, MultiIndex): # Otherwise get_loc will raise InvalidIndexError # if we are a label return me try: return labels.get_loc(key) except LookupError: - if isinstance(key, tuple) and isinstance(labels, ABCMultiIndex): + if isinstance(key, tuple) and isinstance(labels, MultiIndex): if len(key) == labels.nlevels: return {"key": key} raise except InvalidIndexError: # GH35015, using datetime as column indices raises exception - if not isinstance(labels, ABCMultiIndex): + if not isinstance(labels, MultiIndex): raise except TypeError: pass @@ -1620,7 +1622,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # GH 10360, GH 27841 if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, ABCMultiIndex) and not ( + if isinstance(ax, MultiIndex) and not ( is_integer(i) or com.is_null_slice(i) ): take_split_path = True @@ -1819,7 +1821,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str sub_indexer = list(indexer) pi = indexer[0] - multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) + 
multiindex_indexer = isinstance(self.obj.columns, MultiIndex) unique_cols = value.columns.is_unique @@ -2163,8 +2165,8 @@ def _align_frame(self, indexer, df: DataFrame): # we have a multi-index and are trying to align # with a particular, level GH3738 if ( - isinstance(ax, ABCMultiIndex) - and isinstance(df.index, ABCMultiIndex) + isinstance(ax, MultiIndex) + and isinstance(df.index, MultiIndex) and ax.nlevels != df.index.nlevels ): raise TypeError( @@ -2428,7 +2430,7 @@ def is_nested_tuple(tup, labels) -> bool: for k in tup: if is_list_like(k) or isinstance(k, slice): - return isinstance(labels, ABCMultiIndex) + return isinstance(labels, MultiIndex) return False diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index d6b76510c68ab..37e07af71213e 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -6,7 +6,7 @@ 2) Use only functions exposed here (or in core.internals) """ -from typing import Optional +from __future__ import annotations import numpy as np @@ -19,10 +19,12 @@ ) from pandas.core.arrays import DatetimeArray +from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, check_ndim, + ensure_block_shape, extract_pandas_array, get_block_type, maybe_coerce_values, @@ -30,7 +32,7 @@ def make_block( - values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None + values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: """ This is a pseudo-public analogue to blocks.new_block. @@ -60,12 +62,17 @@ def make_block( placement = BlockPlacement(placement) ndim = maybe_infer_ndim(values, placement, ndim) + if is_datetime64tz_dtype(values.dtype): + # GH#41168 ensure we can pass 1D dt64tz values + values = extract_array(values, extract_numpy=True) + values = ensure_block_shape(values, ndim) + check_ndim(values, placement, ndim) values = maybe_coerce_values(values) return klass(values, ndim=ndim, placement=placement) -def maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: +def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int: """ If `ndim` is not provided, infer it from placment and values. """ diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 7afb6e5d7e544..3a8ff8237b62f 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -11,16 +11,14 @@ from pandas._typing import ( DtypeObj, Shape, + final, ) from pandas.errors import AbstractMethodError from pandas.core.dtypes.cast import find_common_type from pandas.core.base import PandasObject -from pandas.core.indexes.api import ( - Index, - ensure_index, -) +from pandas.core.indexes.api import Index T = TypeVar("T", bound="DataManager") @@ -59,31 +57,26 @@ def reindex_indexer( ) -> T: raise AbstractMethodError(self) + @final def reindex_axis( - self, - new_index, + self: T, + new_index: Index, axis: int, - method=None, - limit=None, fill_value=None, - copy: bool = True, consolidate: bool = True, only_slice: bool = False, - ): + ) -> T: """ Conform data manager to new index. 
""" - new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex( - new_index, method=method, limit=limit - ) + new_index, indexer = self.axes[axis].reindex(new_index) return self.reindex_indexer( new_index, indexer, axis=axis, fill_value=fill_value, - copy=copy, + copy=False, consolidate=consolidate, only_slice=only_slice, ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 603cc6a6ff1f2..61396fdf372d5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -42,6 +42,8 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_1d_only_ea_obj, is_categorical_dtype, is_dtype_equal, is_extension_array_dtype, @@ -224,14 +226,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: # expected "ndarray") return self.values # type: ignore[return-value] - @final - def get_block_values_for_json(self) -> np.ndarray: - """ - This is used in the JSON C code. - """ - # TODO(EA2D): reshape will be unnecessary with 2D EAs - return np.asarray(self.values).reshape(self.shape) - @final @cache_readonly def fill_value(self): @@ -415,7 +409,11 @@ def _split_op_result(self, result) -> list[Block]: # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] for i, loc in enumerate(self._mgr_locs): - vals = result[i] + if not is_1d_only_ea_obj(result): + vals = result[i : i + 1] + else: + vals = result[i] + block = self.make_block(values=vals, placement=loc) nbs.append(block) return nbs @@ -1670,7 +1668,7 @@ class NumericBlock(NumpyBlock): is_numeric = True -class NDArrayBackedExtensionBlock(EABackedBlock): +class NDArrayBackedExtensionBlock(libinternals.Block, EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray """ @@ -1683,11 +1681,6 @@ def is_view(self) -> bool: # check the ndarray values of the DatetimeIndex values return self.values._ndarray.base is not None - def iget(self, key): - # GH#31649 we need to wrap scalars in Timestamp/Timedelta - # TODO(EA2D): this can be removed if we ever have 2D EA - return self.values.reshape(self.shape)[key] - def setitem(self, indexer, value): if not self._can_hold_element(value): # TODO: general case needs casting logic. @@ -1707,24 +1700,21 @@ def putmask(self, mask, new) -> list[Block]: if not self._can_hold_element(new): return self.astype(object).putmask(mask, new) - # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.values.reshape(self.shape) + arr = self.values arr.T.putmask(mask, new) return [self] def where(self, other, cond, errors="raise") -> list[Block]: # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.values.reshape(self.shape) + arr = self.values cond = extract_bool_array(cond) try: res_values = arr.T.where(cond, other).T except (ValueError, TypeError): - return super().where(other, cond, errors=errors) + return Block.where(self, other, cond, errors=errors) - # TODO(EA2D): reshape not needed with 2D EAs - res_values = res_values.reshape(self.values.shape) nb = self.make_block_same_class(res_values) return [nb] @@ -1748,15 +1738,13 @@ def diff(self, n: int, axis: int = 0) -> list[Block]: The arguments here are mimicking shift so they are called correctly by apply. 
""" - # TODO(EA2D): reshape not necessary with 2D EAs - values = self.values.reshape(self.shape) + values = self.values new_values = values - values.shift(n, axis=axis) return [self.make_block(new_values)] def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Block]: - # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs - values = self.values.reshape(self.shape) + values = self.values new_values = values.shift(periods, fill_value=fill_value, axis=axis) return [self.make_block_same_class(new_values)] @@ -1776,7 +1764,7 @@ def fillna( return [self.make_block_same_class(values=new_values)] -class DatetimeLikeBlock(libinternals.Block, NDArrayBackedExtensionBlock): +class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Block for datetime64[ns], timedelta64[ns].""" __slots__ = () @@ -1784,23 +1772,15 @@ class DatetimeLikeBlock(libinternals.Block, NDArrayBackedExtensionBlock): values: DatetimeArray | TimedeltaArray -class DatetimeTZBlock(ExtensionBlock, NDArrayBackedExtensionBlock): +class DatetimeTZBlock(DatetimeLikeBlock): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray __slots__ = () is_extension = True - is_numeric = False - - diff = NDArrayBackedExtensionBlock.diff - where = NDArrayBackedExtensionBlock.where - putmask = NDArrayBackedExtensionBlock.putmask - fillna = NDArrayBackedExtensionBlock.fillna - - get_values = NDArrayBackedExtensionBlock.get_values - - is_view = NDArrayBackedExtensionBlock.is_view + _validate_ndim = True + _can_consolidate = False class ObjectBlock(NumpyBlock): @@ -1912,7 +1892,9 @@ def get_block_type(values, dtype: Dtype | None = None): cls = ExtensionBlock elif isinstance(dtype, CategoricalDtype): cls = CategoricalBlock - elif vtype is Timestamp: + # error: Non-overlapping identity check (left operand type: "Type[generic]", + # right operand type: "Type[Timestamp]") + elif vtype is Timestamp: # type: ignore[comparison-overlap] cls = DatetimeTZBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here @@ -1967,7 +1949,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int): f"values.ndim > ndim [{values.ndim} > {ndim}]" ) - elif isinstance(values.dtype, np.dtype): + elif not is_1d_only_ea_dtype(values.dtype): # TODO(EA2D): special case not needed with 2D EAs if values.ndim != ndim: raise ValueError( @@ -1981,7 +1963,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int): ) elif ndim == 2 and len(placement) != 1: # TODO(EA2D): special case unnecessary with 2D EAs - raise AssertionError("block.size != values.size") + raise ValueError("need to split") def extract_pandas_array( @@ -2026,8 +2008,9 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ Reshape if possible to have values.ndim == ndim. """ + if values.ndim < ndim: - if not is_extension_array_dtype(values.dtype): + if not is_1d_only_ea_dtype(values.dtype): # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. 
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 0b0013eeb7147..9642b30ab91ca 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -5,6 +5,7 @@ from typing import ( TYPE_CHECKING, Sequence, + cast, ) import numpy as np @@ -23,6 +24,8 @@ find_common_type, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, @@ -40,7 +43,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, ) @@ -190,17 +192,17 @@ def concatenate_managers( blocks = [] for placement, join_units in concat_plan: + unit = join_units[0] + blk = unit.block if len(join_units) == 1 and not join_units[0].indexers: - b = join_units[0].block - values = b.values + values = blk.values if copy: values = values.copy() else: values = values.view() - b = b.make_block_same_class(values, placement=placement) + fastpath = True elif _is_uniform_join_units(join_units): - blk = join_units[0].block vals = [ju.block.values for ju in join_units] if not blk.is_extension: @@ -210,19 +212,21 @@ def concatenate_managers( values = np.concatenate(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals) - values = ensure_block_shape(values, ndim=2) + values = concat_compat(vals, axis=1) + values = ensure_block_shape(values, blk.ndim) values = ensure_wrapped_if_datetimelike(values) - if blk.values.dtype == values.dtype: - # Fast-path - b = blk.make_block_same_class(values, placement=placement) - else: - b = new_block(values, placement=placement, ndim=blk.ndim) + fastpath = blk.values.dtype == values.dtype + else: + values = _concatenate_join_units(join_units, concat_axis, copy=copy) + fastpath = False + + if fastpath: + b = blk.make_block_same_class(values, placement=placement) else: - new_values = _concatenate_join_units(join_units, concat_axis, copy=copy) - b = new_block(new_values, placement=placement, ndim=len(axes)) + b = new_block(values, placement=placement, ndim=len(axes)) + blocks.append(b) return BlockManager(tuple(blocks), axes) @@ -412,13 +416,16 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: fill_value = None if is_datetime64tz_dtype(empty_dtype): - # TODO(EA2D): special case unneeded with 2D EAs - i8values = np.full(self.shape[1], fill_value.value) + i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) + elif is_extension_array_dtype(blk_dtype): pass - elif isinstance(empty_dtype, ExtensionDtype): + + elif is_1d_only_ea_dtype(empty_dtype): + empty_dtype = cast(ExtensionDtype, empty_dtype) cls = empty_dtype.construct_array_type() + missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols @@ -429,6 +436,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: else: # NB: we should never get here with empty_dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish + empty_dtype = cast(np.dtype, empty_dtype) missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) @@ -438,12 +446,10 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: # preserve these for validation in concat_compat return self.block.values - if self.block.is_bool and not isinstance(self.block.values, Categorical): + if self.block.is_bool: # External code 
requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values - elif self.block.is_extension: - values = self.block.values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. @@ -493,15 +499,17 @@ def _concatenate_join_units( concat_values = concat_values.copy() else: concat_values = concat_values.copy() - elif any(isinstance(t, ExtensionArray) and t.ndim == 1 for t in to_concat): + + elif any(is_1d_only_ea_obj(t) for t in to_concat): + # TODO(EA2D): special case not needed if all EAs used HybridBlocks + # NB: we are still assuming here that Hybrid blocks have shape (1, N) # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) + # error: Invalid index type "Tuple[int, slice]" for # "Union[ExtensionArray, ndarray]"; expected type "Union[int, slice, ndarray]" to_concat = [ - t - if (isinstance(t, ExtensionArray) and t.ndim == 1) - else t[0, :] # type: ignore[index] + t if is_1d_only_ea_obj(t) else t[0, :] # type: ignore[index] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) @@ -524,9 +532,11 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): elif dtype.kind in ["f", "c"]: return dtype.type("NaN") elif dtype.kind == "b": + # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: if not has_none_blocks: + # different from missing.na_value_for_dtype return None return np.nan elif dtype.kind == "O": diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2960fb292818a..884a2cec171de 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -32,7 +32,9 @@ maybe_upcast, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, is_integer_dtype, @@ -55,9 +57,11 @@ ) from pandas.core.arrays import ( Categorical, - DatetimeArray, + ExtensionArray, + TimedeltaArray, ) from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, extract_array, sanitize_array, ) @@ -205,10 +209,11 @@ def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> list[np.ndarr return new_arrays -def mgr_to_mgr(mgr, typ: str): +def mgr_to_mgr(mgr, typ: str, copy: bool = True): """ Convert to specific type of Manager. Does not copy if the type is already - correct. Does not guarantee a copy otherwise. + correct. Does not guarantee a copy otherwise. `copy` keyword only controls + whether conversion from Block->ArrayManager copies the 1D arrays. 
""" new_mgr: Manager @@ -227,10 +232,15 @@ def mgr_to_mgr(mgr, typ: str): new_mgr = mgr else: if mgr.ndim == 2: - arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] + arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))] + if copy: + arrays = [arr.copy() for arr in arrays] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: - new_mgr = SingleArrayManager([mgr.internal_values()], [mgr.index]) + array = mgr.internal_values() + if copy: + array = array.copy() + new_mgr = SingleArrayManager([array], [mgr.index]) else: raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr @@ -259,7 +269,8 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - if is_extension_array_dtype(values) or isinstance(dtype, ExtensionDtype): + vdtype = getattr(values, "dtype", None) + if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: @@ -274,9 +285,18 @@ def ndarray_to_mgr( return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) - # by definition an array here - # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=copy) + if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): + # i.e. Datetime64TZ + values = extract_array(values, extract_numpy=True) + if copy: + values = values.copy() + if values.ndim == 1: + values = values.reshape(-1, 1) + + else: + # by definition an array here + # the dtypes will be coerced to a single dtype + values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape @@ -304,10 +324,30 @@ def ndarray_to_mgr( index, columns = _get_axes( values.shape[0], values.shape[1], index=index, columns=columns ) - values = values.T _check_values_indices_shape_match(values, index, columns) + if typ == "array": + + if issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + if dtype is None and is_object_dtype(values.dtype): + arrays = [ + ensure_wrapped_if_datetimelike( + maybe_infer_to_datetimelike(values[:, i].copy()) + ) + for i in range(values.shape[1]) + ] + else: + if is_datetime_or_timedelta_dtype(values.dtype): + values = ensure_wrapped_if_datetimelike(values) + arrays = [values[:, i].copy() for i in range(values.shape[1])] + + return ArrayManager(arrays, [index, columns], verify_integrity=False) + + values = values.T + # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type @@ -320,7 +360,6 @@ def ndarray_to_mgr( dvals_list = [ensure_block_shape(dval, 2) for dval in dvals_list] # TODO: What about re-joining object columns? 
- dvals_list = [maybe_squeeze_dt64tz(x) for x in dvals_list] block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) @@ -328,12 +367,10 @@ def ndarray_to_mgr( else: datelike_vals = maybe_infer_to_datetimelike(values) - datelike_vals = maybe_squeeze_dt64tz(datelike_vals) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: - new_values = maybe_squeeze_dt64tz(values) - nb = new_block(new_values, placement=slice(len(columns)), ndim=2) + nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: @@ -349,31 +386,17 @@ def _check_values_indices_shape_match( Check that the shape implied by our axes matches the actual shape of the data. """ - if values.shape[0] != len(columns): + if values.shape[1] != len(columns) or values.shape[0] != len(index): # Could let this raise in Block constructor, but we get a more # helpful exception message this way. - if values.shape[1] == 0: + if values.shape[0] == 0: raise ValueError("Empty data passed with indices specified.") - passed = values.T.shape + passed = values.shape implied = (len(index), len(columns)) raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") -def maybe_squeeze_dt64tz(dta: ArrayLike) -> ArrayLike: - """ - If we have a tzaware DatetimeArray with shape (1, N), squeeze to (N,) - """ - # TODO(EA2D): kludge not needed with 2D EAs - if isinstance(dta, DatetimeArray) and dta.ndim == 2 and dta.tz is not None: - assert dta.shape[0] == 1 - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, NaTType]]", variable has - # type "Union[ExtensionArray, ndarray]") - dta = dta[0] # type: ignore[assignment] - return dta - - def dict_to_mgr( data: dict, index, @@ -396,7 +419,6 @@ def dict_to_mgr( arrays = Series(data, index=columns, dtype=object) data_names = arrays.index - missing = arrays.isna() if index is None: # GH10856 @@ -481,13 +503,23 @@ def treat_as_nested(data) -> bool: """ Check if we should use nested_data_to_arrays. """ - return len(data) > 0 and is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1 + return ( + len(data) > 0 + and is_list_like(data[0]) + and getattr(data[0], "ndim", 1) == 1 + and not (isinstance(data, ExtensionArray) and data.ndim == 2) + ) # --------------------------------------------------------------------- def _prep_ndarray(values, copy: bool = True) -> np.ndarray: + if isinstance(values, TimedeltaArray): + # On older numpy, np.asarray below apparently does not call __array__, + # so nanoseconds get dropped. 
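The reworked _check_values_indices_shape_match compares values.shape directly against (len(index), len(columns)) with no transpose, so the familiar error now reads in row-major terms; for example:

import numpy as np
import pandas as pd

try:
    pd.DataFrame(np.zeros((3, 2)), columns=["a"])   # two data columns, one label
except ValueError as err:
    # e.g. "Shape of passed values is (3, 2), indices imply (3, 1)"
    print(err)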
+ values = values._ndarray + if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 97d605e2fa2d1..373d3566e1e8a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1116,6 +1116,21 @@ def fast_xs(self, loc: int) -> ArrayLike: return result + def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + assert isinstance(slobj, slice), type(slobj) + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + elif axis == 1: + new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] + else: + raise IndexError("Requested axis not found in manager") + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + + return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) + def iget(self, i: int) -> SingleBlockManager: """ Return the data as a SingleBlockManager. diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 54588eafc3fa0..19fd48a772493 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1056,7 +1056,7 @@ def nanargmax( [ 6., 7., nan], [ 9., 10., nan]]) >>> nanops.nanargmax(arr, axis=1) - array([2, 2, 1, 1], dtype=int64) + array([2, 2, 1, 1]) """ values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask) # error: Need type annotation for 'result' @@ -1102,7 +1102,7 @@ def nanargmin( [nan, 7., 8.], [nan, 10., 11.]]) >>> nanops.nanargmin(arr, axis=1) - array([0, 0, 1, 1], dtype=int64) + array([0, 0, 1, 1]) """ values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask) # error: Need type annotation for 'result' @@ -1598,7 +1598,7 @@ def _ensure_numeric(x): elif not (is_float(x) or is_integer(x) or is_complex(x)): try: x = float(x) - except ValueError: + except (TypeError, ValueError): # e.g. "1+1j" or "foo" try: x = complex(x) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f6bde348888a1..9cccf1cff60a1 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -35,6 +35,7 @@ comparison_op, get_array_op, logical_op, + maybe_prepare_scalar_for_op, ) from pandas.core.ops.common import ( # noqa:F401 get_op_result_name, @@ -428,6 +429,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): axis = self._get_axis_number(axis) if axis is not None else 1 + other = maybe_prepare_scalar_for_op(other, self.shape) self, other = align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index ba9da8d648597..39c6fa13f79a4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,7 +2,7 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. 
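The widened except clause in _ensure_numeric matters because float() signals failure differently depending on the input: strings raise ValueError, while actual complex objects raise TypeError, and both should fall through to the complex() attempt.

try:
    float(1 + 1j)                 # TypeError: can't convert complex to float
except TypeError:
    print(complex(1 + 1j))        # now reachable thanks to the broader except

try:
    float("1+1j")                 # ValueError: could not convert string to float
except ValueError:
    print(complex("1+1j"))        # the case that was already handled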
""" -from datetime import timedelta +import datetime from functools import partial import operator from typing import Any @@ -10,11 +10,13 @@ import numpy as np from pandas._libs import ( + NaT, Timedelta, Timestamp, lib, ops as libops, ) +from pandas._libs.tslibs import BaseOffset from pandas._typing import ( ArrayLike, Shape, @@ -154,8 +156,14 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): ------ TypeError : invalid operation """ + if isinstance(right, str): + # can never use numexpr + func = op + else: + func = partial(expressions.evaluate, op) + try: - result = expressions.evaluate(op, left, right) + result = func(left, right) except TypeError: if is_object_dtype(left) or is_object_dtype(right) and not is_cmp: # For object dtype, fallback to a masked operation (only operating @@ -194,21 +202,26 @@ def arithmetic_op(left: ArrayLike, right: Any, op): ndarray or ExtensionArray Or a 2-tuple of these in the case of divmod or rdivmod. """ - - # NB: We assume that extract_array has already been called - # on `left` and `right`. + # NB: We assume that extract_array and ensure_wrapped_if_datetimelike + # have already been called on `left` and `right`, + # and `maybe_prepare_scalar_for_op` has already been called on `right` # We need to special-case datetime64/timedelta64 dtypes (e.g. because numpy # casts integer dtypes to timedelta64 when operating with timedelta64 - GH#22390) - lvalues = ensure_wrapped_if_datetimelike(left) - rvalues = ensure_wrapped_if_datetimelike(right) - rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) - - if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): - # Timedelta is included because numexpr will fail on it, see GH#31457 - res_values = op(lvalues, rvalues) + if ( + should_extension_dispatch(left, right) + or isinstance(right, (Timedelta, BaseOffset, Timestamp)) + or right is NaT + ): + # Timedelta/Timestamp and other custom scalars are included in the check + # because numexpr will fail on it, see GH#31457 + res_values = op(left, right) else: - res_values = _na_arithmetic_op(lvalues, rvalues, op) + # TODO we should handle EAs consistently and move this check before the if/else + # (https://github.com/pandas-dev/pandas/issues/41165) + _bool_arith_check(op, left, right) + + res_values = _na_arithmetic_op(left, right, op) return res_values @@ -249,7 +262,10 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if should_extension_dispatch(lvalues, rvalues): + if should_extension_dispatch(lvalues, rvalues) or ( + (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) + and not is_object_dtype(lvalues.dtype) + ): # Call the method on lvalues res_values = op(lvalues, rvalues) @@ -264,7 +280,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) - elif is_object_dtype(lvalues.dtype): + elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: @@ -422,7 +438,7 @@ def get_array_op(op): raise NotImplementedError(op_name) -def _maybe_upcast_for_op(obj, shape: Shape): +def maybe_prepare_scalar_for_op(obj, shape: Shape): """ Cast non-pandas objects to pandas types to unify behavior of arithmetic and comparison operations. 
@@ -441,11 +457,14 @@ def _maybe_upcast_for_op(obj, shape: Shape): Be careful to call this *after* determining the `name` attribute to be attached to the result of the arithmetic operation. """ - if type(obj) is timedelta: + if type(obj) is datetime.timedelta: # GH#22390 cast up to Timedelta to rely on Timedelta # implementation; otherwise operation against numeric-dtype # raises TypeError return Timedelta(obj) + elif type(obj) is datetime.datetime: + # cast up to Timestamp to rely on Timestamp implementation, see Timedelta above + return Timestamp(obj) elif isinstance(obj, np.datetime64): # GH#28080 numpy casts integer-dtype to datetime64 when doing # array[int] + datetime64, which we do not allow @@ -476,3 +495,28 @@ def _maybe_upcast_for_op(obj, shape: Shape): return Timedelta(obj) return obj + + +_BOOL_OP_NOT_ALLOWED = { + operator.truediv, + roperator.rtruediv, + operator.floordiv, + roperator.rfloordiv, + operator.pow, + roperator.rpow, +} + + +def _bool_arith_check(op, a, b): + """ + In contrast to numpy, pandas raises an error for certain operations + with booleans. + """ + if op in _BOOL_OP_NOT_ALLOWED: + if is_bool_dtype(a.dtype) and ( + is_bool_dtype(b) or isinstance(b, (bool, np.bool_)) + ): + op_name = op.__name__.strip("_").lstrip("r") + raise NotImplementedError( + f"operator '{op_name}' not implemented for bool dtypes" + ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 58003c10db9e0..e69de29bb2d1d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,2072 +0,0 @@ -from __future__ import annotations - -import copy -from datetime import timedelta -from textwrap import dedent -from typing import ( - Callable, - no_type_check, -) - -import numpy as np - -from pandas._libs import lib -from pandas._libs.tslibs import ( - IncompatibleFrequency, - NaT, - Period, - Timedelta, - Timestamp, - to_offset, -) -from pandas._typing import ( - T, - TimedeltaConvertibleTypes, - TimestampConvertibleTypes, - final, -) -from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) - -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, -) - -import pandas.core.algorithms as algos -from pandas.core.apply import ResamplerWindowApply -from pandas.core.base import ( - DataError, - PandasObject, -) -import pandas.core.common as com -from pandas.core.generic import ( - NDFrame, - _shared_docs, -) -from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import ( - BaseGroupBy, - GroupBy, - _pipe_template, - get_groupby, -) -from pandas.core.groupby.grouper import Grouper -from pandas.core.groupby.ops import BinGrouper -from pandas.core.indexes.api import Index -from pandas.core.indexes.datetimes import ( - DatetimeIndex, - date_range, -) -from pandas.core.indexes.period import ( - PeriodIndex, - period_range, -) -from pandas.core.indexes.timedeltas import ( - TimedeltaIndex, - timedelta_range, -) - -from pandas.tseries.frequencies import ( - is_subperiod, - is_superperiod, -) -from pandas.tseries.offsets import ( - DateOffset, - Day, - Nano, - Tick, -) - -_shared_docs_kwargs: dict[str, str] = {} - - -class Resampler(BaseGroupBy, PandasObject): - """ - Class for resampling datetimelike data, a groupby-like operation. - See aggregate, transform, and apply functions on this object. - - It's easiest to use obj.resample(...) to use Resampler. 
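With stdlib datetime scalars now upcast to Timestamp before dispatch, the other addition here is the bool-dtype guard: division-like ops between boolean data are rejected rather than handed to numpy, as a quick check shows.

import pandas as pd

a = pd.Series([True, False])
try:
    a / a
except NotImplementedError as err:
    print(err)   # operator 'truediv' not implemented for bool dtypes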
- - Parameters - ---------- - obj : pandas object - groupby : a TimeGrouper object - axis : int, default 0 - kind : str or None - 'period', 'timestamp' to override default index treatment - - Returns - ------- - a Resampler of the appropriate type - - Notes - ----- - After resampling, see aggregate, apply, and transform functions. - """ - - # to the groupby descriptor - _attributes = [ - "freq", - "axis", - "closed", - "label", - "convention", - "loffset", - "kind", - "origin", - "offset", - ] - - def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): - self.groupby = groupby - self.keys = None - self.sort = True - self.axis = axis - self.kind = kind - self.squeeze = False - self.group_keys = True - self.as_index = True - self.exclusions = set() - self.binner = None - # error: Incompatible types in assignment (expression has type "None", variable - # has type "BaseGrouper") - self.grouper = None # type: ignore[assignment] - - if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True) - - @final - def _shallow_copy(self, obj, **kwargs): - """ - return a new object with the replacement attributes - """ - if isinstance(obj, self._constructor): - obj = obj.obj - for attr in self._attributes: - if attr not in kwargs: - kwargs[attr] = getattr(self, attr) - return self._constructor(obj, **kwargs) - - def __str__(self) -> str: - """ - Provide a nice str repr of our rolling object. - """ - attrs = ( - f"{k}={getattr(self.groupby, k)}" - for k in self._attributes - if getattr(self.groupby, k, None) is not None - ) - return f"{type(self).__name__} [{', '.join(attrs)}]" - - def __getattr__(self, attr: str): - if attr in self._internal_names_set: - return object.__getattribute__(self, attr) - if attr in self._attributes: - return getattr(self.groupby, attr) - if attr in self.obj: - return self[attr] - - return object.__getattribute__(self, attr) - - def __iter__(self): - """ - Resampler iterator. - - Returns - ------- - Generator yielding sequence of (name, subsetted object) - for each group. - - See Also - -------- - GroupBy.__iter__ : Generator yielding sequence for each group. - """ - self._set_binner() - return super().__iter__() - - @property - def obj(self): - return self.groupby.obj - - @property - def ax(self): - return self.groupby.ax - - @property - def _typ(self) -> str: - """ - Masquerade for compat as a Series or a DataFrame. - """ - if isinstance(self._selected_obj, ABCSeries): - return "series" - return "dataframe" - - @property - def _from_selection(self) -> bool: - """ - Is the resampling from a DataFrame column or MultiIndex level. - """ - # upsampling and PeriodIndex resampling do not work - # with selection, this state used to catch and raise an error - return self.groupby is not None and ( - self.groupby.key is not None or self.groupby.level is not None - ) - - def _convert_obj(self, obj): - """ - Provide any conversions for the object in order to correctly handle. - - Parameters - ---------- - obj : the object to be resampled - - Returns - ------- - obj : converted object - """ - return obj._consolidate() - - def _get_binner_for_time(self): - raise AbstractMethodError(self) - - def _set_binner(self): - """ - Setup our binners. - - Cache these as we are an immutable object - """ - if self.binner is None: - self.binner, self.grouper = self._get_binner() - - def _get_binner(self): - """ - Create the BinGrouper, assume that self.set_grouper(obj) - has already been called. 
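For orientation while reading the class: a Resampler is normally obtained through obj.resample(...), which this module wires up via TimeGrouper/get_resampler further down; minimal usage looks like this.

import pandas as pd

s = pd.Series(range(4), index=pd.date_range("2012-08-02", periods=4, freq="D"))
r = s.resample("2D")     # a DatetimeIndexResampler
print(r.sum())           # one aggregated value per 2-day bin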
- """ - binner, bins, binlabels = self._get_binner_for_time() - assert len(bins) == len(binlabels) - bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) - return binner, bin_grouper - - def _assure_grouper(self): - """ - Make sure that we are creating our binner & grouper. - """ - self._set_binner() - - @Substitution( - klass="Resampler", - examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each 2-day period's maximum and minimum - value in one pass, you can do - - >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 1 - 2012-08-04 1""", - ) - @Appender(_pipe_template) - def pipe( - self, - func: Callable[..., T] | tuple[Callable[..., T], str], - *args, - **kwargs, - ) -> T: - return super().pipe(func, *args, **kwargs) - - _agg_see_also_doc = dedent( - """ - See Also - -------- - DataFrame.groupby.aggregate : Aggregate using callable, string, dict, - or list of string/callables. - DataFrame.resample.transform : Transforms the Series on each group - based on the given function. - DataFrame.aggregate: Aggregate using one or more - operations over the specified axis. - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> s = pd.Series([1,2,3,4,5], - index=pd.date_range('20130101', periods=5,freq='s')) - 2013-01-01 00:00:00 1 - 2013-01-01 00:00:01 2 - 2013-01-01 00:00:02 3 - 2013-01-01 00:00:03 4 - 2013-01-01 00:00:04 5 - Freq: S, dtype: int64 - - >>> r = s.resample('2s') - DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left, - label=left, convention=start] - - >>> r.agg(np.sum) - 2013-01-01 00:00:00 3 - 2013-01-01 00:00:02 7 - 2013-01-01 00:00:04 5 - Freq: 2S, dtype: int64 - - >>> r.agg(['sum','mean','max']) - sum mean max - 2013-01-01 00:00:00 3 1.5 2 - 2013-01-01 00:00:02 7 3.5 4 - 2013-01-01 00:00:04 5 5.0 5 - - >>> r.agg({'result' : lambda x: x.mean() / x.std(), - 'total' : np.sum}) - total result - 2013-01-01 00:00:00 3 2.121320 - 2013-01-01 00:00:02 7 4.949747 - 2013-01-01 00:00:04 5 NaN - """ - ) - - @doc( - _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - klass="DataFrame", - axis="", - ) - def aggregate(self, func, *args, **kwargs): - - self._set_binner() - result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() - if result is None: - how = func - grouper = None - result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) - - result = self._apply_loffset(result) - return result - - agg = aggregate - apply = aggregate - - def transform(self, arg, *args, **kwargs): - """ - Call function producing a like-indexed Series on each group and return - a Series with the transformed values. - - Parameters - ---------- - arg : function - To apply to each group. Should return a Series with the same index. - - Returns - ------- - transformed : Series - - Examples - -------- - >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) - """ - return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs) - - def _downsample(self, f): - raise AbstractMethodError(self) - - def _upsample(self, f, limit=None, fill_value=None): - raise AbstractMethodError(self) - - def _gotitem(self, key, ndim: int, subset=None): - """ - Sub-classes to define. Return a sliced object. 
- - Parameters - ---------- - key : string / list of selections - ndim : {1, 2} - requested ndim of result - subset : object, default None - subset to act on - """ - self._set_binner() - grouper = self.grouper - if subset is None: - subset = self.obj - grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) - - # try the key selection - try: - return grouped[key] - except KeyError: - return grouped - - def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): - """ - Re-evaluate the obj with a groupby aggregation. - """ - if grouper is None: - self._set_binner() - grouper = self.grouper - - obj = self._selected_obj - - grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) - - try: - if isinstance(obj, ABCDataFrame) and callable(how): - # Check if the function is reducing or not. - result = grouped._aggregate_item_by_item(how, *args, **kwargs) - else: - result = grouped.aggregate(how, *args, **kwargs) - except (DataError, AttributeError, KeyError): - # we have a non-reducing function; try to evaluate - # alternatively we want to evaluate only a column of the input - result = grouped.apply(how, *args, **kwargs) - except ValueError as err: - if "Must produce aggregated value" in str(err): - # raised in _aggregate_named - pass - elif "len(index) != len(labels)" in str(err): - # raised in libgroupby validation - pass - elif "No objects to concatenate" in str(err): - # raised in concat call - # In tests this is reached via either - # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique - pass - else: - raise - - # we have a non-reducing function - # try to evaluate - result = grouped.apply(how, *args, **kwargs) - - result = self._apply_loffset(result) - return self._wrap_result(result) - - def _apply_loffset(self, result): - """ - If loffset is set, offset the result index. - - This is NOT an idempotent routine, it will be applied - exactly once to the result. - - Parameters - ---------- - result : Series or DataFrame - the result of resample - """ - # error: Cannot determine type of 'loffset' - needs_offset = ( - isinstance( - self.loffset, # type: ignore[has-type] - (DateOffset, timedelta, np.timedelta64), - ) - and isinstance(result.index, DatetimeIndex) - and len(result.index) > 0 - ) - - if needs_offset: - # error: Cannot determine type of 'loffset' - result.index = result.index + self.loffset # type: ignore[has-type] - - self.loffset = None - return result - - def _get_resampler_for_grouping(self, groupby, **kwargs): - """ - Return the correct class for resampling with groupby. - """ - return self._resampler_for_grouping(self, groupby=groupby, **kwargs) - - def _wrap_result(self, result): - """ - Potentially wrap any results. - """ - if isinstance(result, ABCSeries) and self._selection is not None: - result.name = self._selection - - if isinstance(result, ABCSeries) and result.empty: - obj = self.obj - # When index is all NaT, result is empty but index is not - result.index = _asfreq_compat(obj.index[:0], freq=self.freq) - result.name = getattr(obj, "name", None) - - return result - - def pad(self, limit=None): - """ - Forward fill the values. - - Parameters - ---------- - limit : int, optional - Limit of how many values to fill. - - Returns - ------- - An upsampled Series. - - See Also - -------- - Series.fillna: Fill NA/NaN values using the specified method. - DataFrame.fillna: Fill NA/NaN values using the specified method. 
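pad/ffill is the forward-fill flavour of upsampling; a short example of the behaviour it documents:

import pandas as pd

s = pd.Series([1, 2], index=pd.date_range("2018-01-01", periods=2, freq="h"))
print(s.resample("30min").ffill())
# 00:00 -> 1, 00:30 -> 1 (forward filled), 01:00 -> 2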
- """ - return self._upsample("pad", limit=limit) - - ffill = pad - - def nearest(self, limit=None): - """ - Resample by using the nearest value. - - When resampling data, missing values may appear (e.g., when the - resampling frequency is higher than the original frequency). - The `nearest` method will replace ``NaN`` values that appeared in - the resampled data with the value from the nearest member of the - sequence, based on the index value. - Missing values that existed in the original data will not be modified. - If `limit` is given, fill only this many values in each direction for - each of the original values. - - Parameters - ---------- - limit : int, optional - Limit of how many values to fill. - - Returns - ------- - Series or DataFrame - An upsampled Series or DataFrame with ``NaN`` values filled with - their nearest value. - - See Also - -------- - backfill : Backward fill the new missing values in the resampled data. - pad : Forward fill ``NaN`` values. - - Examples - -------- - >>> s = pd.Series([1, 2], - ... index=pd.date_range('20180101', - ... periods=2, - ... freq='1h')) - >>> s - 2018-01-01 00:00:00 1 - 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 - - >>> s.resample('15min').nearest() - 2018-01-01 00:00:00 1 - 2018-01-01 00:15:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 00:45:00 2 - 2018-01-01 01:00:00 2 - Freq: 15T, dtype: int64 - - Limit the number of upsampled values imputed by the nearest: - - >>> s.resample('15min').nearest(limit=1) - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:15:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 00:45:00 2.0 - 2018-01-01 01:00:00 2.0 - Freq: 15T, dtype: float64 - """ - return self._upsample("nearest", limit=limit) - - def backfill(self, limit=None): - """ - Backward fill the new missing values in the resampled data. - - In statistics, imputation is the process of replacing missing data with - substituted values [1]_. When resampling data, missing values may - appear (e.g., when the resampling frequency is higher than the original - frequency). The backward fill will replace NaN values that appeared in - the resampled data with the next value in the original sequence. - Missing values that existed in the original data will not be modified. - - Parameters - ---------- - limit : int, optional - Limit of how many values to fill. - - Returns - ------- - Series, DataFrame - An upsampled Series or DataFrame with backward filled NaN values. - - See Also - -------- - bfill : Alias of backfill. - fillna : Fill NaN values using the specified method, which can be - 'backfill'. - nearest : Fill NaN values with nearest neighbor starting from center. - pad : Forward fill NaN values. - Series.fillna : Fill NaN values in the Series using the - specified method, which can be 'backfill'. - DataFrame.fillna : Fill NaN values in the DataFrame using the - specified method, which can be 'backfill'. - - References - ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) - - Examples - -------- - Resampling a Series: - - >>> s = pd.Series([1, 2, 3], - ... 
index=pd.date_range('20180101', periods=3, freq='h')) - >>> s - 2018-01-01 00:00:00 1 - 2018-01-01 01:00:00 2 - 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 - - >>> s.resample('30min').backfill() - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 - - >>> s.resample('15min').backfill(limit=2) - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:15:00 NaN - 2018-01-01 00:30:00 2.0 - 2018-01-01 00:45:00 2.0 - 2018-01-01 01:00:00 2.0 - 2018-01-01 01:15:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 01:45:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 - - Resampling a DataFrame that has missing values: - - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) - >>> df - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 02:00:00 6.0 5 - - >>> df.resample('30min').backfill() - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 00:30:00 NaN 3 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 01:30:00 6.0 5 - 2018-01-01 02:00:00 6.0 5 - - >>> df.resample('15min').backfill(limit=2) - a b - 2018-01-01 00:00:00 2.0 1.0 - 2018-01-01 00:15:00 NaN NaN - 2018-01-01 00:30:00 NaN 3.0 - 2018-01-01 00:45:00 NaN 3.0 - 2018-01-01 01:00:00 NaN 3.0 - 2018-01-01 01:15:00 NaN NaN - 2018-01-01 01:30:00 6.0 5.0 - 2018-01-01 01:45:00 6.0 5.0 - 2018-01-01 02:00:00 6.0 5.0 - """ - return self._upsample("backfill", limit=limit) - - bfill = backfill - - def fillna(self, method, limit=None): - """ - Fill missing values introduced by upsampling. - - In statistics, imputation is the process of replacing missing data with - substituted values [1]_. When resampling data, missing values may - appear (e.g., when the resampling frequency is higher than the original - frequency). - - Missing values that existed in the original data will - not be modified. - - Parameters - ---------- - method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} - Method to use for filling holes in resampled data - - * 'pad' or 'ffill': use previous valid observation to fill gap - (forward fill). - * 'backfill' or 'bfill': use next valid observation to fill gap. - * 'nearest': use nearest valid observation to fill gap. - - limit : int, optional - Limit of how many consecutive missing values to fill. - - Returns - ------- - Series or DataFrame - An upsampled Series or DataFrame with missing values filled. - - See Also - -------- - backfill : Backward fill NaN values in the resampled data. - pad : Forward fill NaN values in the resampled data. - nearest : Fill NaN values in the resampled data - with nearest neighbor starting from center. - interpolate : Fill NaN values using interpolation. - Series.fillna : Fill NaN values in the Series using the - specified method, which can be 'bfill' and 'ffill'. - DataFrame.fillna : Fill NaN values in the DataFrame using the - specified method, which can be 'bfill' and 'ffill'. - - References - ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) - - Examples - -------- - Resampling a Series: - - >>> s = pd.Series([1, 2, 3], - ... 
index=pd.date_range('20180101', periods=3, freq='h')) - >>> s - 2018-01-01 00:00:00 1 - 2018-01-01 01:00:00 2 - 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 - - Without filling the missing values you get: - - >>> s.resample("30min").asfreq() - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 2.0 - 2018-01-01 01:30:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 - - >>> s.resample('30min').fillna("backfill") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 - - >>> s.resample('15min').fillna("backfill", limit=2) - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:15:00 NaN - 2018-01-01 00:30:00 2.0 - 2018-01-01 00:45:00 2.0 - 2018-01-01 01:00:00 2.0 - 2018-01-01 01:15:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 01:45:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 - - >>> s.resample('30min').fillna("pad") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 1 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 2 - 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 - - >>> s.resample('30min').fillna("nearest") - 2018-01-01 00:00:00 1 - 2018-01-01 00:30:00 2 - 2018-01-01 01:00:00 2 - 2018-01-01 01:30:00 3 - 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 - - Missing values present before the upsampling are not affected. - - >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) - >>> sm - 2018-01-01 00:00:00 1.0 - 2018-01-01 01:00:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: H, dtype: float64 - - >>> sm.resample('30min').fillna('backfill') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 - - >>> sm.resample('30min').fillna('pad') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 1.0 - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 NaN - 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 - - >>> sm.resample('30min').fillna('nearest') - 2018-01-01 00:00:00 1.0 - 2018-01-01 00:30:00 NaN - 2018-01-01 01:00:00 NaN - 2018-01-01 01:30:00 3.0 - 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 - - DataFrame resampling is done column-wise. All the same options are - available. - - >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, - ... index=pd.date_range('20180101', periods=3, - ... freq='h')) - >>> df - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 02:00:00 6.0 5 - - >>> df.resample('30min').fillna("bfill") - a b - 2018-01-01 00:00:00 2.0 1 - 2018-01-01 00:30:00 NaN 3 - 2018-01-01 01:00:00 NaN 3 - 2018-01-01 01:30:00 6.0 5 - 2018-01-01 02:00:00 6.0 5 - """ - return self._upsample(method, limit=limit) - - @doc(NDFrame.interpolate, **_shared_docs_kwargs) - def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction="forward", - limit_area=None, - downcast=None, - **kwargs, - ): - """ - Interpolate values according to different methods. - """ - result = self._upsample("asfreq") - return result.interpolate( - method=method, - axis=axis, - limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs, - ) - - def asfreq(self, fill_value=None): - """ - Return the values at the new freq, essentially a reindex. - - Parameters - ---------- - fill_value : scalar, optional - Value to use for missing values, applied during upsampling (note - this does not fill NaNs that already were present). 
- - Returns - ------- - DataFrame or Series - Values at the specified freq. - - See Also - -------- - Series.asfreq: Convert TimeSeries to specified frequency. - DataFrame.asfreq: Convert TimeSeries to specified frequency. - """ - return self._upsample("asfreq", fill_value=fill_value) - - def std(self, ddof=1, *args, **kwargs): - """ - Compute standard deviation of groups, excluding missing values. - - Parameters - ---------- - ddof : int, default 1 - Degrees of freedom. - - Returns - ------- - DataFrame or Series - Standard deviation of values within each group. - """ - nv.validate_resampler_func("std", args, kwargs) - # error: Unexpected keyword argument "ddof" for "_downsample" - return self._downsample("std", ddof=ddof) # type: ignore[call-arg] - - def var(self, ddof=1, *args, **kwargs): - """ - Compute variance of groups, excluding missing values. - - Parameters - ---------- - ddof : int, default 1 - Degrees of freedom. - - Returns - ------- - DataFrame or Series - Variance of values within each group. - """ - nv.validate_resampler_func("var", args, kwargs) - # error: Unexpected keyword argument "ddof" for "_downsample" - return self._downsample("var", ddof=ddof) # type: ignore[call-arg] - - @doc(GroupBy.size) - def size(self): - result = self._downsample("size") - if not len(self.ax): - from pandas import Series - - if self._selected_obj.ndim == 1: - name = self._selected_obj.name - else: - name = None - result = Series([], index=result.index, dtype="int64", name=name) - return result - - @doc(GroupBy.count) - def count(self): - result = self._downsample("count") - if not len(self.ax): - if self._selected_obj.ndim == 1: - result = type(self._selected_obj)( - [], index=result.index, dtype="int64", name=self._selected_obj.name - ) - else: - from pandas import DataFrame - - result = DataFrame( - [], index=result.index, columns=result.columns, dtype="int64" - ) - - return result - - def quantile(self, q=0.5, **kwargs): - """ - Return value at the given quantile. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - q : float or array-like, default 0.5 (50% quantile) - - Returns - ------- - DataFrame or Series - Quantile of values within each group. - - See Also - -------- - Series.quantile - Return a series, where the index is q and the values are the quantiles. - DataFrame.quantile - Return a DataFrame, where the columns are the columns of self, - and the values are the quantiles. - DataFrameGroupBy.quantile - Return a DataFrame, where the coulmns are groupby columns, - and the values are its quantiles. 
- """ - # error: Unexpected keyword argument "q" for "_downsample" - # error: Too many arguments for "_downsample" - return self._downsample("quantile", q=q, **kwargs) # type: ignore[call-arg] - - -# downsample methods -for method in ["sum", "prod", "min", "max", "first", "last"]: - - def f(self, _method=method, min_count=0, *args, **kwargs): - nv.validate_resampler_func(_method, args, kwargs) - return self._downsample(_method, min_count=min_count) - - f.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, f) - - -# downsample methods -for method in ["mean", "sem", "median", "ohlc"]: - - def g(self, _method=method, *args, **kwargs): - nv.validate_resampler_func(_method, args, kwargs) - return self._downsample(_method) - - g.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, g) - - -# series only methods -for method in ["nunique"]: - - def h(self, _method=method): - return self._downsample(_method) - - h.__doc__ = getattr(SeriesGroupBy, method).__doc__ - setattr(Resampler, method, h) - - -class _GroupByMixin(PandasObject): - """ - Provide the groupby facilities. - """ - - _attributes: list[str] - - def __init__(self, obj, *args, **kwargs): - - parent = kwargs.pop("parent", None) - groupby = kwargs.pop("groupby", None) - if parent is None: - parent = obj - - # initialize our GroupByMixin object with - # the resampler attributes - for attr in self._attributes: - setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - - # error: Too many arguments for "__init__" of "object" - super().__init__(None) # type: ignore[call-arg] - self._groupby = groupby - self._groupby.mutated = True - self._groupby.grouper.mutated = True - self.groupby = copy.copy(parent.groupby) - - @no_type_check - def _apply(self, f, grouper=None, *args, **kwargs): - """ - Dispatch to _upsample; we are stripping all of the _upsample kwargs and - performing the original function call on the grouped object. - """ - - def func(x): - x = self._shallow_copy(x, groupby=self.groupby) - - if isinstance(f, str): - return getattr(x, f)(**kwargs) - - return x.apply(f, *args, **kwargs) - - result = self._groupby.apply(func) - return self._wrap_result(result) - - _upsample = _apply - _downsample = _apply - _groupby_and_aggregate = _apply - - @final - def _gotitem(self, key, ndim, subset=None): - """ - Sub-classes to define. Return a sliced object. 
- - Parameters - ---------- - key : string / list of selections - ndim : {1, 2} - requested ndim of result - subset : object, default None - subset to act on - """ - # create a new object to prevent aliasing - if subset is None: - # error: "GotItemMixin" has no attribute "obj" - subset = self.obj # type: ignore[attr-defined] - - # we need to make a shallow copy of ourselves - # with the same groupby - kwargs = {attr: getattr(self, attr) for attr in self._attributes} - - # Try to select from a DataFrame, falling back to a Series - try: - groupby = self._groupby[key] - except IndexError: - groupby = self._groupby - - self = type(self)(subset, groupby=groupby, parent=self, **kwargs) - self._reset_cache() - if subset.ndim == 2 and ( - lib.is_scalar(key) and key in subset or lib.is_list_like(key) - ): - self._selection = key - return self - - -class DatetimeIndexResampler(Resampler): - @property - def _resampler_for_grouping(self): - return DatetimeIndexResamplerGroupby - - def _get_binner_for_time(self): - - # this is how we are actually creating the bins - if self.kind == "period": - return self.groupby._get_time_period_bins(self.ax) - return self.groupby._get_time_bins(self.ax) - - def _downsample(self, how, **kwargs): - """ - Downsample the cython defined function. - - Parameters - ---------- - how : string / cython mapped function - **kwargs : kw args passed to how function - """ - self._set_binner() - how = com.get_cython_func(how) or how - ax = self.ax - obj = self._selected_obj - - if not len(ax): - # reset to the new freq - obj = obj.copy() - obj.index = obj.index._with_freq(self.freq) - assert obj.index.freq == self.freq, (obj.index.freq, self.freq) - return obj - - # do we have a regular frequency - - # error: "BaseGrouper" has no attribute "binlabels" - if ( - (ax.freq is not None or ax.inferred_freq is not None) - and len(self.grouper.binlabels) > len(ax) # type: ignore[attr-defined] - and how is None - ): - - # let's do an asfreq - return self.asfreq() - - # we are downsampling - # we want to call the actual grouper method here - result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs) - - result = self._apply_loffset(result) - return self._wrap_result(result) - - def _adjust_binner_for_upsample(self, binner): - """ - Adjust our binner when upsampling. - - The range of a new index should not be outside specified range - """ - if self.closed == "right": - binner = binner[1:] - else: - binner = binner[:-1] - return binner - - def _upsample(self, method, limit=None, fill_value=None): - """ - Parameters - ---------- - method : string {'backfill', 'bfill', 'pad', - 'ffill', 'asfreq'} method for upsampling - limit : int, default None - Maximum size gap to fill when reindexing - fill_value : scalar, default None - Value to use for missing values - - See Also - -------- - .fillna: Fill NA/NaN values using the specified method. - - """ - self._set_binner() - if self.axis: - raise AssertionError("axis must be 0") - if self._from_selection: - raise ValueError( - "Upsampling from level= or on= selection " - "is not supported, use .set_index(...) 
" - "to explicitly set index to datetime-like" - ) - - ax = self.ax - obj = self._selected_obj - binner = self.binner - res_index = self._adjust_binner_for_upsample(binner) - - # if we have the same frequency as our axis, then we are equal sampling - if ( - limit is None - and to_offset(ax.inferred_freq) == self.freq - and len(obj) == len(res_index) - ): - result = obj.copy() - result.index = res_index - else: - result = obj.reindex( - res_index, method=method, limit=limit, fill_value=fill_value - ) - - result = self._apply_loffset(result) - return self._wrap_result(result) - - def _wrap_result(self, result): - result = super()._wrap_result(result) - - # we may have a different kind that we were asked originally - # convert if needed - if self.kind == "period" and not isinstance(result.index, PeriodIndex): - result.index = result.index.to_period(self.freq) - return result - - -class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): - """ - Provides a resample of a groupby implementation - """ - - @property - def _constructor(self): - return DatetimeIndexResampler - - -class PeriodIndexResampler(DatetimeIndexResampler): - @property - def _resampler_for_grouping(self): - return PeriodIndexResamplerGroupby - - def _get_binner_for_time(self): - if self.kind == "timestamp": - return super()._get_binner_for_time() - return self.groupby._get_period_bins(self.ax) - - def _convert_obj(self, obj): - obj = super()._convert_obj(obj) - - if self._from_selection: - # see GH 14008, GH 12871 - msg = ( - "Resampling from level= or on= selection " - "with a PeriodIndex is not currently supported, " - "use .set_index(...) to explicitly set index" - ) - raise NotImplementedError(msg) - - if self.loffset is not None: - # Cannot apply loffset/timedelta to PeriodIndex -> convert to - # timestamps - self.kind = "timestamp" - - # convert to timestamp - if self.kind == "timestamp": - obj = obj.to_timestamp(how=self.convention) - - return obj - - def _downsample(self, how, **kwargs): - """ - Downsample the cython defined function. - - Parameters - ---------- - how : string / cython mapped function - **kwargs : kw args passed to how function - """ - # we may need to actually resample as if we are timestamps - if self.kind == "timestamp": - return super()._downsample(how, **kwargs) - - how = com.get_cython_func(how) or how - ax = self.ax - - if is_subperiod(ax.freq, self.freq): - # Downsampling - return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs) - elif is_superperiod(ax.freq, self.freq): - if how == "ohlc": - # GH #13083 - # upsampling to subperiods is handled as an asfreq, which works - # for pure aggregating/reducing methods - # OHLC reduces along the time dimension, but creates multiple - # values for each period -> handle by _groupby_and_aggregate() - return self._groupby_and_aggregate(how, grouper=self.grouper) - return self.asfreq() - elif ax.freq == self.freq: - return self.asfreq() - - raise IncompatibleFrequency( - f"Frequency {ax.freq} cannot be resampled to {self.freq}, " - "as they are not sub or super periods" - ) - - def _upsample(self, method, limit=None, fill_value=None): - """ - Parameters - ---------- - method : {'backfill', 'bfill', 'pad', 'ffill'} - Method for upsampling. - limit : int, default None - Maximum size gap to fill when reindexing. - fill_value : scalar, default None - Value to use for missing values. - - See Also - -------- - .fillna: Fill NA/NaN values using the specified method. 
- - """ - # we may need to actually resample as if we are timestamps - if self.kind == "timestamp": - return super()._upsample(method, limit=limit, fill_value=fill_value) - - self._set_binner() - ax = self.ax - obj = self.obj - new_index = self.binner - - # Start vs. end of period - memb = ax.asfreq(self.freq, how=self.convention) - - # Get the fill indexer - indexer = memb.get_indexer(new_index, method=method, limit=limit) - return self._wrap_result( - _take_new_index(obj, indexer, new_index, axis=self.axis) - ) - - -class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): - """ - Provides a resample of a groupby implementation. - """ - - @property - def _constructor(self): - return PeriodIndexResampler - - -class TimedeltaIndexResampler(DatetimeIndexResampler): - @property - def _resampler_for_grouping(self): - return TimedeltaIndexResamplerGroupby - - def _get_binner_for_time(self): - return self.groupby._get_time_delta_bins(self.ax) - - def _adjust_binner_for_upsample(self, binner): - """ - Adjust our binner when upsampling. - - The range of a new index is allowed to be greater than original range - so we don't need to change the length of a binner, GH 13022 - """ - return binner - - -class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): - """ - Provides a resample of a groupby implementation. - """ - - @property - def _constructor(self): - return TimedeltaIndexResampler - - -def get_resampler(obj, kind=None, **kwds): - """ - Create a TimeGrouper and return our resampler. - """ - tg = TimeGrouper(**kwds) - return tg._get_resampler(obj, kind=kind) - - -get_resampler.__doc__ = Resampler.__doc__ - - -def get_resampler_for_grouping( - groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs -): - """ - Return our appropriate resampler when grouping as well. - """ - # .resample uses 'on' similar to how .groupby uses 'key' - kwargs["key"] = kwargs.pop("on", None) - - tg = TimeGrouper(freq=rule, **kwargs) - resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping(groupby=groupby) - - -class TimeGrouper(Grouper): - """ - Custom groupby class for time-interval grouping. 
- - Parameters - ---------- - freq : pandas date offset or offset alias for identifying bin edges - closed : closed end of interval; 'left' or 'right' - label : interval boundary to use for labeling; 'left' or 'right' - convention : {'start', 'end', 'e', 's'} - If axis is PeriodIndex - """ - - _attributes = Grouper._attributes + ( - "closed", - "label", - "how", - "loffset", - "kind", - "convention", - "origin", - "offset", - ) - - def __init__( - self, - freq="Min", - closed: str | None = None, - label: str | None = None, - how="mean", - axis=0, - fill_method=None, - limit=None, - loffset=None, - kind: str | None = None, - convention: str | None = None, - base: int | None = None, - origin: str | TimestampConvertibleTypes = "start_day", - offset: TimedeltaConvertibleTypes | None = None, - **kwargs, - ): - # Check for correctness of the keyword arguments which would - # otherwise silently use the default if misspelled - if label not in {None, "left", "right"}: - raise ValueError(f"Unsupported value {label} for `label`") - if closed not in {None, "left", "right"}: - raise ValueError(f"Unsupported value {closed} for `closed`") - if convention not in {None, "start", "end", "e", "s"}: - raise ValueError(f"Unsupported value {convention} for `convention`") - - freq = to_offset(freq) - - end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} - rule = freq.rule_code - if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): - if closed is None: - closed = "right" - if label is None: - label = "right" - else: - # The backward resample sets ``closed`` to ``'right'`` by default - # since the last value should be considered as the edge point for - # the last bin. When origin in "end" or "end_day", the value for a - # specific ``Timestamp`` index stands for the resample result from - # the current ``Timestamp`` minus ``freq`` to the current - # ``Timestamp`` with a right close. - if origin in ["end", "end_day"]: - if closed is None: - closed = "right" - if label is None: - label = "right" - else: - if closed is None: - closed = "left" - if label is None: - label = "left" - - self.closed = closed - self.label = label - self.kind = kind - - self.convention = convention or "E" - self.convention = self.convention.lower() - - self.how = how - self.fill_method = fill_method - self.limit = limit - - if origin in ("epoch", "start", "start_day", "end", "end_day"): - self.origin = origin - else: - try: - self.origin = Timestamp(origin) - except Exception as e: - raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day', " - "'end', 'end_day' or " - f"should be a Timestamp convertible type. Got '{origin}' instead." - ) from e - - try: - self.offset = Timedelta(offset) if offset is not None else None - except Exception as e: - raise ValueError( - "'offset' should be a Timedelta convertible type. " - f"Got '{offset}' instead." - ) from e - - # always sort time groupers - kwargs["sort"] = True - - # Handle deprecated arguments since v1.1.0 of `base` and `loffset` (GH #31809) - if base is not None and offset is not None: - raise ValueError("'offset' and 'base' cannot be present at the same time") - - if base and isinstance(freq, Tick): - # this conversion handle the default behavior of base and the - # special case of GH #10530. Indeed in case when dealing with - # a TimedeltaIndex base was treated as a 'pure' offset even though - # the default behavior of base was equivalent of a modulo on - # freq_nanos. 
- self.offset = Timedelta(base * freq.nanos // freq.n) - - if isinstance(loffset, str): - loffset = to_offset(loffset) - self.loffset = loffset - - super().__init__(freq=freq, axis=axis, **kwargs) - - def _get_resampler(self, obj, kind=None): - """ - Return my resampler or raise if we have an invalid axis. - - Parameters - ---------- - obj : input object - kind : string, optional - 'period','timestamp','timedelta' are valid - - Returns - ------- - a Resampler - - Raises - ------ - TypeError if incompatible axis - - """ - self._set_grouper(obj) - - ax = self.ax - if isinstance(ax, DatetimeIndex): - return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) - elif isinstance(ax, PeriodIndex) or kind == "period": - return PeriodIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) - elif isinstance(ax, TimedeltaIndex): - return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis) - - raise TypeError( - "Only valid with DatetimeIndex, " - "TimedeltaIndex or PeriodIndex, " - f"but got an instance of '{type(ax).__name__}'" - ) - - def _get_grouper(self, obj, validate: bool = True): - # create the resampler and return our binner - r = self._get_resampler(obj) - r._set_binner() - return r.binner, r.grouper, r.obj - - def _get_time_bins(self, ax): - if not isinstance(ax, DatetimeIndex): - raise TypeError( - "axis must be a DatetimeIndex, but got " - f"an instance of {type(ax).__name__}" - ) - - if len(ax) == 0: - binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) - return binner, [], labels - - first, last = _get_timestamp_range_edges( - ax.min(), - ax.max(), - self.freq, - closed=self.closed, - origin=self.origin, - offset=self.offset, - ) - # GH #12037 - # use first/last directly instead of call replace() on them - # because replace() will swallow the nanosecond part - # thus last bin maybe slightly before the end if the end contains - # nanosecond part and lead to `Values falls after last bin` error - # GH 25758: If DST lands at midnight (e.g. 
'America/Havana'), user feedback - # has noted that ambiguous=True provides the most sensible result - binner = labels = date_range( - freq=self.freq, - start=first, - end=last, - tz=ax.tz, - name=ax.name, - ambiguous=True, - nonexistent="shift_forward", - ) - - ax_values = ax.asi8 - binner, bin_edges = self._adjust_bin_edges(binner, ax_values) - - # general version, knowing nothing about relative frequencies - bins = lib.generate_bins_dt64( - ax_values, bin_edges, self.closed, hasnans=ax.hasnans - ) - - if self.closed == "right": - labels = binner - if self.label == "right": - labels = labels[1:] - elif self.label == "right": - labels = labels[1:] - - if ax.hasnans: - binner = binner.insert(0, NaT) - labels = labels.insert(0, NaT) - - # if we end up with more labels than bins - # adjust the labels - # GH4076 - if len(bins) < len(labels): - labels = labels[: len(bins)] - - return binner, bins, labels - - def _adjust_bin_edges(self, binner, ax_values): - # Some hacks for > daily data, see #1471, #1458, #1483 - - if self.freq != "D" and is_superperiod(self.freq, "D"): - if self.closed == "right": - # GH 21459, GH 9119: Adjust the bins relative to the wall time - bin_edges = binner.tz_localize(None) - bin_edges = bin_edges + timedelta(1) - Nano(1) - bin_edges = bin_edges.tz_localize(binner.tz).asi8 - else: - bin_edges = binner.asi8 - - # intraday values on last day - if bin_edges[-2] > ax_values.max(): - bin_edges = bin_edges[:-1] - binner = binner[:-1] - else: - bin_edges = binner.asi8 - return binner, bin_edges - - def _get_time_delta_bins(self, ax): - if not isinstance(ax, TimedeltaIndex): - raise TypeError( - "axis must be a TimedeltaIndex, but got " - f"an instance of {type(ax).__name__}" - ) - - if not len(ax): - binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) - return binner, [], labels - - start, end = ax.min(), ax.max() - labels = binner = timedelta_range( - start=start, end=end, freq=self.freq, name=ax.name - ) - - end_stamps = labels + self.freq - bins = ax.searchsorted(end_stamps, side="left") - - if self.offset: - # GH 10530 & 31809 - labels += self.offset - if self.loffset: - # GH 33498 - labels += self.loffset - - return binner, bins, labels - - def _get_time_period_bins(self, ax: DatetimeIndex): - if not isinstance(ax, DatetimeIndex): - raise TypeError( - "axis must be a DatetimeIndex, but got " - f"an instance of {type(ax).__name__}" - ) - - freq = self.freq - - if not len(ax): - binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name) - return binner, [], labels - - labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name) - - end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() - if ax.tz: - end_stamps = end_stamps.tz_localize(ax.tz) - bins = ax.searchsorted(end_stamps, side="left") - - return binner, bins, labels - - def _get_period_bins(self, ax: PeriodIndex): - if not isinstance(ax, PeriodIndex): - raise TypeError( - "axis must be a PeriodIndex, but got " - f"an instance of {type(ax).__name__}" - ) - - memb = ax.asfreq(self.freq, how=self.convention) - - # NaT handling as in pandas._lib.lib.generate_bins_dt64() - nat_count = 0 - if memb.hasnans: - nat_count = np.sum(memb._isnan) - memb = memb[~memb._isnan] - - if not len(memb): - # index contains no valid (non-NaT) values - bins = np.array([], dtype=np.int64) - binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) - if len(ax) > 0: - # index is all NaT - binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax)) - return binner, 
bins, labels - - freq_mult = self.freq.n - - start = ax.min().asfreq(self.freq, how=self.convention) - end = ax.max().asfreq(self.freq, how="end") - bin_shift = 0 - - if isinstance(self.freq, Tick): - # GH 23882 & 31809: get adjusted bin edge labels with 'origin' - # and 'origin' support. This call only makes sense if the freq is a - # Tick since offset and origin are only used in those cases. - # Not doing this check could create an extra empty bin. - p_start, end = _get_period_range_edges( - start, - end, - self.freq, - closed=self.closed, - origin=self.origin, - offset=self.offset, - ) - - # Get offset for bin edge (not label edge) adjustment - start_offset = Period(start, self.freq) - Period(p_start, self.freq) - bin_shift = start_offset.n % freq_mult - start = p_start - - labels = binner = period_range( - start=start, end=end, freq=self.freq, name=ax.name - ) - - i8 = memb.asi8 - - # when upsampling to subperiods, we need to generate enough bins - expected_bins_count = len(binner) * freq_mult - i8_extend = expected_bins_count - (i8[-1] - i8[0]) - rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) - rng += freq_mult - # adjust bin edge indexes to account for base - rng -= bin_shift - - # Wrap in PeriodArray for PeriodArray.searchsorted - prng = type(memb._data)(rng, dtype=memb.dtype) - bins = memb.searchsorted(prng, side="left") - - if nat_count > 0: - binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count) - - return binner, bins, labels - - -def _take_new_index(obj, indexer, new_index, axis=0): - - if isinstance(obj, ABCSeries): - new_values = algos.take_nd(obj._values, indexer) - return obj._constructor(new_values, index=new_index, name=obj.name) - elif isinstance(obj, ABCDataFrame): - if axis == 1: - raise NotImplementedError("axis 1 is not supported") - return obj._constructor( - obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) - ) - else: - raise ValueError("'obj' should be either a Series or a DataFrame") - - -def _get_timestamp_range_edges( - first, last, freq, closed="left", origin="start_day", offset=None -): - """ - Adjust the `first` Timestamp to the preceding Timestamp that resides on - the provided offset. Adjust the `last` Timestamp to the following - Timestamp that resides on the provided offset. Input Timestamps that - already reside on the offset will be adjusted depending on the type of - offset and the `closed` parameter. - - Parameters - ---------- - first : pd.Timestamp - The beginning Timestamp of the range to be adjusted. - last : pd.Timestamp - The ending Timestamp of the range to be adjusted. - freq : pd.DateOffset - The dateoffset to which the Timestamps will be adjusted. - closed : {'right', 'left'}, default None - Which side of bin interval is closed. - origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day' - The timestamp on which to adjust the grouping. The timezone of origin must - match the timezone of the index. - If a timestamp is not used, these values are also supported: - - - 'epoch': `origin` is 1970-01-01 - - 'start': `origin` is the first value of the timeseries - - 'start_day': `origin` is the first day at midnight of the timeseries - offset : pd.Timedelta, default is None - An offset timedelta added to the origin. - - Returns - ------- - A tuple of length 2, containing the adjusted pd.Timestamp objects. 
- """ - if isinstance(freq, Tick): - index_tz = first.tz - if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): - raise ValueError("The origin must have the same timezone as the index.") - elif origin == "epoch": - # set the epoch based on the timezone to have similar bins results when - # resampling on the same kind of indexes on different timezones - origin = Timestamp("1970-01-01", tz=index_tz) - - if isinstance(freq, Day): - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). - # So "pretend" the dates are naive when adjusting the endpoints - first = first.tz_localize(None) - last = last.tz_localize(None) - if isinstance(origin, Timestamp): - origin = origin.tz_localize(None) - - first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, origin=origin, offset=offset - ) - if isinstance(freq, Day): - first = first.tz_localize(index_tz) - last = last.tz_localize(index_tz) - else: - first = first.normalize() - last = last.normalize() - - if closed == "left": - first = Timestamp(freq.rollback(first)) - else: - first = Timestamp(first - freq) - - last = Timestamp(last + freq) - - return first, last - - -def _get_period_range_edges( - first, last, freq, closed="left", origin="start_day", offset=None -): - """ - Adjust the provided `first` and `last` Periods to the respective Period of - the given offset that encompasses them. - - Parameters - ---------- - first : pd.Period - The beginning Period of the range to be adjusted. - last : pd.Period - The ending Period of the range to be adjusted. - freq : pd.DateOffset - The freq to which the Periods will be adjusted. - closed : {'right', 'left'}, default None - Which side of bin interval is closed. - origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day' - The timestamp on which to adjust the grouping. The timezone of origin must - match the timezone of the index. - - If a timestamp is not used, these values are also supported: - - - 'epoch': `origin` is 1970-01-01 - - 'start': `origin` is the first value of the timeseries - - 'start_day': `origin` is the first day at midnight of the timeseries - offset : pd.Timedelta, default is None - An offset timedelta added to the origin. - - Returns - ------- - A tuple of length 2, containing the adjusted pd.Period objects. 
- """ - if not all(isinstance(obj, Period) for obj in [first, last]): - raise TypeError("'first' and 'last' must be instances of type Period") - - # GH 23882 - first = first.to_timestamp() - last = last.to_timestamp() - adjust_first = not freq.is_on_offset(first) - adjust_last = freq.is_on_offset(last) - - first, last = _get_timestamp_range_edges( - first, last, freq, closed=closed, origin=origin, offset=offset - ) - - first = (first + int(adjust_first) * freq).to_period(freq) - last = (last - int(adjust_last) * freq).to_period(freq) - return first, last - - -def _insert_nat_bin( - binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int -) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]: - # NaT handling as in pandas._lib.lib.generate_bins_dt64() - # shift bins by the number of NaT - assert nat_count > 0 - bins += nat_count - bins = np.insert(bins, 0, nat_count) - binner = binner.insert(0, NaT) - labels = labels.insert(0, NaT) - return binner, bins, labels - - -def _adjust_dates_anchored( - first, last, freq, closed="right", origin="start_day", offset=None -): - # First and last offsets should be calculated from the start day to fix an - # error cause by resampling across multiple days when a one day period is - # not a multiple of the frequency. See GH 8683 - # To handle frequencies that are not multiple or divisible by a day we let - # the possibility to define a fixed origin timestamp. See GH 31809 - origin_nanos = 0 # origin == "epoch" - if origin == "start_day": - origin_nanos = first.normalize().value - elif origin == "start": - origin_nanos = first.value - elif isinstance(origin, Timestamp): - origin_nanos = origin.value - elif origin in ["end", "end_day"]: - origin = last if origin == "end" else last.ceil("D") - sub_freq_times = (origin.value - first.value) // freq.nanos - if closed == "left": - sub_freq_times += 1 - first = origin - sub_freq_times * freq - origin_nanos = first.value - origin_nanos += offset.value if offset else 0 - - # GH 10117 & GH 19375. If first and last contain timezone information, - # Perform the calculation in UTC in order to avoid localizing on an - # Ambiguous or Nonexistent time. - first_tzinfo = first.tzinfo - last_tzinfo = last.tzinfo - if first_tzinfo is not None: - first = first.tz_convert("UTC") - if last_tzinfo is not None: - last = last.tz_convert("UTC") - - foffset = (first.value - origin_nanos) % freq.nanos - loffset = (last.value - origin_nanos) % freq.nanos - - if closed == "right": - if foffset > 0: - # roll back - fresult = first.value - foffset - else: - fresult = first.value - freq.nanos - - if loffset > 0: - # roll forward - lresult = last.value + (freq.nanos - loffset) - else: - # already the end of the road - lresult = last.value - else: # closed == 'left' - if foffset > 0: - fresult = first.value - foffset - else: - # start of the road - fresult = first.value - - if loffset > 0: - # roll forward - lresult = last.value + (freq.nanos - loffset) - else: - lresult = last.value + freq.nanos - fresult = Timestamp(fresult) - lresult = Timestamp(lresult) - if first_tzinfo is not None: - fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) - if last_tzinfo is not None: - lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) - return fresult, lresult - - -def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): - """ - Utility frequency conversion method for Series/DataFrame. - - See :meth:`pandas.NDFrame.asfreq` for full documentation. 
- """ - if isinstance(obj.index, PeriodIndex): - if method is not None: - raise NotImplementedError("'method' argument is not supported") - - if how is None: - how = "E" - - new_obj = obj.copy() - new_obj.index = obj.index.asfreq(freq, how=how) - - elif len(obj.index) == 0: - new_obj = obj.copy() - - new_obj.index = _asfreq_compat(obj.index, freq) - else: - dti = date_range(obj.index.min(), obj.index.max(), freq=freq) - dti.name = obj.index.name - new_obj = obj.reindex(dti, method=method, fill_value=fill_value) - if normalize: - new_obj.index = new_obj.index.normalize() - - return new_obj - - -def _asfreq_compat(index, freq): - """ - Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex. - - Parameters - ---------- - index : PeriodIndex, DatetimeIndex, or TimedeltaIndex - freq : DateOffset - - Returns - ------- - same type as index - """ - if len(index) != 0: - # This should never be reached, always checked by the caller - raise ValueError( - "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex" - ) - new_index: Index - if isinstance(index, PeriodIndex): - new_index = index.asfreq(freq=freq) - elif isinstance(index, DatetimeIndex): - new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name) - elif isinstance(index, TimedeltaIndex): - new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name) - else: # pragma: no cover - raise TypeError(type(index)) - return new_index diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d889e84cb9045..d1e076da9293d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( ensure_platform_int, + is_1d_only_ea_dtype, is_bool_dtype, is_extension_array_dtype, is_integer, @@ -130,7 +131,12 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None): self._make_selectors() @cache_readonly - def _indexer_and_to_sort(self): + def _indexer_and_to_sort( + self, + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + list[np.ndarray], # each has _some_ signed integer dtype + ]: v = self.level codes = list(self.index.codes) @@ -142,7 +148,6 @@ def _indexer_and_to_sort(self): ngroups = len(obs_ids) indexer = get_group_index_sorter(comp_index, ngroups) - indexer = ensure_platform_int(indexer) return indexer, to_sort @cache_readonly @@ -438,7 +443,7 @@ def unstack(obj, level, fill_value=None): f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" ) else: - if is_extension_array_dtype(obj.dtype): + if is_1d_only_ea_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 41e1ff41d9ba2..7b9c3883d74e3 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -24,8 +24,8 @@ is_datetime_or_timedelta_dtype, is_extension_array_dtype, is_integer, - is_integer_dtype, is_list_like, + is_numeric_dtype, is_scalar, is_timedelta64_dtype, ) @@ -488,7 +488,7 @@ def _coerce_to_type(x): # Will properly support in the future. 
# https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 - elif is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype): + elif is_extension_array_dtype(x.dtype) and is_numeric_dtype(x.dtype): x = x.to_numpy(dtype=np.float64, na_value=np.nan) if dtype is not None: diff --git a/pandas/core/series.py b/pandas/core/series.py index 440bc4c89e647..9f0b9040d1ae2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -60,7 +60,7 @@ from pandas.core.dtypes.cast import ( convert_dtypes, maybe_box_native, - maybe_cast_pointwise_result, + maybe_cast_result, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -1019,7 +1019,7 @@ def _get_value(self, label, takeable: bool = False): loc = self.index.get_loc(label) return self.index._get_values_for_loc(self, loc, label) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key, value): key = com.apply_if_callable(key, self) cacher_needs_updating = self._check_is_chained_assignment_possible() @@ -1058,7 +1058,7 @@ def __setitem__(self, key, value) -> None: if cacher_needs_updating: self._maybe_update_cacher() - def _set_with_engine(self, key, value) -> None: + def _set_with_engine(self, key, value): # fails with AttributeError for IntervalIndex loc = self.index._engine.get_loc(key) # error: Argument 1 to "validate_numeric_casting" has incompatible type @@ -1094,7 +1094,7 @@ def _set_with(self, key, value): else: self.loc[key] = value - def _set_labels(self, key, value) -> None: + def _set_labels(self, key, value): key = com.asarray_tuplesafe(key) indexer: np.ndarray = self.index.get_indexer(key) mask = indexer == -1 @@ -1102,7 +1102,7 @@ def _set_labels(self, key, value) -> None: raise KeyError(f"{key[mask]} not in index") self._set_values(indexer, value) - def _set_values(self, key, value) -> None: + def _set_values(self, key, value): if isinstance(key, Series): key = key._values @@ -3070,26 +3070,22 @@ def combine(self, other, func, fill_value=None) -> Series: # so do this element by element new_index = self.index.union(other.index) new_name = ops.get_op_result_name(self, other) - new_values = np.empty(len(new_index), dtype=object) - for i, idx in enumerate(new_index): + new_values = [] + for idx in new_index: lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) with np.errstate(all="ignore"): - new_values[i] = func(lv, rv) + new_values.append(func(lv, rv)) else: # Assume that other is a scalar, so apply the function for # each element in the Series new_index = self.index - new_values = np.empty(len(new_index), dtype=object) with np.errstate(all="ignore"): - new_values[:] = [func(lv, other) for lv in self._values] + new_values = [func(lv, other) for lv in self._values] new_name = self.name - # try_float=False is to match _aggregate_series_pure_python - res_values = lib.maybe_convert_objects(new_values, try_float=False) - res_values = maybe_cast_pointwise_result( - res_values, self.dtype, same_dtype=False - ) + res_values = sanitize_array(new_values, None) + res_values = maybe_cast_result(res_values, self.dtype, same_dtype=False) return self._constructor(res_values, index=new_index, name=new_name) def combine_first(self, other) -> Series: diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index a4ee4bb636450..a3fa24c7ee1e0 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -42,7 +42,7 @@ `agg` is an alias for `aggregate`. Use the alias. 
Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`udf-mutation` +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. A passed user-defined-function will be passed a Series for evaluation. @@ -303,7 +303,7 @@ Notes ----- Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`udf-mutation` +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. Examples diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 71963ec4a2123..f5cd390f077a6 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -182,7 +182,7 @@ def maybe_lift(lab, size): return out -def get_compressed_ids(labels, sizes): +def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -195,7 +195,10 @@ def get_compressed_ids(labels, sizes): Returns ------- - tuple of (comp_ids, obs_group_ids) + np.ndarray[np.intp] + comp_ids + np.ndarray[np.int64] + obs_group_ids """ ids = get_group_index(labels, sizes, sort=True, xnull=False) return compress_group_index(ids, sort=True) @@ -229,7 +232,7 @@ def decons_group_index(comp_labels, shape): return label_list[::-1] -def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): +def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bool): """ Reconstruct labels from observed group ids. @@ -254,7 +257,8 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): return [i8copy(lab[i]) for lab in labels] -def indexer_from_factorized(labels, shape, compress: bool = True): +def indexer_from_factorized(labels, shape, compress: bool = True) -> np.ndarray: + # returned ndarray is np.intp ids = get_group_index(labels, shape, sort=True, xnull=False) if not compress: @@ -268,7 +272,7 @@ def indexer_from_factorized(labels, shape, compress: bool = True): def lexsort_indexer( keys, orders=None, na_position: str = "last", key: Callable | None = None -): +) -> np.ndarray: """ Performs lexical sorting on a set of keys @@ -288,6 +292,10 @@ def lexsort_indexer( Callable key function applied to every element in keys before sorting .. versionadded:: 1.0.0 + + Returns + ------- + np.ndarray[np.intp] """ from pandas.core.arrays import Categorical @@ -352,6 +360,10 @@ def nargsort( key : Optional[Callable], default None mask : Optional[np.ndarray], default None Passed when called by ExtensionArray.argsort. + + Returns + ------- + np.ndarray[np.intp] """ if key is not None: @@ -396,7 +408,7 @@ def nargsort( indexer = np.concatenate([nan_idx, indexer]) else: raise ValueError(f"invalid na_position: {na_position}") - return indexer + return ensure_platform_int(indexer) def nargminmax(values, method: str, axis: int = 0): @@ -636,7 +648,9 @@ def get_group_index_sorter( return ensure_platform_int(sorter) -def compress_group_index(group_index, sort: bool = True): +def compress_group_index( + group_index: np.ndarray, sort: bool = True +) -> tuple[np.ndarray, np.ndarray]: """ Group_index is offsets into cartesian product of all possible labels. 
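Aside: a rough numpy-only sketch of the "compressed group ids" idea the sorting helpers above describe. The variable names are illustrative; this is not the actual pandas implementation.

    import numpy as np

    # Two label arrays (levels) with 3 and 2 distinct values respectively.
    labels_a = np.array([0, 1, 1, 0, 2])
    labels_b = np.array([1, 0, 1, 1, 0])
    sizes = (3, 2)

    # Offset into the cartesian product of all possible label combinations.
    group_index = labels_a * sizes[1] + labels_b

    # Compress to only the observed combinations, relabelled 0..n_obs-1.
    obs_ids, comp_ids = np.unique(group_index, return_inverse=True)
    print(comp_ids)  # per-row compressed group id
    print(obs_ids)   # which cartesian-product slots were actually observed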
This space can be huge, so this function compresses it, by computing offsets @@ -656,12 +670,25 @@ def compress_group_index(group_index, sort: bool = True): return ensure_int64(comp_ids), ensure_int64(obs_group_ids) -def _reorder_by_uniques(uniques, labels): +def _reorder_by_uniques( + uniques: np.ndarray, labels: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """ + Parameters + ---------- + uniques : np.ndarray[np.int64] + labels : np.ndarray[np.intp] + + Returns + ------- + np.ndarray[np.int64] + np.ndarray[np.intp] + """ # sorter is index where elements ought to go sorter = uniques.argsort() # reverse_indexer is where elements came from - reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 0b5613e302175..85a58d3d99795 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3002,8 +3002,9 @@ def _result_dtype(arr): # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype - if isinstance(arr.dtype, StringDtype): + if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)): return arr.dtype.name else: return object diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 102cdf4334510..4eb469f52fb19 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -252,9 +252,7 @@ def _convert_and_box_cache( from pandas import Series result = Series(arg).map(cache_array) - # error: Argument 1 to "_box_as_indexlike" has incompatible type "Series"; expected - # "Union[ExtensionArray, ndarray]" - return _box_as_indexlike(result, utc=None, name=name) # type: ignore[arg-type] + return _box_as_indexlike(result._values, utc=None, name=name) def _return_parsed_timezone_results(result: np.ndarray, timezones, tz, name) -> Index: @@ -368,13 +366,11 @@ def _convert_listlike_datetimes( arg, _ = maybe_convert_dtype(arg, copy=False) except TypeError: if errors == "coerce": - result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) - return DatetimeIndex(result, name=name) + npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + return DatetimeIndex(npvalues, name=name) elif errors == "ignore": - # error: Incompatible types in assignment (expression has type - # "Index", variable has type "ExtensionArray") - result = Index(arg, name=name) # type: ignore[assignment] - return result + idx = Index(arg, name=name) + return idx raise arg = ensure_object(arg) @@ -393,37 +389,30 @@ def _convert_listlike_datetimes( require_iso8601 = not infer_datetime_format format = None - # error: Incompatible types in assignment (expression has type "None", variable has - # type "ExtensionArray") - result = None # type: ignore[assignment] - if format is not None: - # error: Incompatible types in assignment (expression has type - # "Optional[Index]", variable has type "ndarray") - result = _to_datetime_with_format( # type: ignore[assignment] + res = _to_datetime_with_format( arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format ) - if result is not None: - return result - - if result is None: - assert format is None or infer_datetime_format - utc = tz == "utc" - result, tz_parsed = objects_to_datetime64ns( - arg, - dayfirst=dayfirst, 
- yearfirst=yearfirst, - utc=utc, - errors=errors, - require_iso8601=require_iso8601, - allow_object=True, - ) + if res is not None: + return res - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) + assert format is None or infer_datetime_format + utc = tz == "utc" + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + require_iso8601=require_iso8601, + allow_object=True, + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name) @@ -509,13 +498,11 @@ def _to_datetime_with_format( # fallback if result is None: - # error: Incompatible types in assignment (expression has type - # "Optional[Index]", variable has type "Optional[ndarray]") - result = _array_strptime_with_fallback( # type: ignore[assignment] + res = _array_strptime_with_fallback( arg, name, tz, fmt, exact, errors, infer_datetime_format ) - if result is not None: - return result + if res is not None: + return res except ValueError as e: # Fallback to try to convert datetime objects if timezone-aware @@ -628,16 +615,16 @@ def _adjust_to_origin(arg, origin, unit): if offset.tz is not None: raise ValueError(f"origin offset {offset} must be tz-naive") - offset -= Timestamp(0) + td_offset = offset - Timestamp(0) # convert the offset to the unit of the arg # this should be lossless in terms of precision - offset = offset // Timedelta(1, unit=unit) + ioffset = td_offset // Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): arg = np.asarray(arg) - arg = arg + offset + arg = arg + ioffset return arg @@ -855,8 +842,19 @@ def to_datetime( >>> pd.to_datetime([1, 2, 3], unit='D', ... origin=pd.Timestamp('1960-01-01')) - DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], \ -dtype='datetime64[ns]', freq=None) + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], + dtype='datetime64[ns]', freq=None) + + In case input is list-like and the elements of input are of mixed + timezones, return will have object type Index if utc=False. + + >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500']) + Index([2018-10-26 12:00:00-05:30, 2018-10-26 12:00:00-05:00], dtype='object') + + >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], + ... 
utc=True) + DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) """ if arg is None: return None @@ -876,13 +874,17 @@ def to_datetime( infer_datetime_format=infer_datetime_format, ) + result: Timestamp | NaTType | Series | Index + if isinstance(arg, Timestamp): result = arg if tz is not None: if arg.tz is not None: - result = result.tz_convert(tz) + # error: Too many arguments for "tz_convert" of "NaTType" + result = result.tz_convert(tz) # type: ignore[call-arg] else: - result = result.tz_localize(tz) + # error: Too many arguments for "tz_localize" of "NaTType" + result = result.tz_localize(tz) # type: ignore[call-arg] elif isinstance(arg, ABCSeries): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: @@ -917,7 +919,10 @@ def to_datetime( else: result = convert_listlike(np.array([arg]), format)[0] - return result + # error: Incompatible return value type (got "Union[Timestamp, NaTType, + # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, + # NaTType, None]") + return result # type: ignore[return-value] # mappings for assembling units diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d84dea7ee622c..9407efd0bef2b 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -80,7 +80,7 @@ def roll_apply( return roll_apply -def generate_numba_groupby_ewma_func( +def generate_numba_ewma_func( engine_kwargs: Optional[Dict[str, bool]], com: float, adjust: bool, @@ -88,7 +88,7 @@ def generate_numba_groupby_ewma_func( deltas: np.ndarray, ): """ - Generate a numba jitted groupby ewma function specified by values + Generate a numba jitted ewma function specified by values from engine_kwargs. 
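Aside: a hedged sketch of the user-facing call that reaches this jitted ewma path. It assumes numba is installed; the series values are illustrative.

    import pandas as pd

    s = pd.Series(range(10), dtype="float64")

    # Exponentially weighted mean computed through the numba engine.
    result = s.ewm(com=2.0, adjust=True).mean(engine="numba")
    print(result.tail())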
Parameters @@ -106,14 +106,14 @@ def generate_numba_groupby_ewma_func( """ nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (lambda x: x, "groupby_ewma") + cache_key = (lambda x: x, "ewma") if cache_key in NUMBA_FUNC_CACHE: return NUMBA_FUNC_CACHE[cache_key] numba = import_optional_dependency("numba") @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def groupby_ewma( + def ewma( values: np.ndarray, begin: np.ndarray, end: np.ndarray, @@ -121,15 +121,15 @@ def groupby_ewma( ) -> np.ndarray: result = np.empty(len(values)) alpha = 1.0 / (1.0 + com) + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + for i in numba.prange(len(begin)): start = begin[i] stop = end[i] window = values[start:stop] sub_result = np.empty(len(window)) - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - weighted_avg = window[0] nobs = int(not np.isnan(weighted_avg)) sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan @@ -166,7 +166,7 @@ def groupby_ewma( return result - return groupby_ewma + return ewma def generate_numba_table_func( diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e4710254d9311..1c85385c587a5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -43,10 +43,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, - ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import notna @@ -58,8 +55,11 @@ ) import pandas.core.common as com from pandas.core.indexes.api import ( + DatetimeIndex, Index, MultiIndex, + PeriodIndex, + TimedeltaIndex, ) from pandas.core.internals import ArrayManager from pandas.core.reshape.concat import concat @@ -381,10 +381,11 @@ def _apply_series( """ obj = self._create_data(self._selected_obj) - try: + if name == "count": # GH 12541: Special case for count where we support date-like types - input = obj.values if name != "count" else notna(obj.values).astype(int) - values = self._prep_values(input) + obj = notna(obj).astype(int) + try: + values = self._prep_values(obj._values) except (TypeError, NotImplementedError) as err: raise DataError("No numeric types to aggregate") from err @@ -1051,7 +1052,10 @@ def aggregate(self, func, *args, **kwargs): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = window_aggregations.roll_weighted_sum - return self._apply(window_func, name="sum", **kwargs) + # error: Argument 1 to "_apply" of "Window" has incompatible type + # "Callable[[ndarray, ndarray, int], ndarray]"; expected + # "Callable[[ndarray, int, int], ndarray]" + return self._apply(window_func, name="sum", **kwargs) # type: ignore[arg-type] @doc( template_header, @@ -1068,7 +1072,10 @@ def sum(self, *args, **kwargs): def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = window_aggregations.roll_weighted_mean - return self._apply(window_func, name="mean", **kwargs) + # error: Argument 1 to "_apply" of "Window" has incompatible type + # "Callable[[ndarray, ndarray, int], ndarray]"; expected + # "Callable[[ndarray, int, int], ndarray]" + return self._apply(window_func, name="mean", **kwargs) # type: ignore[arg-type] @doc( template_header, @@ -1455,9 +1462,7 @@ def validate(self): # we allow rolling on a datetimelike index if ( self.obj.empty - or isinstance( - self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) - ) + or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) ) and 
isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_monotonic() @@ -1470,7 +1475,7 @@ def validate(self): f"passed window {self.window} is not " "compatible with a datetimelike index" ) from err - if isinstance(self._on, ABCPeriodIndex): + if isinstance(self._on, PeriodIndex): self._win_freq_i8 = freq.nanos / (self._on.freq.nanos / self._on.freq.n) else: self._win_freq_i8 = freq.nanos diff --git a/pandas/io/common.py b/pandas/io/common.py index 00966d39dd99d..06b00a9cbb4eb 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -618,7 +618,12 @@ def get_handle( # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors + handle, + memory_map, + ioargs.encoding, + ioargs.mode, + errors, + ioargs.compression["method"] not in _compression_to_extension, ) is_path = isinstance(handle, str) @@ -820,7 +825,18 @@ class _MMapWrapper(abc.Iterator): """ - def __init__(self, f: IO): + def __init__( + self, + f: IO, + encoding: str = "utf-8", + errors: str = "strict", + decode: bool = True, + ): + self.encoding = encoding + self.errors = errors + self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) + self.decode = decode + self.attributes = {} for attribute in ("seekable", "readable", "writeable"): if not hasattr(f, attribute): @@ -836,19 +852,30 @@ def __getattr__(self, name: str): def __iter__(self) -> _MMapWrapper: return self + def read(self, size: int = -1) -> str | bytes: + # CSV c-engine uses read instead of iterating + content: bytes = self.mmap.read(size) + if self.decode: + # memory mapping is applied before compression. Encoding should + # be applied to the de-compressed data. + return content.decode(self.encoding, errors=self.errors) + return content + def __next__(self) -> str: newbytes = self.mmap.readline() # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newbytes.decode("utf-8") + newline = self.decoder.decode(newbytes) # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned if newline == "": raise StopIteration - return newline + + # IncrementalDecoder seems to push newline to the next line + return newline.lstrip("\n") def _maybe_memory_map( @@ -857,6 +884,7 @@ def _maybe_memory_map( encoding: str, mode: str, errors: str | None, + decode: bool, ) -> tuple[FileOrBuffer, bool, list[Buffer]]: """Try to memory map file/buffer.""" handles: list[Buffer] = [] @@ -877,7 +905,10 @@ def _maybe_memory_map( try: # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" - wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + wrapped = cast( + mmap.mmap, + _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + ) handle.close() handles.remove(handle) handles.append(wrapped) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 7eefd26b194ab..3c9dd90c0a0cb 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -551,7 +551,11 @@ def parse( header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) - has_index_names = is_list_like(header) and len(header) > 1 + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = ( + is_list_like(header) and len(header) > 1 
and index_col is not None + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -664,6 +668,15 @@ class ExcelWriter(metaclass=abc.ABCMeta): be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". .. versionadded:: 1.2.0 + if_sheet_exists : {'error', 'new', 'replace'}, default 'error' + How to behave when trying to write to a sheet that already + exists (append mode only). + + * error: raise a ValueError. + * new: Create a new sheet, with a name determined by the engine. + * replace: Delete the contents of the sheet before writing to it. + + .. versionadded:: 1.3.0 engine_kwargs : dict, optional Keyword arguments to be passed into the engine. @@ -760,6 +773,7 @@ def __new__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: str | None = None, engine_kwargs: dict | None = None, **kwargs, ): @@ -861,6 +875,7 @@ def __init__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: str | None = None, engine_kwargs: dict | None = None, **kwargs, ): @@ -896,6 +911,17 @@ def __init__( self.mode = mode + if if_sheet_exists not in [None, "error", "new", "replace"]: + raise ValueError( + f"'{if_sheet_exists}' is not valid for if_sheet_exists. " + "Valid options are 'error', 'new' and 'replace'." + ) + if if_sheet_exists and "r+" not in mode: + raise ValueError("if_sheet_exists is only valid in append mode (mode='a')") + if if_sheet_exists is None: + if_sheet_exists = "error" + self.if_sheet_exists = if_sheet_exists + def __fspath__(self): return getattr(self.handles.handle, "name", "") diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 1324485f49bdb..c105465cddd95 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,7 +1,4 @@ -from typing import ( - List, - cast, -) +from typing import List import numpy as np @@ -200,10 +197,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.to_datetime(cell_value) elif cell_type == "time": - result = pd.to_datetime(str(cell)) - result = cast(pd.Timestamp, result) + stamp = pd.to_datetime(str(cell)) # error: Item "str" of "Union[float, str, NaTType]" has no attribute "time" - return result.time() # type: ignore[union-attr] + return stamp.time() # type: ignore[union-attr] else: self.close() raise ValueError(f"Unrecognized type {cell_type}") diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index bfd1bcf466a7a..7b6634fff1c16 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -30,6 +30,7 @@ def __init__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: Optional[str] = None, engine_kwargs: Optional[Dict[str, Any]] = None, ): from odf.opendocument import OpenDocumentSpreadsheet @@ -41,6 +42,7 @@ def __init__( path, mode=mode, storage_options=storage_options, + if_sheet_exists=if_sheet_exists, engine_kwargs=engine_kwargs, ) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 860971a7967da..a99f8e2625602 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -37,6 +37,7 @@ def __init__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, ): # Use the openpyxl module as the Excel writer. 
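Aside: a minimal sketch of how the new if_sheet_exists option is meant to be used in append mode. It assumes openpyxl is installed; the file path and sheet name are placeholders.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df.to_excel("report.xlsx", sheet_name="data")  # create the workbook first

    # Append mode plus if_sheet_exists="replace" rewrites the existing sheet
    # instead of raising a ValueError.
    with pd.ExcelWriter("report.xlsx", mode="a", if_sheet_exists="replace") as writer:
        df.to_excel(writer, sheet_name="data")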
@@ -46,6 +47,7 @@ def __init__( path, mode=mode, storage_options=storage_options, + if_sheet_exists=if_sheet_exists, engine_kwargs=engine_kwargs, ) @@ -56,6 +58,8 @@ def __init__( self.book = load_workbook(self.handles.handle) self.handles.handle.seek(0) + self.sheets = {name: self.book[name] for name in self.book.sheetnames} + else: # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -414,8 +418,26 @@ def write_cells( _style_cache: dict[str, dict[str, Serialisable]] = {} - if sheet_name in self.sheets: - wks = self.sheets[sheet_name] + if sheet_name in self.sheets and self.if_sheet_exists != "new": + if "r+" in self.mode: + if self.if_sheet_exists == "replace": + old_wks = self.sheets[sheet_name] + target_index = self.book.index(old_wks) + del self.book[sheet_name] + wks = self.book.create_sheet(sheet_name, target_index) + self.sheets[sheet_name] = wks + elif self.if_sheet_exists == "error": + raise ValueError( + f"Sheet '{sheet_name}' already exists and " + f"if_sheet_exists is set to 'error'." + ) + else: + raise ValueError( + f"'{self.if_sheet_exists}' is not valid for if_sheet_exists. " + "Valid options are 'error', 'new' and 'replace'." + ) + else: + wks = self.sheets[sheet_name] else: wks = self.book.create_sheet() wks.title = sheet_name diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 6e1b064534707..27b3ae3fab9bc 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -177,6 +177,7 @@ def __init__( datetime_format=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: Optional[str] = None, engine_kwargs: Optional[Dict[str, Any]] = None, ): # Use the xlsxwriter module as the Excel writer. @@ -194,6 +195,7 @@ def __init__( datetime_format=datetime_format, mode=mode, storage_options=storage_options, + if_sheet_exists=if_sheet_exists, engine_kwargs=engine_kwargs, ) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 776baf66536b1..8d5bd4a9608d4 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -28,6 +28,7 @@ def __init__( encoding=None, mode: str = "w", storage_options: StorageOptions = None, + if_sheet_exists: Optional[str] = None, engine_kwargs: Optional[Dict[str, Any]] = None, ): # Use the xlwt module as the Excel writer. @@ -40,6 +41,7 @@ def __init__( path, mode=mode, storage_options=storage_options, + if_sheet_exists=if_sheet_exists, engine_kwargs=engine_kwargs, ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ba406a1ef117c..9d653c9a5f97c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1318,7 +1318,6 @@ def _format(x): "ExtensionArray formatting should use ExtensionArrayFormatter" ) inferred = lib.map_infer(vals, is_float) - inferred = cast(np.ndarray, inferred) is_float_type = ( inferred # vals may have 2 or more dimensions diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index ff25bb1411189..02e1369a05b93 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -200,7 +200,46 @@ def _repr_html_(self) -> str: """ Hooks into Jupyter notebook rich display system. """ - return self.render() + return self._render_html() + + def render(self, **kwargs) -> str: + """ + Render the ``Styler`` including all applied styles to HTML. + + Parameters + ---------- + **kwargs + Any additional keyword arguments are passed + through to ``self.template.render``. 
+ This is useful when you need to provide + additional variables for a custom template. + + Returns + ------- + rendered : str + The rendered HTML. + + Notes + ----- + Styler objects have defined the ``_repr_html_`` method + which automatically calls ``self.render()`` when it's the + last item in a Notebook cell. When calling ``Styler.render()`` + directly, wrap the result in ``IPython.display.HTML`` to view + the rendered HTML in the notebook. + + Pandas uses the following keys in render. Arguments passed + in ``**kwargs`` take precedence, so think carefully if you want + to override them: + + * head + * cellstyle + * body + * uuid + * table_styles + * caption + * table_attributes + """ + return self._render_html(**kwargs) def set_tooltips( self, @@ -572,7 +611,6 @@ def apply( See Also -------- - Styler.where: Apply CSS-styles based on a conditional function elementwise. Styler.applymap: Apply a CSS-styling function elementwise. Notes @@ -630,7 +668,6 @@ def applymap(self, func: Callable, subset=None, **kwargs) -> Styler: See Also -------- - Styler.where: Apply CSS-styles based on a conditional function elementwise. Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. Notes @@ -662,6 +699,8 @@ def where( """ Apply CSS-styles based on a conditional function elementwise. + .. deprecated:: 1.3.0 + Updates the HTML representation with a style which is selected in accordance with the return value of a function. @@ -689,13 +728,31 @@ def where( Styler.applymap: Apply a CSS-styling function elementwise. Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. - Examples - -------- - >>> def cond(v): - ... return v > 1 and v != 4 + Notes + ----- + This method is deprecated. + + This method is a convenience wrapper for :meth:`Styler.applymap`, which we + recommend using instead. + + The example: >>> df = pd.DataFrame([[1, 2], [3, 4]]) - >>> df.style.where(cond, value='color:red;', other='font-size:2em;') + >>> def cond(v, limit=4): + ... return v > 1 and v != limit + >>> df.style.where(cond, value='color:green;', other='color:red;') + + should be refactored to: + >>> def style_func(v, value, other, limit=4): + ... cond = v > 1 and v != limit + ... return value if cond else other + >>> df.style.applymap(style_func, value='color:green;', other='color:red;') """ + warnings.warn( + "this method is deprecated in favour of `Styler.applymap()`", + FutureWarning, + stacklevel=2, + ) + if other is None: other = "" diff --git a/pandas/io/orc.py b/pandas/io/orc.py index db14a07e4b91b..6bdb4df806b5c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,10 +1,10 @@ """ orc compat """ from __future__ import annotations -import distutils from typing import TYPE_CHECKING from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.io.common import get_handle @@ -42,13 +42,16 @@ def read_orc( Returns ------- DataFrame + + Notes + ------- + Before using this function you should read the :ref:`user guide about ORC ` + and :ref:`install optional dependencies `. 
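Aside: a hedged sketch of the pyarrow-backed ORC read described above. The file name and column list are placeholders; pyarrow with ORC support must be installed.

    import pandas as pd

    # 'data.orc' and the columns are purely illustrative.
    df = pd.read_orc("data.orc", columns=["a", "b"])
    print(df.dtypes)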
""" # we require a newer version of pyarrow than we support for parquet - import pyarrow - if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": - raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + orc = import_optional_dependency("pyarrow.orc") with get_handle(path, "rb", is_text=False) as handles: - orc_file = pyarrow.orc.ORCFile(handles.handle) + orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index fcb077eee0624..3801a29fec39e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -231,6 +231,9 @@ def read( "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " f"({self.api.__version__} is installed" ) + manager = get_option("mode.data_manager") + if manager == "array": + to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( path, @@ -239,9 +242,12 @@ def read( mode="rb", ) try: - return self.api.parquet.read_table( + result = self.api.parquet.read_table( path_or_handle, columns=columns, **kwargs ).to_pandas(**to_pandas_kwargs) + if manager == "array": + result = result._as_manager("array", copy=False) + return result finally: if handles is not None: handles.close() diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index abf6128699a21..fbf2a53207f75 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,22 @@ +from __future__ import annotations + +import warnings + +import numpy as np + import pandas._libs.parsers as parsers -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + ArrayLike, + FilePathOrBuffer, +) +from pandas.errors import DtypeWarning + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + pandas_dtype, +) +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.indexes.api import ensure_index_from_sequences @@ -10,12 +27,16 @@ class CParserWrapper(ParserBase): + low_memory: bool + def __init__(self, src: FilePathOrBuffer, **kwds): self.kwds = kwds kwds = kwds.copy() ParserBase.__init__(self, kwds) + self.low_memory = kwds.pop("low_memory", False) + # #2442 # error: Cannot determine type of 'index_col' kwds["allow_leading_cols"] = ( @@ -30,26 +51,8 @@ def __init__(self, src: FilePathOrBuffer, **kwds): assert self.handles is not None for key in ("storage_options", "encoding", "memory_map", "compression"): kwds.pop(key, None) - if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): - # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - - # error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap" - self.handles.handle = 
self.handles.handle.mmap # type: ignore[union-attr] + kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) try: self._reader = parsers.TextReader(self.handles.handle, **kwds) except Exception: @@ -206,7 +209,13 @@ def set_error_bad_lines(self, status): def read(self, nrows=None): try: - data = self._reader.read(nrows) + if self.low_memory: + chunks = self._reader.read_low_memory(nrows) + # destructive to chunks + data = _concatenate_chunks(chunks) + + else: + data = self._reader.read(nrows) except StopIteration: # error: Cannot determine type of '_first_chunk' if self._first_chunk: # type: ignore[has-type] @@ -313,7 +322,76 @@ def _get_index_names(self): return names, idx_names - def _maybe_parse_dates(self, values, index, try_parse_dates=True): + def _maybe_parse_dates(self, values, index: int, try_parse_dates=True): if try_parse_dates and self._should_parse_dates(index): values = self._date_conv(values) return values + + +def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: + """ + Concatenate chunks of data read with low_memory=True. + + The tricky part is handling Categoricals, where different chunks + may have different inferred categories. + """ + names = list(chunks[0].keys()) + warning_columns = [] + + result = {} + for name in names: + arrs = [chunk.pop(name) for chunk in chunks] + # Check each arr for consistent types. + dtypes = {a.dtype for a in arrs} + # TODO: shouldn't we exclude all EA dtypes here? + numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + if len(numpy_dtypes) > 1: + # error: Argument 1 to "find_common_type" has incompatible type + # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type, + # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, + # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]" + common_type = np.find_common_type( + numpy_dtypes, # type: ignore[arg-type] + [], + ) + if common_type == object: + warning_columns.append(str(name)) + + dtype = dtypes.pop() + if is_categorical_dtype(dtype): + result[name] = union_categoricals(arrs, sort_categories=False) + else: + if isinstance(dtype, ExtensionDtype): + # TODO: concat_compat? + array_type = dtype.construct_array_type() + # error: Argument 1 to "_concat_same_type" of "ExtensionArray" + # has incompatible type "List[Union[ExtensionArray, ndarray]]"; + # expected "Sequence[ExtensionArray]" + result[name] = array_type._concat_same_type( + arrs # type: ignore[arg-type] + ) + else: + result[name] = np.concatenate(arrs) + + if warning_columns: + warning_names = ",".join(warning_columns) + warning_message = " ".join( + [ + f"Columns ({warning_names}) have mixed types." + f"Specify dtype option on import or set low_memory=False." + ] + ) + warnings.warn(warning_message, DtypeWarning, stacklevel=8) + return result + + +def ensure_dtype_objs(dtype): + """ + Ensure we have either None, a dtype object, or a dictionary mapping to + dtype objects. 
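Aside: a small sketch of the Categorical-across-chunks problem _concatenate_chunks handles when reading with low_memory=True. The data is illustrative.

    import pandas as pd
    from pandas.api.types import union_categoricals

    # Two chunks of the same CSV column can infer different categories.
    chunk1 = pd.Categorical(["a", "b"])
    chunk2 = pd.Categorical(["b", "c"])

    # union_categoricals re-aligns the categories before combining, which is
    # what naive concatenation of the integer codes would get wrong.
    combined = union_categoricals([chunk1, chunk2], sort_categories=False)
    print(combined.categories)
    print(combined)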
+ """ + if isinstance(dtype, dict): + dtype = {k: pandas_dtype(dtype[k]) for k in dtype} + elif dtype is not None: + dtype = pandas_dtype(dtype) + return dtype diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 9f62d63c680f6..a6d38eab99977 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -443,7 +443,8 @@ def _infer_columns(self): ic = len(sic) if sic is not None else 0 unnamed_count = len(this_unnamed_cols) - if lc != unnamed_count and lc - ic > unnamed_count: + # if wrong number of blanks or no index, not our format + if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0: clear_buffer = False this_columns = [None] * lc self.buf = [self.buf[-1]] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d797fa51984d6..04a7ccb538a67 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -27,6 +27,8 @@ import pandas._libs.lib as lib from pandas._typing import DtypeArg +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -36,6 +38,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna +from pandas import get_option from pandas.core.api import ( DataFrame, Series, @@ -643,6 +646,8 @@ def to_sql( chunksize: int | None = None, dtype: DtypeArg | None = None, method: str | None = None, + engine: str = "auto", + **engine_kwargs, ) -> None: """ Write records stored in a DataFrame to a SQL database. @@ -689,6 +694,16 @@ def to_sql( section :ref:`insert method `. .. versionadded:: 0.24.0 + + engine : {'auto', 'sqlalchemy'}, default 'auto' + SQL engine library to use. If 'auto', then the option + ``io.sql.engine`` is used. The default ``io.sql.engine`` + behavior is 'sqlalchemy' + + .. versionadded:: 1.3.0 + + **engine_kwargs + Any additional kwargs are passed to the engine. """ if if_exists not in ("fail", "replace", "append"): raise ValueError(f"'{if_exists}' is not valid for if_exists") @@ -712,6 +727,8 @@ def to_sql( chunksize=chunksize, dtype=dtype, method=method, + engine=engine, + **engine_kwargs, ) @@ -1283,6 +1300,91 @@ def to_sql( ) +class BaseEngine: + def insert_records( + self, + table: SQLTable, + con, + frame, + name, + index=True, + schema=None, + chunksize=None, + method=None, + **engine_kwargs, + ): + """ + Inserts data into already-prepared table + """ + raise AbstractMethodError(self) + + +class SQLAlchemyEngine(BaseEngine): + def __init__(self): + import_optional_dependency( + "sqlalchemy", extra="sqlalchemy is required for SQL support." + ) + + def insert_records( + self, + table: SQLTable, + con, + frame, + name, + index=True, + schema=None, + chunksize=None, + method=None, + **engine_kwargs, + ): + from sqlalchemy import exc + + try: + table.insert(chunksize=chunksize, method=method) + except exc.SQLAlchemyError as err: + # GH34431 + # https://stackoverflow.com/a/67358288/6067848 + msg = r"""(\(1054, "Unknown column 'inf(e0)?' 
in 'field list'"\))(?# + )|inf can not be used with MySQL""" + err_text = str(err.orig) + if re.search(msg, err_text): + raise ValueError("inf cannot be used with MySQL") from err + else: + raise err + + +def get_engine(engine: str) -> BaseEngine: + """ return our implementation """ + if engine == "auto": + engine = get_option("io.sql.engine") + + if engine == "auto": + # try engines in this order + engine_classes = [SQLAlchemyEngine] + + error_msgs = "" + for engine_class in engine_classes: + try: + return engine_class() + except ImportError as err: + error_msgs += "\n - " + str(err) + + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'sqlalchemy'.\n" + "A suitable version of " + "sqlalchemy is required for sql I/O " + "support.\n" + "Trying to import the above resulted in these errors:" + f"{error_msgs}" + ) + + elif engine == "sqlalchemy": + return SQLAlchemyEngine() + + raise ValueError("engine must be one of 'auto', 'sqlalchemy'") + + class SQLDatabase(PandasSQL): """ This class enables conversion between DataFrame and SQL databases @@ -1504,7 +1606,7 @@ def read_query( read_sql = read_query - def to_sql( + def prep_table( self, frame, name, @@ -1512,50 +1614,10 @@ def to_sql( index=True, index_label=None, schema=None, - chunksize=None, dtype: DtypeArg | None = None, - method=None, - ): + ) -> SQLTable: """ - Write records stored in a DataFrame to a SQL database. - - Parameters - ---------- - frame : DataFrame - name : string - Name of SQL table. - if_exists : {'fail', 'replace', 'append'}, default 'fail' - - fail: If table exists, do nothing. - - replace: If table exists, drop it, recreate it, and insert data. - - append: If table exists, insert data. Create if does not exist. - index : bool, default True - Write DataFrame index as a column. - index_label : string or sequence, default None - Column label for index column(s). If None is given (default) and - `index` is True, then the index names are used. - A sequence should be given if the DataFrame uses MultiIndex. - schema : string, default None - Name of SQL schema in database to write to (if database flavor - supports this). If specified, this overwrites the default - schema of the SQLDatabase object. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single type or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. If all columns are of the same type, one - single value can be used. - method : {None', 'multi', callable}, default None - Controls the SQL insertion clause used: - - * None : Uses standard SQL ``INSERT`` clause (one per row). - * 'multi': Pass multiple values in a single ``INSERT`` clause. - * callable with signature ``(pd_table, conn, keys, data_iter)``. - - Details and a sample callable implementation can be found in the - section :ref:`insert method `. - - .. versionadded:: 0.24.0 + Prepares table in the database for data insertion. Creates it if needed, etc. 
""" if dtype: if not is_dict_like(dtype): @@ -1589,15 +1651,17 @@ def to_sql( dtype=dtype, ) table.create() + return table - from sqlalchemy.exc import SQLAlchemyError - - try: - table.insert(chunksize, method=method) - except SQLAlchemyError as err: - # GH 34431 36465 - raise ValueError("inf cannot be used with MySQL") from err - + def check_case_sensitive( + self, + name, + schema, + ): + """ + Checks table name for issues with case-sensitivity. + Method is called after data is inserted. + """ if not name.isdigit() and not name.islower(): # check for potentially case sensitivity issues (GH7815) # Only check when name is not a number and name is not lower case @@ -1623,6 +1687,97 @@ def to_sql( ) warnings.warn(msg, UserWarning) + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype: DtypeArg | None = None, + method=None, + engine="auto", + **engine_kwargs, + ): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. + dtype : single type or dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. + method : {None', 'multi', callable}, default None + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + + engine : {'auto', 'sqlalchemy'}, default 'auto' + SQL engine library to use. If 'auto', then the option + ``io.sql.engine`` is used. The default ``io.sql.engine`` + behavior is 'sqlalchemy' + + .. versionadded:: 1.3.0 + + **engine_kwargs + Any additional kwargs are passed to the engine. 
+ """ + sql_engine = get_engine(engine) + + table = self.prep_table( + frame=frame, + name=name, + if_exists=if_exists, + index=index, + index_label=index_label, + schema=schema, + dtype=dtype, + ) + + sql_engine.insert_records( + table=table, + con=self.connectable, + frame=frame, + name=name, + index=index, + schema=schema, + chunksize=chunksize, + method=method, + **engine_kwargs, + ) + + self.check_case_sensitive(name=name, schema=schema) + @property def tables(self): return self.meta.tables @@ -2008,6 +2163,7 @@ def to_sql( chunksize=None, dtype: DtypeArg | None = None, method=None, + **kwargs, ): """ Write records stored in a DataFrame to a SQL database. diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 6a81e3ae43b5d..21f30c1311e17 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -101,7 +101,7 @@ def _validate_color_args(self): self._boxes_c = colors[0] self._whiskers_c = colors[0] self._medians_c = colors[2] - self._caps_c = "k" # mpl default + self._caps_c = colors[0] def _get_colors(self, num_colors=None, color_kwds="color"): pass diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index d90592c68e351..1e97db152c294 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -9,6 +9,18 @@ UInt64Index, ) import pandas._testing as tm +from pandas.core.computation import expressions as expr + + +@pytest.fixture( + autouse=True, scope="module", params=[0, 1000000], ids=["numexpr", "python"] +) +def switch_numexpr_min_elements(request): + _MIN_ELEMENTS = expr._MIN_ELEMENTS + expr._MIN_ELEMENTS = request.param + yield request.param + expr._MIN_ELEMENTS = _MIN_ELEMENTS + # ------------------------------------------------------------------ # Helper Functions diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index bdd954c1e2222..9e1d13eac5039 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -27,6 +27,7 @@ ) import pandas._testing as tm from pandas.core import ops +from pandas.core.computation import expressions as expr @pytest.fixture(params=[Index, Series, tm.to_array]) @@ -391,7 +392,7 @@ def test_div_negative_zero(self, zero, numeric_idx, op): # ------------------------------------------------------------------ @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) - def test_ser_div_ser(self, dtype1, any_real_dtype): + def test_ser_div_ser(self, switch_numexpr_min_elements, dtype1, any_real_dtype): # no longer do integer div for any ops, but deal with the 0's dtype2 = any_real_dtype @@ -405,6 +406,11 @@ def test_ser_div_ser(self, dtype1, any_real_dtype): name=None, ) expected.iloc[0:3] = np.inf + if first.dtype == "int64" and second.dtype == "float32": + # when using numexpr, the casting rules are slightly different + # and int64/float32 combo results in float32 instead of float64 + if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: + expected = expected.astype("float32") result = first / second tm.assert_series_equal(result, expected) @@ -890,7 +896,13 @@ def test_series_frame_radd_bug(self): # really raise this time now = pd.Timestamp.now().to_pydatetime() - msg = "unsupported operand type" + msg = "|".join( + [ + "unsupported operand type", + # wrong error message, see https://github.com/numpy/numpy/issues/18832 + "Concatenation operation", + ] + ) with pytest.raises(TypeError, match=msg): 
now + ts diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 56d474497a166..7bb86987456f1 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -317,16 +317,24 @@ def test_validate_inplace_raises(self, value): cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) with pytest.raises(ValueError, match=msg): - cat.rename_categories(["X", "Y", "Z"], inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.rename_categories(["X", "Y", "Z"], inplace=value) with pytest.raises(ValueError, match=msg): - cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) with pytest.raises(ValueError, match=msg): - cat.add_categories(new_categories=["D", "E", "F"], inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.add_categories(new_categories=["D", "E", "F"], inplace=value) with pytest.raises(ValueError, match=msg): - cat.remove_categories(removals=["D", "E", "F"], inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.remove_categories(removals=["D", "E", "F"], inplace=value) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index a6dea639488a2..10e29dc82c050 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -82,7 +82,10 @@ def test_rename_categories(self): tm.assert_categorical_equal(result, expected) # and now inplace - res = cat.rename_categories([1, 2, 3], inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.rename_categories([1, 2, 3], inplace=True) + assert res is None tm.assert_numpy_array_equal( cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) @@ -114,7 +117,10 @@ def test_rename_categories_dict(self): tm.assert_index_equal(res.categories, expected) # Test for inplace - res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) + assert res is None tm.assert_index_equal(cat.categories, expected) @@ -153,7 +159,10 @@ def test_reorder_categories(self): tm.assert_categorical_equal(res, new) # inplace == True - res = cat.reorder_categories(["c", "b", "a"], inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.reorder_categories(["c", "b", "a"], inplace=True) + assert res is None tm.assert_categorical_equal(cat, new) @@ -188,7 +197,10 @@ def test_add_categories(self): tm.assert_categorical_equal(res, new) # inplace == True - res = cat.add_categories("d", inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.add_categories("d", inplace=True) + tm.assert_categorical_equal(cat, new) assert res is None @@ -354,7 +366,10 @@ def test_remove_categories(self): tm.assert_categorical_equal(res, new) # inplace == True - res = 
cat.remove_categories("c", inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.remove_categories("c", inplace=True) + tm.assert_categorical_equal(cat, new) assert res is None diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d8fca91c5516a..fde45a1e39bb2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -271,6 +271,13 @@ def test_arrow_table_roundtrip(breaks): expected = pd.concat([df, df], ignore_index=True) tm.assert_frame_equal(result, expected) + # GH-41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + @pyarrow_skip @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..d64dd6fa24d2c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas.util._test_decorators as td @@ -5,6 +6,10 @@ import pandas as pd import pandas._testing as tm +pa = pytest.importorskip("pyarrow", minversion="0.15.0") + +from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] arrays += [pd.array([True, False, True, None], dtype="boolean")] @@ -15,10 +20,8 @@ def data(request): return request.param -@td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 - import pyarrow as pa arr = pa.array(data) expected = pa.array( @@ -31,7 +34,6 @@ def test_arrow_array(data): @td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_roundtrip(data): # roundtrip possible from arrow 0.16.0 - import pyarrow as pa df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -41,11 +43,25 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_load_from_zero_chunks(data): + # GH-41040 + + df = pd.DataFrame({"a": data[0:0]}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + table = pa.table( + [pa.chunked_array([], type=table.field("a").type)], schema=table.schema + ) + result = table.to_pandas() + assert result["a"].dtype == data.dtype + tm.assert_frame_equal(result, df) + + @td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_from_arrow_uint(): # https://github.com/pandas-dev/pandas/issues/31896 # possible mismatch in types - import pyarrow as pa dtype = pd.UInt32Dtype() result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) @@ -55,12 +71,113 @@ def test_arrow_from_arrow_uint(): @td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_sliced(): +def test_arrow_sliced(data): # https://github.com/pandas-dev/pandas/issues/38525 - import pyarrow as pa - df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) + df = pd.DataFrame({"a": data}) table = pa.table(df) result = table.slice(2, None).to_pandas() expected = df.iloc[2:].reset_index(drop=True) tm.assert_frame_equal(result, expected) + + # no missing values + df2 = df.fillna(data[0]) + table = pa.table(df2) + result = 
table.slice(2, None).to_pandas() + expected = df2.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def np_dtype_to_arrays(any_real_dtype): + np_dtype = np.dtype(any_real_dtype) + pa_type = pa.from_numpy_dtype(np_dtype) + + # None ensures the creation of a bitmask buffer. + pa_array = pa.array([0, 1, 2, None], type=pa_type) + # Since masked Arrow buffer slots are not required to contain a specific + # value, assert only the first three values of the created np.array + np_expected = np.array([0, 1, 2], dtype=np_dtype) + mask_expected = np.array([True, True, True, False]) + return np_dtype, pa_array, np_expected, mask_expected + + +def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): + """ + Test conversion from pyarrow array to numpy array. + + Modifies the pyarrow buffer to contain padding and offset, which are + considered valid buffers by pyarrow. + + Also tests empty pyarrow arrays with non empty buffers. + See https://github.com/pandas-dev/pandas/issues/40896 + """ + np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays + data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + mask_buffer = pa_array.buffers()[0] + data_buffer = pa_array.buffers()[1] + data_buffer_bytes = pa_array.buffers()[1].to_pybytes() + + # Add trailing padding to the buffer. + data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00") + pa_array_trail = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer, data_buffer_trail], + offset=pa_array.offset, + ) + pa_array_trail.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Add offset to the buffer. 
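+    # The zero bytes prepended below are exactly one element wide, so the
+    # original values shift by one physical slot and offset=1 points back at
+    # them. The replacement mask byte 0x0E is 0b00001110; Arrow validity
+    # bitmaps are least-significant-bit first, so with the offset of 1 the
+    # four logical elements read bits 1-4 -> valid, valid, valid, null,
+    # which matches mask_expected above.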
+ offset = b"\x00" * (pa_array.type.bit_width // 8) + data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) + mask_buffer_offset = pa.py_buffer(b"\x0E") + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer_offset, data_buffer_offset], + offset=pa_array.offset + 1, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Empty array + np_expected_empty = np.array([], dtype=np_dtype) + mask_expected_empty = np.array([], dtype=np.bool_) + + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=0, + buffers=[mask_buffer, data_buffer], + offset=pa_array.offset, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected_empty) + tm.assert_numpy_array_equal(mask, mask_expected_empty) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_from_arrow_type_error(request, data): + # ensure that __from_arrow__ returns a TypeError when getting a wrong + # array type + if data.dtype != "boolean": + # TODO numeric dtypes cast any incoming array to the correct dtype + # instead of erroring + request.node.add_marker( + pytest.mark.xfail(reason="numeric dtypes don't error but cast") + ) + + arr = pa.array(data).cast("string") + with pytest.raises(TypeError, match=None): + # we don't test the exact error message, only the fact that it raises + # a TypeError is relevant + data.dtype.__from_arrow__(arr) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index f4e803cf4405f..398972a682504 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -100,6 +100,26 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pyarrow_skip +def test_arrow_load_from_zero_chunks(): + # GH-41040 + import pyarrow as pa + + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([], freq="D") + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + @pyarrow_skip def test_arrow_table_roundtrip_without_metadata(): import pyarrow as pa diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index e073871f96bb4..a96e5b07b7f7e 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1311,3 +1311,25 @@ def test_dropna(fill_value): df = pd.DataFrame({"a": [0, 1], "b": arr}) expected_df = pd.DataFrame({"a": [1], "b": exp}, index=pd.Int64Index([1])) tm.assert_equal(df.dropna(), expected_df) + + +class TestMinMax: + plain_data = np.arange(5).astype(float) + data_neg = plain_data * (-1) + data_NaN = SparseArray(np.array([0, 1, 2, np.nan, 4])) + data_all_NaN = SparseArray(np.array([np.nan, np.nan, np.nan, np.nan, np.nan])) + + @pytest.mark.parametrize( + "raw_data,max_expected,min_expected", + [ + (plain_data, [4], [0]), + (data_neg, [0], [-4]), + (data_NaN, [4], [0]), + (data_all_NaN, [np.nan], [np.nan]), + ], + ) + def test_maxmin(self, raw_data, max_expected, 
min_expected): + max_result = SparseArray(raw_data).max() + min_result = SparseArray(raw_data).min() + assert max_result in max_expected + assert min_result in min_expected diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2b2db49c62ba2..749f3d0aee8a5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -1,7 +1,4 @@ -""" -This module tests the functionality of StringArray and ArrowStringArray. -Tests for the str accessors are in pandas/tests/strings/test_string_array.py -""" +import operator import numpy as np import pytest @@ -91,6 +88,23 @@ def test_setitem_with_scalar_string(dtype): tm.assert_extension_array_equal(arr, expected) +@pytest.mark.parametrize( + "input, method", + [ + (["a", "b", "c"], operator.methodcaller("capitalize")), + (["a b", "a bc. de"], operator.methodcaller("capitalize")), + ], +) +def test_string_methods(input, method, dtype): + a = pd.Series(input, dtype=dtype) + b = pd.Series(input, dtype="object") + result = method(a.str) + expected = method(b.str) + + assert result.dtype.name == dtype + tm.assert_series_equal(result.astype(object), expected) + + def test_astype_roundtrip(dtype, request): if dtype == "arrow_string": reason = "ValueError: Could not convert object to NumPy datetime" @@ -476,7 +490,12 @@ def test_arrow_roundtrip(dtype, dtype_object): assert result.loc[2, "a"] is pd.NA -def test_value_counts_na(dtype): +def test_value_counts_na(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: boolean value of NA is ambiguous" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") @@ -487,7 +506,12 @@ def test_value_counts_na(dtype): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(dtype): +def test_value_counts_with_normalize(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: boolean value of NA is ambiguous" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) result = s.value_counts(normalize=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 62d368264752b..771d60b000a7d 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1150,7 +1150,7 @@ def test_array_interface(self, arr1d): tm.assert_numpy_array_equal(result, arr.asi8) # to other dtypes - msg = r"float\(\) argument must be a string or a number, not 'Period'" + msg = r"float\(\) argument must be a string or a( real)? 
number, not 'Period'" with pytest.raises(TypeError, match=msg): np.asarray(arr, dtype="float64") diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e6c330475e68..b9c1113e7f441 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -226,6 +226,16 @@ def test_fillna_2d(self): res4 = dta2.fillna(method="backfill") tm.assert_extension_array_equal(res4, expected2) + # test the DataFrame method while we're here + df = pd.DataFrame(dta) + res = df.fillna(method="pad") + expected = pd.DataFrame(expected1) + tm.assert_frame_equal(res, expected) + + res = df.fillna(method="backfill") + expected = pd.DataFrame(expected2) + tm.assert_frame_equal(res, expected) + def test_array_interface_tz(self): tz = "US/Central" data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..d1e6409307915 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.common import ( ensure_int32, is_bool, + is_complex, is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, @@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize( + "data0", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + @pytest.mark.parametrize( + "data1", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + def test_maybe_convert_objects_itemsize(self, data0, data1): + # GH 40908 + data = [data0, data1] + arr = np.array(data, dtype="object") + + common_kind = np.find_common_type( + [type(data0), type(data1)], scalar_types=[] + ).kind + kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind + kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind + if kind0 != "python" and kind1 != "python": + kind = common_kind + itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize) + elif is_bool(data0) or is_bool(data1): + kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object" + itemsize = "" + elif is_complex(data0) or is_complex(data1): + kind = common_kind + itemsize = 16 + else: + kind = common_kind + itemsize = 8 + + expected = np.array(data, dtype=f"{kind}{itemsize}") + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(result, expected) + def test_mixed_dtypes_remain_object_array(self): # GH14956 arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7c5ef5b3b27d3..99a5666926e10 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas.core.internals import ObjectBlock from pandas.tests.extension.base.base import BaseExtensionTests @@ -43,10 +45,21 @@ def test_astype_str(self, data): expected = pd.Series([str(x) for x in data[:5]], dtype=str) self.assert_series_equal(result, expected) - def test_astype_string(self, 
data): + @pytest.mark.parametrize( + "nullable_string_dtype", + [ + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ], + ) + def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - result = pd.Series(data[:5]).astype("string") - expected = pd.Series([str(x) for x in data[:5]], dtype="string") + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 56c3f8216f033..6e4ed7b77cad8 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -3,7 +3,10 @@ import pandas as pd from pandas.api.extensions import ExtensionArray -from pandas.core.internals import ExtensionBlock +from pandas.core.internals.blocks import ( + DatetimeTZBlock, + ExtensionBlock, +) from pandas.tests.extension.base.base import BaseExtensionTests @@ -26,14 +29,14 @@ def test_series_constructor(self, data): assert result.dtype == data.dtype assert len(result) == len(data) if hasattr(result._mgr, "blocks"): - assert isinstance(result._mgr.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], (ExtensionBlock, DatetimeTZBlock)) assert result._mgr.array is data # Series[EA] is unboxed / boxed correctly result2 = pd.Series(result) assert result2.dtype == data.dtype if hasattr(result._mgr, "blocks"): - assert isinstance(result2._mgr.blocks[0], ExtensionBlock) + assert isinstance(result2._mgr.blocks[0], (ExtensionBlock, DatetimeTZBlock)) def test_series_constructor_no_data_with_index(self, dtype, na_value): result = pd.Series(index=[1, 2, 3], dtype=dtype) @@ -68,7 +71,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): assert result.dtypes["A"] == data.dtype assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): - assert isinstance(result._mgr.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], (ExtensionBlock, DatetimeTZBlock)) assert isinstance(result._mgr.arrays[0], ExtensionArray) def test_dataframe_from_series(self, data): @@ -76,7 +79,7 @@ def test_dataframe_from_series(self, data): assert result.dtypes[0] == data.dtype assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): - assert isinstance(result._mgr.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], (ExtensionBlock, DatetimeTZBlock)) assert isinstance(result._mgr.arrays[0], ExtensionArray) def test_series_given_mismatched_index_raises(self, data): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 759277a47f62b..f0d3fb7ff9e1b 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -16,10 +16,6 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - is_platform_windows, -) from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import is_object_dtype @@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ]: mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch") request.node.add_marker(mark) - elif is_platform_windows() or not IS64: - mark = pytest.mark.xfail(reason="results are int32, expected int64") - 
request.node.add_marker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index e8d0a789e7cbd..35ad9f3e9693b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -117,7 +117,7 @@ def test_from_records_sequencelike(self): result = DataFrame.from_records(tuples, exclude=exclude) result.columns = [columns[i] for i in sorted(columns_to_test)] tm.assert_series_equal(result["C"], df["C"]) - tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + tm.assert_series_equal(result["E1"], df["E1"]) def test_from_records_sequencelike_empty(self): # empty case diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 3fa8295084718..4004e595c832f 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -791,6 +791,34 @@ def test_setitem_slice_position(self): expected = DataFrame(arr) tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_broadcasting_rhs(self, n, box, indexer): + # GH#40440 + # TODO: Add pandas array as box after GH#40933 is fixed + df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_broadcasting_rhs_mixed_dtypes(self, n, box, indexer): + # GH#40440 + # TODO: Add pandas array as box after GH#40933 is fixed + df = DataFrame( + [[1, 3, 5], ["x", "y", "z"]] + [[2, 4, 6]] * n, columns=["a", "b", "c"] + ) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame( + [[1, 3, 5]] + [[10, 11, 12]] * (n + 1), + columns=["a", "b", "c"], + dtype="object", + ) + tm.assert_frame_equal(df, expected) + class TestDataFrameSetItemCallable: def test_setitem_callable(self): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 544960113fafc..1583b3f91bea2 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -428,11 +428,27 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]" + msg = "|".join( + [ + # BlockManager path + fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]", + # ArrayManager path + "cannot astype a datetimelike from " + fr"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]", + ] + ) with pytest.raises(TypeError, match=msg): df.astype(other) - msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]" + msg = "|".join( + [ + # BlockManager path + fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]", + # ArrayManager path + "cannot astype a timedelta from " + fr"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]", + ] + ) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) diff --git 
a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 8a2374a414482..6525109da4394 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -144,17 +144,25 @@ def test_clip_with_na_args(self, float_frame): tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) - # GH#19992 + # GH#19992 and adjusted in GH#40420 df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) result = df.clip(lower=[4, 5, np.nan], axis=0) expected = DataFrame( - {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} ) tm.assert_frame_equal(result, expected) result = df.clip(lower=[4, 5, np.nan], axis=1) expected = DataFrame( - {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]} ) tm.assert_frame_equal(result, expected) + + # GH#40420 + data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} + df = DataFrame(data) + t = Series([2, -4, np.NaN, 6, 3]) + result = df.clip(lower=t, axis=0) + expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index d8f93f047e74b..e6ed60dc2bb08 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,6 +10,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p20 + import pandas as pd from pandas import ( DataFrame, @@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, replacer): + def test_replace_replacer_dtype(self, request, replacer): # GH26632 + if np.isscalar(replacer) and replacer.dtype.itemsize < 8: + request.node.add_marker( + pytest.mark.xfail( + np_version_under1p20, reason="np.putmask doesn't coerce dtype" + ) + ) df = DataFrame(["a"]) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 430abd9700a23..62dc400f8de9f 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -96,15 +96,18 @@ def test_set_index_cast_datetimeindex(self): idf = df.set_index("A") assert isinstance(idf.index, DatetimeIndex) - def test_set_index_dst(self): + def test_set_index_dst(self, using_array_manager): di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level res = df.set_index("index") exp = DataFrame( - data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index") + data={"a": [0, 1, 2], "b": [3, 4, 5]}, + index=Index(di, name="index"), ) + if not using_array_manager: + exp.index = exp.index._with_freq(None) tm.assert_frame_equal(res, exp) # GH#12920 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c6816fa6481f4..b9f6e72acf71b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -12,11 +12,13 @@ import pandas as pd from pandas import ( DataFrame, + Index, MultiIndex, Series, ) import pandas._testing as 
tm import pandas.core.common as com +from pandas.core.computation import expressions as expr from pandas.core.computation.expressions import ( _MIN_ELEMENTS, NUMEXPR_INSTALLED, @@ -27,6 +29,16 @@ ) +@pytest.fixture( + autouse=True, scope="module", params=[0, 1000000], ids=["numexpr", "python"] +) +def switch_numexpr_min_elements(request): + _MIN_ELEMENTS = expr._MIN_ELEMENTS + expr._MIN_ELEMENTS = request.param + yield request.param + expr._MIN_ELEMENTS = _MIN_ELEMENTS + + class DummyElement: def __init__(self, value, dtype): self.value = value @@ -174,9 +186,19 @@ def test_timestamp_compare(self): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("20010109"), df) # nats - expected = left_f(df, pd.Timestamp("nat")) - result = right_f(pd.Timestamp("nat"), df) - tm.assert_frame_equal(result, expected) + if left in ["eq", "ne"]: + expected = left_f(df, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), df) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "'(<|>)=?' not supported between " + "instances of 'numpy.ndarray' and 'NaTType'" + ) + with pytest.raises(TypeError, match=msg): + left_f(df, pd.Timestamp("nat")) + with pytest.raises(TypeError, match=msg): + right_f(pd.Timestamp("nat"), df) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, @@ -504,7 +526,12 @@ def f(x, y): @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"]) def test_arith_flex_frame_mixed( - self, op, int_frame, mixed_int_frame, mixed_float_frame + self, + op, + int_frame, + mixed_int_frame, + mixed_float_frame, + switch_numexpr_min_elements, ): f = getattr(operator, op) @@ -518,6 +545,12 @@ def test_arith_flex_frame_mixed( dtype = {"B": "uint64", "C": None} elif op in ["__add__", "__mul__"]: dtype = {"C": None} + if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: + # when using numexpr, the casting rules are slightly different: + # in the `2 + mixed_int_frame` operation, int32 column becomes + # and int64 column (not preserving dtype in operation with Python + # scalar), and then the int32/int64 combo results in int64 result + dtype["A"] = (2 + mixed_int_frame)["A"].dtype tm.assert_frame_equal(result, expected) _check_mixed_int(result, dtype=dtype) @@ -882,7 +915,7 @@ def test_frame_with_frame_reindex(self): ], ids=lambda x: x.__name__, ) - def test_binop_other(self, op, value, dtype): + def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): skip = { (operator.truediv, "bool"), @@ -931,16 +964,18 @@ def test_binop_other(self, op, value, dtype): elif (op, dtype) in skip: if op in [operator.add, operator.mul]: - with tm.assert_produces_warning(UserWarning): + if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: # "evaluating in Python space because ..." + warn = UserWarning + else: + warn = None + with tm.assert_produces_warning(warn): op(s, e.value) else: msg = "operator '.*' not implemented for .* dtypes" with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(UserWarning): - # "evaluating in Python space because ..." 
- op(s, e.value) + op(s, e.value) else: # FIXME: Since dispatching to Series, this test no longer @@ -1782,3 +1817,22 @@ def test_inplace_arithmetic_series_update(): expected = DataFrame({"A": [2, 3, 4]}) tm.assert_frame_equal(df, expected) + + +def test_arithemetic_multiindex_align(): + """ + Regression test for: https://github.com/pandas-dev/pandas/issues/33765 + """ + df1 = DataFrame( + [[1]], + index=["a"], + columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]), + ) + df2 = DataFrame([[1]], index=["a"], columns=Index([0], name="a")) + expected = DataFrame( + [[0]], + index=["a"], + columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]), + ) + result = df1 - df2 + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 748aa462cddae..ba0acdc4f947b 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -45,7 +45,7 @@ def test_setitem_invalidates_datetime_index_freq(self): ts = dti[1] df = DataFrame({"B": dti}) - assert df["B"]._values.freq == "D" + assert df["B"]._values.freq is None df.iloc[1, 0] = pd.NaT assert df["B"]._values.freq is None diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ca68885fdc470..ab240531a7505 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -46,6 +46,7 @@ ) import pandas._testing as tm from pandas.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, SparseArray, @@ -1011,6 +1012,7 @@ def test_constructor_maskedrecarray_dtype(self): alt = DataFrame({name: data[name] for name in data.dtype.names}, dtype=int) tm.assert_frame_equal(result, alt) + @pytest.mark.slow def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays # from GH3479 @@ -1924,12 +1926,12 @@ def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int")] * 5) tm.assert_series_equal(result, expected) df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int32")] * 5) tm.assert_series_equal(result, expected) # overflow issue? 
(we always expected int64 upcasting here) @@ -2569,6 +2571,13 @@ def test_construction_from_set_raises(self, typ): with pytest.raises(TypeError, match=msg): Series(values) + def test_construction_from_ndarray_datetimelike(self): + # ensure the underlying arrays are properly wrapped as EA when + # constructed from 2D ndarray + arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3) + df = DataFrame(arr) + assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays) + def get1(obj): if isinstance(obj, Series): diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 823ce7435f229..6e5cb3add43df 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -52,33 +52,46 @@ def test_nonzero_single_element(self): s = Series([False]) assert not s.bool() - msg = "The truth value of a Series is ambiguous" + @pytest.mark.parametrize("data", [np.nan, pd.NaT, True, False]) + def test_nonzero_single_element_raise_1(self, data): # single item nan to raise - for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: - with pytest.raises(ValueError, match=msg): - bool(s) + series = Series([data]) + + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(series) + + @pytest.mark.parametrize("data", [np.nan, pd.NaT]) + def test_nonzero_single_element_raise_2(self, data): + series = Series([data]) msg = "bool cannot act on a non-boolean single element Series" - for s in [Series([np.nan]), Series([pd.NaT])]: - with pytest.raises(ValueError, match=msg): - s.bool() + with pytest.raises(ValueError, match=msg): + series.bool() + @pytest.mark.parametrize("data", [(True, True), (False, False)]) + def test_nonzero_multiple_element_raise(self, data): # multiple bool are still an error + series = Series([data]) + msg = "The truth value of a Series is ambiguous" - for s in [Series([True, True]), Series([False, False])]: - with pytest.raises(ValueError, match=msg): - bool(s) - with pytest.raises(ValueError, match=msg): - s.bool() + with pytest.raises(ValueError, match=msg): + bool(series) + with pytest.raises(ValueError, match=msg): + series.bool() + @pytest.mark.parametrize("data", [1, 0, "a", 0.0]) + def test_nonbool_single_element_raise(self, data): # single non-bool are an error - for s in [Series([1]), Series([0]), Series(["a"]), Series([0.0])]: - msg = "The truth value of a Series is ambiguous" - with pytest.raises(ValueError, match=msg): - bool(s) - msg = "bool cannot act on a non-boolean single element Series" - with pytest.raises(ValueError, match=msg): - s.bool() + series = Series([data]) + + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(series) + + msg = "bool cannot act on a non-boolean single element Series" + with pytest.raises(ValueError, match=msg): + series.bool() def test_metadata_propagation_indiv_resample(self): # resample diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 28344897a686f..b601ba92886d9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -234,11 +234,10 @@ def test_aggregate_item_by_item(df): K = len(result.columns) # GH5782 - # odd comparisons can result here, so cast to make easy - exp = Series(np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo") + exp = Series(np.array([foo] * K), index=list("BCD"), name="foo") 
tm.assert_series_equal(result.xs("foo"), exp) - exp = Series(np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar") + exp = Series(np.array([bar] * K), index=list("BCD"), name="bar") tm.assert_almost_equal(result.xs("bar"), exp) def aggfun(ser): @@ -442,6 +441,57 @@ def test_bool_agg_dtype(op): assert is_integer_dtype(result) +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize( + "input_dtype", ["bool", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize( + "result_dtype", ["bool", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_frame( + keys, agg_index, input_dtype, result_dtype, method +): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [True]}) + df["c"] = df["c"].astype(input_dtype) + op = getattr(df.groupby(keys)[["c"]], method) + result = op(lambda x: x.astype(result_dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype( + result_dtype + ) + if method == "apply": + expected.columns.names = [0] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize("input", [True, 1, 1.0]) +@pytest.mark.parametrize("dtype", [bool, int, float]) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_series(keys, agg_index, input, dtype, method): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [input]}) + op = getattr(df.groupby(keys)["c"], method) + result = op(lambda x: x.astype(dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype) + tm.assert_series_equal(result, expected) + + def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) @@ -849,7 +899,11 @@ def test_multiindex_custom_func(func): data = [[1, 4, 2], [5, 7, 1]] df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) result = df.groupby(np.array([0, 1])).agg(func) - expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} + expected_dict = { + (1, 3): {0: 1.0, 1: 5.0}, + (1, 4): {0: 4.0, 1: 7.0}, + (2, 3): {0: 2.0, 1: 1.0}, + } expected = DataFrame(expected_dict) tm.assert_frame_equal(result, expected) @@ -1105,6 +1159,11 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): expected_df = DataFrame(data=exp_data, index=cat_index) + if "cat_ord" in expected_df: + # ordered categorical columns should be preserved + dtype = input_df["cat_ord"].dtype + expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype) + tm.assert_frame_equal(result_df, expected_df) @@ -1149,6 +1208,10 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): multi_index = MultiIndex.from_tuples(tuple(multi_index_list)) expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index) + for col in expected_df.columns: + if isinstance(col, tuple) and "cat_ord" in col: + # ordered categorical should be preserved + expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype) 
tm.assert_frame_equal(result_df, expected_df) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 4a8aabe41b754..ded10ab11d5a8 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -196,6 +196,9 @@ def test_cython_agg_empty_buckets(op, targop, observed): g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) + if observed and op not in ("min", "max"): + # TODO: GH 41137 + expected = expected.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da438826a939a..7349664614614 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -800,6 +800,12 @@ def test_preserve_on_ordered_ops(func, values): ).set_index("payload") tm.assert_frame_equal(result, expected) + # we should also preserve categorical for SeriesGroupBy + sgb = df.groupby("payload")["col"] + result = getattr(sgb, func)() + expected = expected["col"] + tm.assert_series_equal(result, expected) + def test_categorical_no_compress(): data = Series(np.random.randn(9)) @@ -1494,7 +1500,11 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()}) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = Series(["b"], index=Index([1997], name="A"), name="B") + + # ordered categorical dtype should be preserved + expected = Series( + ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype + ) tm.assert_series_equal(result, expected) @@ -1561,7 +1571,15 @@ def test_agg_cython_category_not_implemented_fallback(): df["col_cat"] = df["col_num"].astype("category") result = df.groupby("col_num").col_cat.first() - expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat") + + # ordered categorical dtype should definitely be preserved; + # this is unordered, so is less-clear case (if anything, it should raise) + expected = Series( + [1, 2, 3], + index=Index([1, 2, 3], name="col_num"), + name="col_cat", + dtype=df["col_cat"].dtype, + ) tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"}) @@ -1576,6 +1594,10 @@ def test_aggregate_categorical_lost_index(func: str): df = DataFrame({"A": [1997], "B": ds}) result = df.groupby("A").agg({"B": func}) expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + tm.assert_frame_equal(result, expected) @@ -1597,7 +1619,7 @@ def test_aggregate_categorical_with_isnan(): index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) expected = DataFrame( data={ - "numerical_col": [1.0, 0.0], + "numerical_col": [1, 0], "object_col": [0, 0], "categorical_col": [0, 0], }, @@ -1653,6 +1675,9 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) + # .transform(max) should preserve ordered categoricals + expected["last_status"] = expected["last_status"].astype(delivery_status_type) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 46985ff956788..3f43c34b6eb34 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ 
-22,20 +22,31 @@ @pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64], - ids=["np.int32", "np.int64", "np.float32", "np.float64"], + params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], ) -def numpy_dtypes_for_minmax(request): +def dtypes_for_minmax(request): """ - Fixture of numpy dtypes with min and max values used for testing + Fixture of dtypes with min and max values used for testing cummin and cummax """ dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + min_val = ( - np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min ) max_val = ( - np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max ) return (dtype, min_val, max_val) @@ -58,6 +69,37 @@ def test_max_min_non_numeric(): assert "ss" in result +def test_max_min_object_multiple_columns(using_array_manager): + # GH#41111 case where the aggregation is valid for some columns but not + # others; we split object blocks column-wise, consistent with + # DataFrame._reduce + + df = DataFrame( + { + "A": [1, 1, 2, 2, 3], + "B": [1, "foo", 2, "bar", False], + "C": ["a", "b", "c", "d", "e"], + } + ) + df._consolidate_inplace() # should already be consolidate, but double-check + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + gb = df.groupby("A") + + result = gb.max(numeric_only=False) + # "max" is valid for column "C" but not for "B" + ei = Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + result = gb.min(numeric_only=False) + # "min" is valid for column "C" but not for "B" + ei = Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + def test_min_date_with_nans(): # GH26321 dates = pd.to_datetime( @@ -397,7 +439,8 @@ def test_median_empty_bins(observed): result = df.groupby(bins, observed=observed).median() expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) + # TODO: GH 41137 + tm.assert_frame_equal(result, expected, check_dtype=False) @pytest.mark.parametrize( @@ -577,7 +620,7 @@ def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = getattr(df.groupby(labels), op)().astype(float) + result = getattr(df.groupby(labels), op)() expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) @@ -727,9 +770,9 @@ def test_numpy_compat(func): getattr(g, func)(foo=1) -def test_cummin(numpy_dtypes_for_minmax): - dtype = numpy_dtypes_for_minmax[0] - min_val = numpy_dtypes_for_minmax[1] +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -775,19 +818,24 @@ def test_cummin(numpy_dtypes_for_minmax): tm.assert_series_equal(result, expected) -def test_cummin_all_nan_column(): +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, 
dtype): base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") - expected = DataFrame({"B": [np.nan] * 8}) - result = base_df.groupby("A").cummin() + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + + result = getattr(grouped["B"], method)().to_frame() tm.assert_frame_equal(expected, result) -def test_cummax(numpy_dtypes_for_minmax): - dtype = numpy_dtypes_for_minmax[0] - max_val = numpy_dtypes_for_minmax[2] +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -831,14 +879,20 @@ def test_cummax(numpy_dtypes_for_minmax): tm.assert_series_equal(result, expected) -def test_cummax_all_nan_column(): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2 ** 53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") - expected = DataFrame({"B": [np.nan] * 8}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(expected, result) + result = grouped.transform(method) + expected = DataFrame({"b": data}, dtype=dtype) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -1024,6 +1078,7 @@ def test_describe_with_duplicate_output_column_names(as_index): "c": [10, 20, 30, 40, 50, 60], }, columns=["a", "b", "b"], + copy=False, ) expected = ( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c5620d6d8c06c..abfa2a23a4402 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -99,10 +99,7 @@ def max_value(group): applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = Series( - [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], - index=["A", "B", "C", "D", "value"], - ) + expected = df.dtypes tm.assert_series_equal(result, expected) @@ -302,10 +299,9 @@ def f(x): return float(len(x)) agged = grouped.agg(f) - expected = Series([4, 2], index=["bar", "foo"]) + expected = Series([4.0, 2.0], index=["bar", "foo"]) - tm.assert_series_equal(agged, expected, check_dtype=False) - assert issubclass(agged.dtype.type, np.dtype(dtype).type) + tm.assert_series_equal(agged, expected) def test_indices_concatenation_order(): @@ -2023,6 +2019,12 @@ def test_groupby_crash_on_nunique(axis): tm.assert_frame_equal(result, expected) + # same thing, but empty columns + gb = df[[]].groupby(axis=axis_number, level=0) + res = gb.nunique() + exp = expected[[]] + tm.assert_frame_equal(res, exp) + def test_groupby_list_level(): # GH 9790 diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index da88ea5f05107..e07c5f404a02a 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -578,3 +578,25 @@ def test_rank_pct_equal_values_on_group_transition(use_nan): expected = Series([1 / 3, 2 / 3, 1, 1], 
name="val") tm.assert_series_equal(result, expected) + + +def test_rank_multiindex(): + # GH27721 + df = concat( + { + "a": DataFrame({"col1": [1, 2], "col2": [3, 4]}), + "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), + }, + axis=1, + ) + + result = df.groupby(level=0, axis=1).rank(axis=1, ascending=False, method="first") + expected = concat( + { + "a": DataFrame({"col1": [2.0, 2.0], "col2": [1.0, 1.0]}), + "b": DataFrame({"col3": [2.0, 2.0], "col4": [1.0, 1.0]}), + }, + axis=1, + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 9350a3fcd3036..b22e4749bfdfc 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -242,7 +242,7 @@ def test_transform_bug(): # transforming on a datetime column df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)}) result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name="B") + expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64") tm.assert_series_equal(result, expected) @@ -493,7 +493,7 @@ def test_groupby_transform_with_int(): ) with np.errstate(all="ignore"): result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) - expected = DataFrame({"B": np.nan, "C": [-1, 0, 1, -1, 0, 1]}) + expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]}) tm.assert_frame_equal(result, expected) # int that needs float conversion @@ -509,9 +509,9 @@ def test_groupby_transform_with_int(): expected = DataFrame({"B": np.nan, "C": concat([s1, s2])}) tm.assert_frame_equal(result, expected) - # int downcasting + # int doesn't get downcasted result = df.groupby("A").transform(lambda x: x * 2 / 2) - expected = DataFrame({"B": 1, "C": [2, 3, 4, 10, 5, -1]}) + expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]}) tm.assert_frame_equal(result, expected) @@ -631,7 +631,7 @@ def test_groupby_cum_skipna(op, skipna, input, exp): tm.assert_series_equal(expected, result) -@pytest.mark.arm_slow +@pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", [ diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 678344f5b6909..dc6fb9910161c 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -17,13 +17,17 @@ class TestCategoricalIndex(Base): - _holder = CategoricalIndex + _index_cls = CategoricalIndex + + @pytest.fixture + def simple_index(self) -> CategoricalIndex: + return self._index_cls(list("aabbca"), categories=list("cab"), ordered=False) @pytest.fixture def index(self, request): return tm.makeCategoricalIndex(100) - def create_index(self, categories=None, ordered=False): + def create_index(self, *, categories=None, ordered=False): if categories is None: categories = list("cab") return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) @@ -33,9 +37,9 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True - def test_insert(self): + def test_insert(self, simple_index): - ci = self.create_index() + ci = simple_index categories = ci.categories # test 0th element @@ -70,9 +74,9 @@ def test_insert_na_mismatched_dtype(self): expected = Index([pd.NaT, 0, 1, 1], dtype=object) tm.assert_index_equal(result, expected) - def test_delete(self): + def test_delete(self, 
simple_index): - ci = self.create_index() + ci = simple_index categories = ci.categories result = ci.delete(0) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index ab2b2db7eec53..6139d8af48d98 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -31,7 +31,11 @@ class Base: """ base class for index sub-class tests """ - _holder: Type[Index] + _index_cls: Type[Index] + + @pytest.fixture + def simple_index(self) -> Index: + raise NotImplementedError("Method not implemented") def create_index(self) -> Index: raise NotImplementedError("Method not implemented") @@ -45,12 +49,12 @@ def test_pickle_compat_construction(self): r"__new__\(\) takes at least 2 arguments \(1 given\)" ) with pytest.raises(TypeError, match=msg): - self._holder() + self._index_cls() @pytest.mark.parametrize("name", [None, "new_name"]) - def test_to_frame(self, name): + def test_to_frame(self, name, simple_index): # see GH-15230, GH-22580 - idx = self.create_index() + idx = simple_index if name: idx_name = name @@ -67,10 +71,10 @@ def test_to_frame(self, name): df = idx.to_frame(index=False, name=idx_name) assert df.index is not idx - def test_shift(self): + def test_shift(self, simple_index): # GH8083 test the base class for shift - idx = self.create_index() + idx = simple_index msg = ( f"This method is only implemented for DatetimeIndex, PeriodIndex and " f"TimedeltaIndex; Got type {type(idx).__name__}" @@ -80,18 +84,18 @@ def test_shift(self): with pytest.raises(NotImplementedError, match=msg): idx.shift(1, 2) - def test_constructor_name_unhashable(self): + def test_constructor_name_unhashable(self, simple_index): # GH#29069 check that name is hashable # See also same-named test in tests.series.test_constructors - idx = self.create_index() + idx = simple_index with pytest.raises(TypeError, match="Index.name must be a hashable type"): type(idx)(idx, name=[]) - def test_create_index_existing_name(self): + def test_create_index_existing_name(self, simple_index): # GH11193, when an existing index is passed, and a new name is not # specified, the new index should inherit the previous object name - expected = self.create_index() + expected = simple_index if not isinstance(expected, MultiIndex): expected.name = "foo" result = Index(expected) @@ -140,9 +144,9 @@ def test_create_index_existing_name(self): ), ) - def test_numeric_compat(self): + def test_numeric_compat(self, simple_index): - idx = self.create_index() + idx = simple_index # Check that this doesn't cover MultiIndex case, if/when it does, # we can remove multi.test_compat.test_numeric_compat assert not isinstance(idx, MultiIndex) @@ -183,21 +187,21 @@ def test_numeric_compat(self): with pytest.raises(TypeError, match=floordiv_err): 1 // idx - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index with pytest.raises(TypeError, match="cannot perform all"): idx.all() with pytest.raises(TypeError, match="cannot perform any"): idx.any() - def test_repr_roundtrip(self): + def test_repr_roundtrip(self, simple_index): - idx = self.create_index() + idx = simple_index tm.assert_index_equal(eval(repr(idx)), idx) - def test_repr_max_seq_item_setting(self): + def test_repr_max_seq_item_setting(self, simple_index): # GH10182 - idx = self.create_index() + idx = simple_index idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) @@ -331,42 +335,42 @@ def test_numpy_argsort(self, index): with pytest.raises(ValueError, 
match=msg): np.argsort(index, order=("a", "b")) - def test_repeat(self): + def test_repeat(self, simple_index): rep = 2 - i = self.create_index() - expected = Index(i.values.repeat(rep), name=i.name) - tm.assert_index_equal(i.repeat(rep), expected) + idx = simple_index.copy() + expected = Index(idx.values.repeat(rep), name=idx.name) + tm.assert_index_equal(idx.repeat(rep), expected) - i = self.create_index() - rep = np.arange(len(i)) - expected = Index(i.values.repeat(rep), name=i.name) - tm.assert_index_equal(i.repeat(rep), expected) + idx = simple_index + rep = np.arange(len(idx)) + expected = Index(idx.values.repeat(rep), name=idx.name) + tm.assert_index_equal(idx.repeat(rep), expected) - def test_numpy_repeat(self): + def test_numpy_repeat(self, simple_index): rep = 2 - i = self.create_index() - expected = i.repeat(rep) - tm.assert_index_equal(np.repeat(i, rep), expected) + idx = simple_index + expected = idx.repeat(rep) + tm.assert_index_equal(np.repeat(idx, rep), expected) msg = "the 'axis' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.repeat(i, rep, axis=0) + np.repeat(idx, rep, axis=0) @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, klass): - i = self.create_index() - if isinstance(i, (DatetimeIndex, TimedeltaIndex)): + def test_where(self, klass, simple_index): + idx = simple_index + if isinstance(idx, (DatetimeIndex, TimedeltaIndex)): # where does not preserve freq - i = i._with_freq(None) + idx = idx._with_freq(None) - cond = [True] * len(i) - result = i.where(klass(cond)) - expected = i + cond = [True] * len(idx) + result = idx.where(klass(cond)) + expected = idx tm.assert_index_equal(result, expected) - cond = [False] + [True] * len(i[1:]) - expected = Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) - result = i.where(klass(cond)) + cond = [False] + [True] * len(idx[1:]) + expected = Index([idx._na_value] + idx[1:].tolist(), dtype=idx.dtype) + result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) def test_insert_base(self, index): @@ -424,9 +428,9 @@ def test_equals(self, index): # do not test MultiIndex assert not index.equals(Series(index)) - def test_equals_op(self): + def test_equals_op(self, simple_index): # GH9947, GH10637 - index_a = self.create_index() + index_a = simple_index n = len(index_a) index_b = index_a[0:-1] @@ -487,15 +491,15 @@ def test_equals_op(self): # For RangeIndex we can convert to Int64Index tm.assert_series_equal(series_a == item, Series(expected3)) - def test_format(self): + def test_format(self, simple_index): # GH35439 - idx = self.create_index() + idx = simple_index expected = [str(x) for x in idx] assert idx.format() == expected def test_format_empty(self): # GH35712 - empty_idx = self._holder([]) + empty_idx = self._index_cls([]) assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] @@ -588,29 +592,29 @@ def test_nulls(self, index): tm.assert_numpy_array_equal(index.isna(), result) tm.assert_numpy_array_equal(index.notna(), ~result) - def test_empty(self): + def test_empty(self, simple_index): # GH 15270 - index = self.create_index() - assert not index.empty - assert index[:0].empty + idx = simple_index + assert not idx.empty + assert idx[:0].empty - def test_join_self_unique(self, join_type): - index = self.create_index() - if index.is_unique: - joined = index.join(index, how=join_type) - assert (index == joined).all() + def test_join_self_unique(self, join_type, simple_index): + idx = simple_index + if idx.is_unique: + joined = 
idx.join(idx, how=join_type) + assert (idx == joined).all() - def test_map(self): + def test_map(self, simple_index): # callable - index = self.create_index() + idx = simple_index # we don't infer UInt64 - if isinstance(index, UInt64Index): - expected = index.astype("int64") + if isinstance(idx, UInt64Index): + expected = idx.astype("int64") else: - expected = index + expected = idx - result = index.map(lambda x: x) + result = idx.map(lambda x: x) # For RangeIndex we convert to Int64Index tm.assert_index_equal(result, expected) @@ -621,66 +625,66 @@ def test_map(self): lambda values, index: Series(values, index), ], ) - def test_map_dictlike(self, mapper): + def test_map_dictlike(self, mapper, simple_index): - index = self.create_index() - if isinstance(index, CategoricalIndex): - pytest.skip(f"skipping tests for {type(index)}") + idx = simple_index + if isinstance(idx, CategoricalIndex): + pytest.skip(f"skipping tests for {type(idx)}") - identity = mapper(index.values, index) + identity = mapper(idx.values, idx) # we don't infer to UInt64 for a dict - if isinstance(index, UInt64Index) and isinstance(identity, dict): - expected = index.astype("int64") + if isinstance(idx, UInt64Index) and isinstance(identity, dict): + expected = idx.astype("int64") else: - expected = index + expected = idx - result = index.map(identity) + result = idx.map(identity) # For RangeIndex we convert to Int64Index tm.assert_index_equal(result, expected) # empty mappable - expected = Index([np.nan] * len(index)) - result = index.map(mapper(expected, index)) + expected = Index([np.nan] * len(idx)) + result = idx.map(mapper(expected, idx)) tm.assert_index_equal(result, expected) - def test_map_str(self): + def test_map_str(self, simple_index): # GH 31202 - index = self.create_index() - result = index.map(str) - expected = Index([str(x) for x in index], dtype=object) + idx = simple_index + result = idx.map(str) + expected = Index([str(x) for x in idx], dtype=object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize("name", [None, "foo"]) @pytest.mark.parametrize("ordered", [True, False]) - def test_astype_category(self, copy, name, ordered): + def test_astype_category(self, copy, name, ordered, simple_index): # GH 18630 - index = self.create_index() + idx = simple_index if name: - index = index.rename(name) + idx = idx.rename(name) # standard categories dtype = CategoricalDtype(ordered=ordered) - result = index.astype(dtype, copy=copy) - expected = CategoricalIndex(index.values, name=name, ordered=ordered) + result = idx.astype(dtype, copy=copy) + expected = CategoricalIndex(idx.values, name=name, ordered=ordered) tm.assert_index_equal(result, expected) # non-standard categories - dtype = CategoricalDtype(index.unique().tolist()[:-1], ordered) - result = index.astype(dtype, copy=copy) - expected = CategoricalIndex(index.values, name=name, dtype=dtype) + dtype = CategoricalDtype(idx.unique().tolist()[:-1], ordered) + result = idx.astype(dtype, copy=copy) + expected = CategoricalIndex(idx.values, name=name, dtype=dtype) tm.assert_index_equal(result, expected) if ordered is False: # dtype='category' defaults to ordered=False, so only test once - result = index.astype("category", copy=copy) - expected = CategoricalIndex(index.values, name=name) + result = idx.astype("category", copy=copy) + expected = CategoricalIndex(idx.values, name=name) tm.assert_index_equal(result, expected) - def test_is_unique(self): + def test_is_unique(self, simple_index): # 
initialize a unique index - index = self.create_index().drop_duplicates() + index = simple_index.drop_duplicates() assert index.is_unique is True # empty index should be unique @@ -700,32 +704,32 @@ def test_is_unique(self): assert index_na_dup.is_unique is False @pytest.mark.arm_slow - def test_engine_reference_cycle(self): + def test_engine_reference_cycle(self, simple_index): # GH27585 - index = self.create_index() + index = simple_index nrefs_pre = len(gc.get_referrers(index)) index._engine assert len(gc.get_referrers(index)) == nrefs_pre - def test_getitem_2d_deprecated(self): + def test_getitem_2d_deprecated(self, simple_index): # GH#30588 - idx = self.create_index() + idx = simple_index with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = idx[:, None] assert isinstance(res, np.ndarray), type(res) - def test_copy_shares_cache(self): + def test_copy_shares_cache(self, simple_index): # GH32898, GH36840 - idx = self.create_index() + idx = simple_index idx.get_loc(idx[0]) # populates the _cache. copy = idx.copy() assert copy._cache is idx._cache - def test_shallow_copy_shares_cache(self): + def test_shallow_copy_shares_cache(self, simple_index): # GH32669, GH36840 - idx = self.create_index() + idx = simple_index idx.get_loc(idx[0]) # populates the _cache. shallow_copy = idx._view() diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 4c8ab27d2c824..a8f8406e24fef 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -9,33 +9,33 @@ class DatetimeLike(Base): - def test_argsort_matches_array(self): - rng = self.create_index() - rng = rng.insert(1, pd.NaT) + def test_argsort_matches_array(self, simple_index): + idx = simple_index + idx = idx.insert(1, pd.NaT) - result = rng.argsort() - expected = rng._data.argsort() + result = idx.argsort() + expected = idx._data.argsort() tm.assert_numpy_array_equal(result, expected) - def test_can_hold_identifiers(self): - idx = self.create_index() + def test_can_hold_identifiers(self, simple_index): + idx = simple_index key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False - def test_shift_identity(self): + def test_shift_identity(self, simple_index): - idx = self.create_index() + idx = simple_index tm.assert_index_equal(idx, idx.shift(0)) - def test_shift_empty(self): + def test_shift_empty(self, simple_index): # GH#14811 - idx = self.create_index()[:0] + idx = simple_index[:0] tm.assert_index_equal(idx, idx.shift(1)) - def test_str(self): + def test_str(self, simple_index): # test the string repr - idx = self.create_index() + idx = simple_index idx.name = "foo" assert not (f"length={len(idx)}" in str(idx)) assert "'foo'" in str(idx) @@ -47,19 +47,19 @@ def test_str(self): if hasattr(idx, "freq"): assert f"freq='{idx.freqstr}'" in str(idx) - def test_view(self): - i = self.create_index() + def test_view(self, simple_index): + idx = simple_index - i_view = i.view("i8") - result = self._holder(i) - tm.assert_index_equal(result, i) + idx_view = idx.view("i8") + result = self._index_cls(idx) + tm.assert_index_equal(result, idx) - i_view = i.view(self._holder) - result = self._holder(i) - tm.assert_index_equal(result, i_view) + idx_view = idx.view(self._index_cls) + result = self._index_cls(idx) + tm.assert_index_equal(result, idx_view) - def test_map_callable(self): - index = self.create_index() + def test_map_callable(self, simple_index): + index = simple_index expected = index + index.freq result = index.map(lambda x: x + x.freq) 
tm.assert_index_equal(result, expected) @@ -76,8 +76,8 @@ def test_map_callable(self): lambda values, index: pd.Series(values, index, dtype=object), ], ) - def test_map_dictlike(self, mapper): - index = self.create_index() + def test_map_dictlike(self, mapper, simple_index): + index = simple_index expected = index + index.freq # don't compare the freqs @@ -97,15 +97,15 @@ def test_map_dictlike(self, mapper): result = index.map(mapper([], [])) tm.assert_index_equal(result, expected) - def test_getitem_preserves_freq(self): - index = self.create_index() + def test_getitem_preserves_freq(self, simple_index): + index = simple_index assert index.freq is not None result = index[:] assert result.freq == index.freq - def test_where_cast_str(self): - index = self.create_index() + def test_where_cast_str(self, simple_index): + index = simple_index mask = np.ones(len(index), dtype=bool) mask[-1] = False diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ed9a5054986cb..935e6afec246e 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -146,6 +146,7 @@ def test_date_range_int64_overflow_non_recoverable(self): with pytest.raises(OutOfBoundsDatetime, match=msg): date_range(end="1969-11-14", periods=106752 * 24, freq="H") + @pytest.mark.slow def test_date_range_int64_overflow_stride_endpoint_different_signs(self): # cases where stride * periods overflow int64 and stride/endpoint # have different signs diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 94303359958b3..0a387fe3141e4 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -10,7 +10,11 @@ class TestDatetimeIndex(DatetimeLike): - _holder = DatetimeIndex + _index_cls = DatetimeIndex + + @pytest.fixture + def simple_index(self) -> DatetimeIndex: + return date_range("20130101", periods=5) @pytest.fixture( params=[tm.makeDateIndex(10), date_range("20130110", periods=10, freq="-1D")], @@ -19,12 +23,9 @@ class TestDatetimeIndex(DatetimeLike): def index(self, request): return request.param - def create_index(self) -> DatetimeIndex: - return date_range("20130101", periods=5) - - def test_format(self): + def test_format(self, simple_index): # GH35439 - idx = self.create_index() + idx = simple_index expected = [f"{x:%Y-%m-%d}" for x in idx] assert idx.format() == expected diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 8bf418a2fc731..b14db459f996d 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -16,13 +16,17 @@ class TestBase(Base): in test_interval.py or the specific test file (e.g. 
test_astype.py) """ - _holder = IntervalIndex + _index_cls = IntervalIndex + + @pytest.fixture + def simple_index(self) -> IntervalIndex: + return self._index_cls.from_breaks(range(11), closed="right") @pytest.fixture def index(self): return tm.makeIntervalIndex(10) - def create_index(self, closed="right"): + def create_index(self, *, closed="right"): return IntervalIndex.from_breaks(range(11), closed=closed) def test_repr_max_seq_item_setting(self): @@ -44,8 +48,8 @@ def test_take(self, closed): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, closed, klass): - idx = self.create_index(closed=closed) + def test_where(self, simple_index, klass): + idx = simple_index cond = [True] * len(idx) expected = idx result = expected.where(klass(cond)) @@ -56,9 +60,9 @@ def test_where(self, closed, klass): result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_getitem_2d_deprecated(self): + def test_getitem_2d_deprecated(self, simple_index): # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = self.create_index() + idx = simple_index with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): idx[:, None] diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index ff0c2a0d67885..0e812f2d4590c 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -122,7 +122,7 @@ def test_consistency(): assert index.is_unique is False -@pytest.mark.arm_slow +@pytest.mark.slow def test_hash_collisions(): # non-smoke test that we don't get hash collisions diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 032b376f6d6a9..b80e92b105dbd 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -19,7 +19,11 @@ class TestPeriodIndex(DatetimeLike): - _holder = PeriodIndex + _index_cls = PeriodIndex + + @pytest.fixture + def simple_index(self) -> Index: + return period_range("20130101", periods=5, freq="D") @pytest.fixture( params=[ @@ -31,9 +35,6 @@ class TestPeriodIndex(DatetimeLike): def index(self, request): return request.param - def create_index(self) -> PeriodIndex: - return period_range("20130101", periods=5, freq="D") - def test_pickle_compat_construction(self): pass @@ -357,7 +358,7 @@ def test_map(self): def test_format_empty(self): # GH35712 - empty_idx = self._holder([], freq="A") + empty_idx = self._index_cls([], freq="A") assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index 9539b0ff7cdba..e306b6e67cf7f 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -119,7 +119,9 @@ def test_constructor_range(self): expected = RangeIndex(1, 5, 2) tm.assert_index_equal(result, expected, exact=True) - msg = r"^from_range\(\) got an unexpected keyword argument" + msg = ( + r"(RangeIndex.)?from_range\(\) got an unexpected keyword argument( 'copy')?" 
+ ) with pytest.raises(TypeError, match=msg): RangeIndex.from_range(range(10), copy=True) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 3d36e03751f95..f7313f100d429 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -21,7 +21,11 @@ class TestRangeIndex(Numeric): - _holder = RangeIndex + _index_cls = RangeIndex + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(start=0, stop=20, step=2) @pytest.fixture( params=[ @@ -33,16 +37,13 @@ class TestRangeIndex(Numeric): def index(self, request): return request.param - def create_index(self) -> RangeIndex: - return RangeIndex(start=0, stop=20, step=2) - - def test_can_hold_identifiers(self): - idx = self.create_index() + def test_can_hold_identifiers(self, simple_index): + idx = simple_index key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False - def test_too_many_names(self): - index = self.create_index() + def test_too_many_names(self, simple_index): + index = simple_index with pytest.raises(ValueError, match="^Length"): index.names = ["roger", "harold"] @@ -62,9 +63,9 @@ def test_start_stop_step_attrs(self, index, start, stop, step): assert index.step == step @pytest.mark.parametrize("attr_name", ["_start", "_stop", "_step"]) - def test_deprecated_start_stop_step_attrs(self, attr_name): + def test_deprecated_start_stop_step_attrs(self, attr_name, simple_index): # GH 26581 - idx = self.create_index() + idx = simple_index with tm.assert_produces_warning(FutureWarning): getattr(idx, attr_name) @@ -140,8 +141,8 @@ def test_view(self): i_view = i.view(RangeIndex) tm.assert_index_equal(i, i_view) - def test_dtype(self): - index = self.create_index() + def test_dtype(self, simple_index): + index = simple_index assert index.dtype == np.int64 def test_cache(self): @@ -253,13 +254,13 @@ def test_equals_range(self): assert left.equals(right) assert right.equals(left) - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index assert idx.all() == idx.values.all() assert idx.any() == idx.values.any() - def test_identical(self): - index = self.create_index() + def test_identical(self, simple_index): + index = simple_index i = Index(index.copy()) assert i.identical(index) @@ -304,17 +305,17 @@ def test_cant_or_shouldnt_cast(self, start, stop, step): with pytest.raises(TypeError, match=msg): RangeIndex(start, stop, step) - def test_view_index(self): - index = self.create_index() + def test_view_index(self, simple_index): + index = simple_index index.view(Index) - def test_prevent_casting(self): - index = self.create_index() + def test_prevent_casting(self, simple_index): + index = simple_index result = index.astype("O") assert result.dtype == np.object_ - def test_repr_roundtrip(self): - index = self.create_index() + def test_repr_roundtrip(self, simple_index): + index = simple_index tm.assert_index_equal(eval(repr(index)), index) def test_slice_keep_name(self): @@ -325,8 +326,8 @@ def test_has_duplicates(self, index): assert index.is_unique assert not index.has_duplicates - def test_extended_gcd(self): - index = self.create_index() + def test_extended_gcd(self, simple_index): + index = simple_index result = index._extended_gcd(6, 10) assert result[0] == result[1] * 6 + result[2] * 10 assert 2 == result[0] @@ -375,8 +376,8 @@ def test_pickle_compat_construction(self): # RangeIndex() is a valid constructor pass - def 
test_slice_specialised(self): - index = self.create_index() + def test_slice_specialised(self, simple_index): + index = simple_index index.name = "foo" # scalar indexing @@ -506,7 +507,7 @@ def test_engineless_lookup(self): def test_format_empty(self): # GH35712 - empty_idx = self._holder(0) + empty_idx = self._index_cls(0) assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] @@ -524,3 +525,11 @@ def test_append_len_one(self, RI): # GH39401 result = RI.append([]) tm.assert_index_equal(result, RI, exact=True) + + @pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])]) + def test_isin_range(self, base): + # GH#41151 + values = RangeIndex(0, 1) + result = base.isin(values) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1e9348dc410d7..b5822b768fdde 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -46,13 +46,14 @@ class TestIndex(Base): - _holder = Index + _index_cls = Index - def create_index(self) -> Index: - return Index(list("abcde")) + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(list("abcde")) - def test_can_hold_identifiers(self): - index = self.create_index() + def test_can_hold_identifiers(self, simple_index): + index = simple_index key = index[0] assert index._can_hold_identifiers_and_holds_name(key) is True @@ -77,8 +78,6 @@ def test_constructor_casting(self, index): @pytest.mark.parametrize("index", ["string"], indirect=True) def test_constructor_copy(self, index): - # copy - # index = self.create_index() arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) @@ -600,8 +599,8 @@ def test_booleanindex(self, index): for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i - def test_fancy(self): - index = self.create_index() + def test_fancy(self, simple_index): + index = simple_index sl = index[[1, 2, 3]] for i in sl: assert i == sl[sl.get_loc(i)] @@ -628,9 +627,9 @@ def test_empty_fancy_raises(self, index): with pytest.raises(IndexError, match=msg): index[empty_farr] - def test_union_dt_as_obj(self, sort): + def test_union_dt_as_obj(self, sort, simple_index): # TODO: Replace with fixturesult - index = self.create_index() + index = simple_index date_index = date_range("2019-01-01", periods=10) first_cat = index.union(date_index) second_cat = index.union(index) @@ -754,9 +753,9 @@ def test_append_empty_preserve_name(self, name, expected): result = left.append(right) assert result.name == expected - def test_is_mixed_deprecated(self): + def test_is_mixed_deprecated(self, simple_index): # GH#32922 - index = self.create_index() + index = simple_index with tm.assert_produces_warning(FutureWarning): index.is_mixed() @@ -866,8 +865,8 @@ def test_format_datetime_with_time(self): assert result == expected @pytest.mark.parametrize("op", ["any", "all"]) - def test_logical_compat(self, op): - index = self.create_index() + def test_logical_compat(self, op, simple_index): + index = simple_index assert getattr(index, op)() == getattr(index.values, op)() @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @@ -973,9 +972,9 @@ def test_is_monotonic_incomparable(self, attr): index = Index([5, datetime.now(), 7]) assert not getattr(index, attr) - def test_set_value_deprecated(self): + def test_set_value_deprecated(self, simple_index): # GH 28621 - idx = self.create_index() + idx = 
simple_index arr = np.array([1, 2, 3]) with tm.assert_produces_warning(FutureWarning): idx.set_value(arr, idx[1], 80) @@ -1415,29 +1414,30 @@ class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ # in py2 and py3 because ints and strings are uncomparable in py3 # (GH 13514) - _holder = Index + _index_cls = Index + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls([0, "a", 1, "b", 2, "c"]) @pytest.fixture(params=[[0, "a", 1, "b", 2, "c"]], ids=["mixedIndex"]) def index(self, request): return Index(request.param) - def create_index(self) -> Index: - return Index([0, "a", 1, "b", 2, "c"]) - - def test_argsort(self): - index = self.create_index() + def test_argsort(self, simple_index): + index = simple_index with pytest.raises(TypeError, match="'>|<' not supported"): index.argsort() - def test_numpy_argsort(self): - index = self.create_index() + def test_numpy_argsort(self, simple_index): + index = simple_index with pytest.raises(TypeError, match="'>|<' not supported"): np.argsort(index) - def test_copy_name(self): + def test_copy_name(self, simple_index): # Check that "name" argument passed at initialization is honoured # GH12309 - index = self.create_index() + index = simple_index first = type(index)(index, copy=True, name="mario") second = type(first)(first, copy=False) @@ -1482,8 +1482,8 @@ def test_unique_na(self): result = idx.unique() tm.assert_index_equal(result, expected) - def test_logical_compat(self): - index = self.create_index() + def test_logical_compat(self, simple_index): + index = simple_index assert index.all() == index.values.all() assert index.any() == index.values.any() diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py index 52af29d999fcc..9f41c68909f6e 100644 --- a/pandas/tests/indexes/test_engines.py +++ b/pandas/tests/indexes/test_engines.py @@ -61,7 +61,13 @@ class TestTimedeltaEngine: @pytest.mark.parametrize( "scalar", [ - pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), + # error: Argument 1 to "Timestamp" has incompatible type "timedelta64"; + # expected "Union[integer[Any], float, str, date, datetime64]" + pd.Timestamp( + pd.Timedelta(days=42).asm8.view( + "datetime64[ns]" + ) # type: ignore[arg-type] + ), pd.Timedelta(days=42).value, pd.Timedelta(days=42).to_pytimedelta(), pd.Timedelta(days=42).to_timedelta64(), diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 99dadfba4e7aa..c5dc84dac0fd2 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -86,14 +86,14 @@ def test_where(self): # Tested in numeric.test_indexing pass - def test_can_hold_identifiers(self): - idx = self.create_index() + def test_can_hold_identifiers(self, simple_index): + idx = simple_index key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False - def test_format(self): + def test_format(self, simple_index): # GH35439 - idx = self.create_index() + idx = simple_index max_width = max(len(str(x)) for x in idx) expected = [str(x).ljust(max_width) for x in idx] assert idx.format() == expected @@ -101,9 +101,9 @@ def test_format(self): def test_numeric_compat(self): pass # override Base method - def test_insert_na(self, nulls_fixture): + def test_insert_na(self, nulls_fixture, simple_index): # GH 18295 (test missing) - index = self.create_index() + index = simple_index na_val = nulls_fixture if na_val is pd.NaT: @@ -116,7 +116,13 @@ def test_insert_na(self, nulls_fixture): 
class TestFloat64Index(Numeric): - _holder = Float64Index + _index_cls = Float64Index + _dtype = np.float64 + + @pytest.fixture + def simple_index(self) -> Index: + values = np.arange(5, dtype=self._dtype) + return self._index_cls(values) @pytest.fixture( params=[ @@ -128,63 +134,73 @@ class TestFloat64Index(Numeric): ids=["mixed", "float", "mixed_dec", "float_dec"], ) def index(self, request): - return Float64Index(request.param) + return self._index_cls(request.param) @pytest.fixture def mixed_index(self): - return Float64Index([1.5, 2, 3, 4, 5]) + return self._index_cls([1.5, 2, 3, 4, 5]) @pytest.fixture def float_index(self): - return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) - - def create_index(self) -> Float64Index: - return Float64Index(np.arange(5, dtype="float64")) + return self._index_cls([0.0, 2.5, 5.0, 7.5, 10.0]) def test_repr_roundtrip(self, index): tm.assert_index_equal(eval(repr(index)), index) - def check_is_index(self, i): - assert isinstance(i, Index) - assert not isinstance(i, Float64Index) + def check_is_index(self, idx): + assert isinstance(idx, Index) + assert not isinstance(idx, self._index_cls) def check_coerce(self, a, b, is_float_index=True): assert a.equals(b) tm.assert_index_equal(a, b, exact=False) if is_float_index: - assert isinstance(b, Float64Index) + assert isinstance(b, self._index_cls) else: self.check_is_index(b) def test_constructor(self): + index_cls = self._index_cls + dtype = self._dtype # explicit construction - index = Float64Index([1, 2, 3, 4, 5]) - assert isinstance(index, Float64Index) - expected = np.array([1, 2, 3, 4, 5], dtype="float64") + index = index_cls([1, 2, 3, 4, 5]) + + assert isinstance(index, index_cls) + assert index.dtype.type is dtype + + expected = np.array([1, 2, 3, 4, 5], dtype=dtype) tm.assert_numpy_array_equal(index.values, expected) - index = Float64Index(np.array([1, 2, 3, 4, 5])) - assert isinstance(index, Float64Index) - index = Float64Index([1.0, 2, 3, 4, 5]) - assert isinstance(index, Float64Index) - index = Float64Index(np.array([1.0, 2, 3, 4, 5])) - assert isinstance(index, Float64Index) - assert index.dtype == float - - index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) - assert isinstance(index, Float64Index) - assert index.dtype == np.float64 - - index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) - assert isinstance(index, Float64Index) - assert index.dtype == np.float64 + index = index_cls(np.array([1, 2, 3, 4, 5])) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls([1.0, 2, 3, 4, 5]) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5])) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1, 2, 3, 4, 5]), dtype=np.float32) + assert isinstance(index, index_cls) + assert index.dtype == dtype # nan handling - result = Float64Index([np.nan, np.nan]) + result = index_cls([np.nan, np.nan]) assert pd.isna(result.values).all() - result = Float64Index(np.array([np.nan])) + + result = index_cls(np.array([np.nan])) assert pd.isna(result.values).all() + result = Index(np.array([np.nan])) + assert isinstance(result, index_cls) + assert result.dtype == dtype assert pd.isna(result.values).all() @pytest.mark.parametrize( @@ -205,14 +221,16 @@ def test_invalid_dtype(self, index, dtype): index([1, 2, 3], 
dtype=dtype) def test_constructor_invalid(self): + index_cls = self._index_cls + cls_name = index_cls.__name__ # invalid msg = ( - r"Float64Index\(\.\.\.\) must be called with a collection of " + rf"{cls_name}\(\.\.\.\) must be called with a collection of " r"some kind, 0\.0 was passed" ) with pytest.raises(TypeError, match=msg): - Float64Index(0.0) + index_cls(0.0) # 2021-02-1 we get ValueError in numpy 1.20, but not on all builds msg = "|".join( @@ -222,11 +240,13 @@ def test_constructor_invalid(self): ] ) with pytest.raises((TypeError, ValueError), match=msg): - Float64Index(["a", "b", 0.0]) + index_cls(["a", "b", 0.0]) - msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" + msg = ( + r"float\(\) argument must be a string or a( real)? number, not 'Timestamp'" + ) with pytest.raises(TypeError, match=msg): - Float64Index([Timestamp("20130101")]) + index_cls([Timestamp("20130101")]) def test_constructor_coerce(self, mixed_index, float_index): @@ -255,24 +275,25 @@ def test_type_coercion_fail(self, any_int_dtype): def test_type_coercion_valid(self, float_dtype): # There is no Float32Index, so we always # generate Float64Index. - i = Index([1, 2, 3.5], dtype=float_dtype) - tm.assert_index_equal(i, Index([1, 2, 3.5])) + idx = Index([1, 2, 3.5], dtype=float_dtype) + tm.assert_index_equal(idx, Index([1, 2, 3.5])) def test_equals_numeric(self): + index_cls = self._index_cls - i = Float64Index([1.0, 2.0]) - assert i.equals(i) - assert i.identical(i) + idx = index_cls([1.0, 2.0]) + assert idx.equals(idx) + assert idx.identical(idx) - i2 = Float64Index([1.0, 2.0]) - assert i.equals(i2) + idx2 = index_cls([1.0, 2.0]) + assert idx.equals(idx2) - i = Float64Index([1.0, np.nan]) - assert i.equals(i) - assert i.identical(i) + idx = index_cls([1.0, np.nan]) + assert idx.equals(idx) + assert idx.identical(idx) - i2 = Float64Index([1.0, np.nan]) - assert i.equals(i2) + idx2 = index_cls([1.0, np.nan]) + assert idx.equals(idx2) @pytest.mark.parametrize( "other", @@ -283,9 +304,9 @@ def test_equals_numeric(self): ), ) def test_equals_numeric_other_index_type(self, other): - i = Float64Index([1.0, 2.0]) - assert i.equals(other) - assert other.equals(i) + idx = self._index_cls([1.0, 2.0]) + assert idx.equals(other) + assert other.equals(idx) @pytest.mark.parametrize( "vals", @@ -295,10 +316,12 @@ def test_equals_numeric_other_index_type(self, other): ], ) def test_lookups_datetimelike_values(self, vals): + dtype = self._dtype + # If we have datetime64 or timedelta64 values, make sure they are # wrappped correctly GH#31163 ser = Series(vals, index=range(3, 6)) - ser.index = ser.index.astype("float64") + ser.index = ser.index.astype(dtype) expected = vals[1] @@ -332,19 +355,21 @@ def test_lookups_datetimelike_values(self, vals): assert isinstance(result, type(expected)) and result == expected def test_doesnt_contain_all_the_things(self): - i = Float64Index([np.nan]) - assert not i.isin([0]).item() - assert not i.isin([1]).item() - assert i.isin([np.nan]).item() + idx = self._index_cls([np.nan]) + assert not idx.isin([0]).item() + assert not idx.isin([1]).item() + assert idx.isin([np.nan]).item() def test_nan_multiple_containment(self): - i = Float64Index([1.0, np.nan]) - tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False])) - tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False])) - tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True])) - tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True])) - i = Float64Index([1.0, 2.0]) 
- tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) + index_cls = self._index_cls + + idx = index_cls([1.0, np.nan]) + tm.assert_numpy_array_equal(idx.isin([1.0]), np.array([True, False])) + tm.assert_numpy_array_equal(idx.isin([2.0, np.pi]), np.array([False, False])) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, True])) + tm.assert_numpy_array_equal(idx.isin([1.0, np.nan]), np.array([True, True])) + idx = index_cls([1.0, 2.0]) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, False])) def test_fillna_float64(self): # GH 11343 @@ -354,7 +379,7 @@ def test_fillna_float64(self): tm.assert_index_equal(idx.fillna(0.1), exp) # downcast - exp = Float64Index([1.0, 2.0, 3.0], name="x") + exp = self._index_cls([1.0, 2.0, 3.0], name="x") tm.assert_index_equal(idx.fillna(2), exp) # object @@ -364,30 +389,34 @@ def test_fillna_float64(self): class NumericInt(Numeric): def test_view(self): - i = self._holder([], name="Foo") - i_view = i.view() - assert i_view.name == "Foo" + index_cls = self._index_cls - i_view = i.view(self._dtype) - tm.assert_index_equal(i, self._holder(i_view, name="Foo")) + idx = index_cls([], name="Foo") + idx_view = idx.view() + assert idx_view.name == "Foo" - i_view = i.view(self._holder) - tm.assert_index_equal(i, self._holder(i_view, name="Foo")) + idx_view = idx.view(self._dtype) + tm.assert_index_equal(idx, index_cls(idx_view, name="Foo")) + + idx_view = idx.view(index_cls) + tm.assert_index_equal(idx, index_cls(idx_view, name="Foo")) def test_is_monotonic(self): - index = self._holder([1, 2, 3, 4]) + index_cls = self._index_cls + + index = index_cls([1, 2, 3, 4]) assert index.is_monotonic is True assert index.is_monotonic_increasing is True assert index._is_strictly_monotonic_increasing is True assert index.is_monotonic_decreasing is False assert index._is_strictly_monotonic_decreasing is False - index = self._holder([4, 3, 2, 1]) + index = index_cls([4, 3, 2, 1]) assert index.is_monotonic is False assert index._is_strictly_monotonic_increasing is False assert index._is_strictly_monotonic_decreasing is True - index = self._holder([1]) + index = index_cls([1]) assert index.is_monotonic is True assert index.is_monotonic_increasing is True assert index.is_monotonic_decreasing is True @@ -395,40 +424,43 @@ def test_is_monotonic(self): assert index._is_strictly_monotonic_decreasing is True def test_is_strictly_monotonic(self): - index = self._holder([1, 1, 2, 3]) + index_cls = self._index_cls + + index = index_cls([1, 1, 2, 3]) assert index.is_monotonic_increasing is True assert index._is_strictly_monotonic_increasing is False - index = self._holder([3, 2, 1, 1]) + index = index_cls([3, 2, 1, 1]) assert index.is_monotonic_decreasing is True assert index._is_strictly_monotonic_decreasing is False - index = self._holder([1, 1]) + index = index_cls([1, 1]) assert index.is_monotonic_increasing assert index.is_monotonic_decreasing assert not index._is_strictly_monotonic_increasing assert not index._is_strictly_monotonic_decreasing - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index assert idx.all() == idx.values.all() assert idx.any() == idx.values.any() - def test_identical(self): - index = self.create_index() - i = Index(index.copy()) - assert i.identical(index) + def test_identical(self, simple_index): + index = simple_index + + idx = Index(index.copy()) + assert idx.identical(index) - same_values_different_type = Index(i, dtype=object) - assert 
not i.identical(same_values_different_type) + same_values_different_type = Index(idx, dtype=object) + assert not idx.identical(same_values_different_type) - i = index.astype(dtype=object) - i = i.rename("foo") - same_values = Index(i, dtype=object) - assert same_values.identical(i) + idx = index.astype(dtype=object) + idx = idx.rename("foo") + same_values = Index(idx, dtype=object) + assert same_values.identical(idx) - assert not i.identical(index) - assert Index(same_values, name="foo", dtype=object).identical(i) + assert not idx.identical(index) + assert Index(same_values, name="foo", dtype=object).identical(idx) assert not index.astype(dtype=object).identical(index.astype(dtype=self._dtype)) @@ -440,58 +472,61 @@ def test_cant_or_shouldnt_cast(self): # can't data = ["foo", "bar", "baz"] with pytest.raises(TypeError, match=msg): - self._holder(data) + self._index_cls(data) # shouldn't data = ["0", "1", "2"] with pytest.raises(TypeError, match=msg): - self._holder(data) + self._index_cls(data) - def test_view_index(self): - index = self.create_index() + def test_view_index(self, simple_index): + index = simple_index index.view(Index) - def test_prevent_casting(self): - index = self.create_index() + def test_prevent_casting(self, simple_index): + index = simple_index result = index.astype("O") assert result.dtype == np.object_ class TestInt64Index(NumericInt): - _dtype = "int64" - _holder = Int64Index + _index_cls = Int64Index + _dtype = np.int64 + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(range(0, 20, 2), dtype=self._dtype) @pytest.fixture( params=[range(0, 20, 2), range(19, -1, -1)], ids=["index_inc", "index_dec"] ) def index(self, request): - return Int64Index(request.param) - - def create_index(self) -> Int64Index: - # return Int64Index(np.arange(5, dtype="int64")) - return Int64Index(range(0, 20, 2)) + return self._index_cls(request.param) def test_constructor(self): + index_cls = self._index_cls + dtype = self._dtype + # pass list, coerce fine - index = Int64Index([-5, 0, 1, 2]) - expected = Index([-5, 0, 1, 2], dtype=np.int64) + index = index_cls([-5, 0, 1, 2]) + expected = Index([-5, 0, 1, 2], dtype=dtype) tm.assert_index_equal(index, expected) # from iterable - index = Int64Index(iter([-5, 0, 1, 2])) + index = index_cls(iter([-5, 0, 1, 2])) tm.assert_index_equal(index, expected) # scalar raise Exception msg = ( - r"Int64Index\(\.\.\.\) must be called with a collection of some " + rf"{index_cls.__name__}\(\.\.\.\) must be called with a collection of some " "kind, 5 was passed" ) with pytest.raises(TypeError, match=msg): - Int64Index(5) + index_cls(5) # copy arr = index.values - new_index = Int64Index(arr, copy=True) + new_index = index_cls(arr, copy=True) tm.assert_index_equal(new_index, index) val = arr[0] + 3000 @@ -500,29 +535,32 @@ def test_constructor(self): assert new_index[0] != val # interpret list-like - expected = Int64Index([5, 0]) - for cls in [Index, Int64Index]: + expected = index_cls([5, 0]) + for cls in [Index, index_cls]: for idx in [ - cls([5, 0], dtype="int64"), - cls(np.array([5, 0]), dtype="int64"), - cls(Series([5, 0]), dtype="int64"), + cls([5, 0], dtype=dtype), + cls(np.array([5, 0]), dtype=dtype), + cls(Series([5, 0]), dtype=dtype), ]: tm.assert_index_equal(idx, expected) def test_constructor_corner(self): + index_cls = self._index_cls + dtype = self._dtype + arr = np.array([1, 2, 3, 4], dtype=object) - index = Int64Index(arr) - assert index.values.dtype == np.int64 + index = index_cls(arr) + assert index.values.dtype == 
dtype tm.assert_index_equal(index, Index(arr)) # preventing casting arr = np.array([1, "2", 3, "4"], dtype=object) with pytest.raises(TypeError, match="casting"): - Int64Index(arr) + index_cls(arr) arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] with pytest.raises(TypeError, match="casting"): - Int64Index(arr_with_floats) + index_cls(arr_with_floats) def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): @@ -534,14 +572,14 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): def test_constructor_unwraps_index(self): idx = Index([1, 2]) - result = Int64Index(idx) - expected = np.array([1, 2], dtype="int64") + result = self._index_cls(idx) + expected = np.array([1, 2], dtype=self._dtype) tm.assert_numpy_array_equal(result._data, expected) def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) - assert isinstance(arr, Int64Index) + assert isinstance(arr, self._index_cls) # but not if explicit dtype passed arr = Index([1, 2, 3, 4], dtype=object) @@ -550,8 +588,13 @@ def test_coerce_list(self): class TestUInt64Index(NumericInt): - _dtype = "uint64" - _holder = UInt64Index + _index_cls = UInt64Index + _dtype = np.uint64 + + @pytest.fixture + def simple_index(self) -> Index: + # compat with shared Int64/Float64 tests + return self._index_cls(np.arange(5, dtype=self._dtype)) @pytest.fixture( params=[ @@ -561,22 +604,21 @@ class TestUInt64Index(NumericInt): ids=["index_inc", "index_dec"], ) def index(self, request): - return UInt64Index(request.param) - - def create_index(self) -> UInt64Index: - # compat with shared Int64/Float64 tests - return UInt64Index(np.arange(5, dtype="uint64")) + return self._index_cls(request.param) def test_constructor(self): - idx = UInt64Index([1, 2, 3]) - res = Index([1, 2, 3], dtype=np.uint64) + index_cls = self._index_cls + dtype = self._dtype + + idx = index_cls([1, 2, 3]) + res = Index([1, 2, 3], dtype=dtype) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2 ** 63]) - res = Index([1, 2 ** 63], dtype=np.uint64) + idx = index_cls([1, 2 ** 63]) + res = Index([1, 2 ** 63], dtype=dtype) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2 ** 63]) + idx = index_cls([1, 2 ** 63]) res = Index([1, 2 ** 63]) tm.assert_index_equal(res, idx) @@ -585,8 +627,8 @@ def test_constructor(self): tm.assert_index_equal(res, idx) # https://github.com/pandas-dev/pandas/issues/29526 - idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) - res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + idx = index_cls([1, 2 ** 63 + 1], dtype=dtype) + res = Index([1, 2 ** 63 + 1], dtype=dtype) tm.assert_index_equal(res, idx) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 3555d043659cf..62c07f4306a96 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -17,7 +17,6 @@ Index, Int64Index, MultiIndex, - RangeIndex, Series, TimedeltaIndex, Timestamp, @@ -29,12 +28,10 @@ pandas_dtype, ) -COMPATIBLE_INCONSISTENT_PAIRS = { - (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), - (Float64Index, Int64Index): (tm.makeFloatIndex, tm.makeIntIndex), - (Float64Index, RangeIndex): (tm.makeFloatIndex, tm.makeIntIndex), - (Float64Index, UInt64Index): (tm.makeFloatIndex, tm.makeUIntIndex), -} +COMPATIBLE_INCONSISTENT_PAIRS = [ + (np.float64, np.int64), + (np.float64, np.uint64), +] def test_union_same_types(index): @@ -51,7 +48,7 @@ def test_union_different_types(index_flat, index_flat2): idx1 = index_flat idx2 = index_flat2 - type_pair = 
tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) + type_pair = tuple(sorted([idx1.dtype.type, idx2.dtype.type], key=lambda x: str(x))) # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index @@ -80,7 +77,15 @@ def test_union_different_types(index_flat, index_flat2): raise NotImplementedError -@pytest.mark.parametrize("idx_fact1,idx_fact2", COMPATIBLE_INCONSISTENT_PAIRS.values()) +@pytest.mark.parametrize( + "idx_fact1,idx_fact2", + [ + (tm.makeIntIndex, tm.makeRangeIndex), + (tm.makeFloatIndex, tm.makeIntIndex), + (tm.makeFloatIndex, tm.makeRangeIndex), + (tm.makeFloatIndex, tm.makeUIntIndex), + ], +) def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): # GH 23525 idx1 = idx_fact1(10) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index d0f4828e8c7bd..478697ed1a5be 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -20,19 +20,20 @@ class TestTimedeltaIndex(DatetimeLike): - _holder = TimedeltaIndex + _index_cls = TimedeltaIndex @pytest.fixture - def index(self): - return tm.makeTimedeltaIndex(10) - - def create_index(self) -> TimedeltaIndex: + def simple_index(self) -> TimedeltaIndex: index = pd.to_timedelta(range(5), unit="d")._with_freq("infer") assert index.freq == "D" ret = index + pd.offsets.Hour(1) assert ret.freq == "D" return ret + @pytest.fixture + def index(self): + return tm.makeTimedeltaIndex(10) + def test_numeric_compat(self): # Dummy method to override super's version; this test is now done # in test_arithmetic.py diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index eb13ec1f366af..395e9297a8fde 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -71,7 +71,7 @@ def test_getitem_non_matching(self, series_with_interval_index, indexer_sl): with pytest.raises(KeyError, match=r"^\[-1\]$"): indexer_sl(ser)[[-1, 3]] - @pytest.mark.arm_slow + @pytest.mark.slow def test_loc_getitem_large_series(self): ser = Series( np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 1db354a7f30b5..6ccd44e698a8a 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -56,7 +56,7 @@ def test_cache_updating(): assert result == 2 -@pytest.mark.arm_slow +@pytest.mark.slow def test_indexer_caching(): # GH5727 # make sure that indexers are in the _internal_names_set diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 7642f78076dcb..6f4949267c00c 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -258,6 +258,15 @@ def test_setitem_series_timedelta64(self, val, exp_dtype): ) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + def test_setitem_series_no_coercion_from_values_list(self): + # GH35865 - int casted to str when internally calling np.array(ser.values) + ser = pd.Series(["a", 1]) + ser[:] = list(ser.values) + + expected = pd.Series(["a", 1]) + + tm.assert_series_equal(ser, expected) + def _assert_setitem_index_conversion( self, original_series, loc_key, expected_index, expected_dtype ): @@ -641,7 +650,7 @@ def 
test_where_series_complex128(self, fill_val, exp_dtype): values = klass([True, False, True, True]) else: values = klass(x * fill_val for x in [5, 6, 7, 8]) - exp = klass([1 + 1j, values[1], 3 + 3j, values[3]]) + exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 853c7079a3c1b..ad0d4245d58c3 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1186,6 +1186,21 @@ def test_iloc_setitem_series_duplicate_columns(self): df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) assert df.dtypes.iloc[2] == np.int64 + @pytest.mark.parametrize( + ["dtypes", "init_value", "expected_value"], + [("int64", "0", 0), ("float", "1.2", 1.2)], + ) + def test_iloc_setitem_dtypes_duplicate_columns( + self, dtypes, init_value, expected_value + ): + # GH#22035 + df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) + df.iloc[:, 0] = df.iloc[:, 0].astype(dtypes) + expected_df = DataFrame( + [[expected_value, "str", "str2"]], columns=["a", "b", "b"] + ) + tm.assert_frame_equal(df, expected_df) + class TestILocCallable: def test_frame_iloc_getitem_callable(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 97b3412ce626e..11391efde4956 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2700,3 +2700,13 @@ def test_loc_setitem(self, string_series): string_series.loc[d2] = 6 assert string_series[d1] == 4 assert string_series[d2] == 6 + + @pytest.mark.parametrize("dtype", ["object", "string"]) + def test_loc_assign_dict_to_row(self, dtype): + # GH41044 + df = DataFrame({"A": ["abc", "def"], "B": ["ghi", "jkl"]}, dtype=dtype) + df.loc[0, :] = {"A": "newA", "B": "newB"} + + expected = DataFrame({"A": ["newA", "def"], "B": ["newB", "jkl"]}, dtype=dtype) + + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 0062d5aa34319..21299d76eaf5a 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,6 +3,7 @@ in core.internals """ +import pandas as pd from pandas.core import internals from pandas.core.internals import api @@ -44,3 +45,12 @@ def test_namespace(): result = [x for x in dir(internals) if not x.startswith("__")] assert set(result) == set(expected + modules) + + +def test_make_block_2d_with_dti(): + # GH#41168 + dti = pd.date_range("2012", periods=3, tz="UTC") + blk = api.make_block(dti, placement=[0]) + + assert blk.shape == (1, 3) + assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a1c5810ba8bb8..08dba5aa76a2f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -545,7 +545,7 @@ def test_astype(self, t): mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") t = np.dtype(t) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, check_stacklevel=False): tmgr = mgr.astype(t, errors="ignore") assert tmgr.iget(2).dtype.type == t assert tmgr.iget(4).dtype.type == t @@ -618,10 +618,10 @@ def _compare(old_mgr, new_mgr): assert new_mgr.iget(8).dtype == np.float16 def test_invalid_ea_block(self): - with pytest.raises(AssertionError, match="block.size != values.size"): + with pytest.raises(ValueError, 
match="need to split"): create_mgr("a: category; b: category") - with pytest.raises(AssertionError, match="block.size != values.size"): + with pytest.raises(ValueError, match="need to split"): create_mgr("a: category2; b: category2") def test_interleave(self): @@ -823,7 +823,7 @@ def test_equals_block_order_different_dtypes(self, mgr_string): def test_single_mgr_ctor(self): mgr = create_single_mgr("f8", num_rows=5) - assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] + assert mgr.external_values().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, value): @@ -837,6 +837,12 @@ def test_validate_bool_args(self, value): bm1.replace_list([1], [2], inplace=value) +def _as_array(mgr): + if mgr.ndim == 1: + return mgr.external_values() + return mgr.as_array() + + class TestIndexing: # Nosetests-style data-driven tests. # @@ -859,7 +865,7 @@ class TestIndexing: @pytest.mark.parametrize("mgr", MANAGERS) def test_get_slice(self, mgr): def assert_slice_ok(mgr, axis, slobj): - mat = mgr.as_array() + mat = _as_array(mgr) # we maybe using an ndarray to test slicing and # might not be the full length of the axis @@ -881,7 +887,7 @@ def assert_slice_ok(mgr, axis, slobj): mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( - mat[mat_slobj], sliced.as_array(), check_dtype=False + mat[mat_slobj], _as_array(sliced), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) @@ -919,10 +925,10 @@ def assert_slice_ok(mgr, axis, slobj): @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer): - mat = mgr.as_array() + mat = _as_array(mgr) taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal( - np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + np.take(mat, indexer, axis), _as_array(taken), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) @@ -940,13 +946,13 @@ def assert_take_ok(mgr, axis, indexer): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_axis(self, fill_value, mgr): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) indexer = mgr.axes[axis].get_indexer_for(new_labels) reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) tm.assert_numpy_array_equal( algos.take_nd(mat, indexer, axis, fill_value=fill_value), - reindexed.as_array(), + _as_array(reindexed), check_dtype=False, ) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -971,13 +977,13 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_indexer(self, fill_value, mgr): def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) reindexed = mgr.reindex_indexer( new_labels, indexer, axis, fill_value=fill_value ) tm.assert_numpy_array_equal( - reindexed_mat, reindexed.as_array(), check_dtype=False + reindexed_mat, _as_array(reindexed), check_dtype=False ) tm.assert_index_equal(reindexed.axes[axis], new_labels) diff --git a/pandas/tests/io/data/excel/testmultiindex.ods b/pandas/tests/io/data/excel/testmultiindex.ods index deb88bdad1694..dca8d70abdc24 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.ods and 
b/pandas/tests/io/data/excel/testmultiindex.ods differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xls b/pandas/tests/io/data/excel/testmultiindex.xls index 08dc78ea34d56..c91698be29b13 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xls and b/pandas/tests/io/data/excel/testmultiindex.xls differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsb b/pandas/tests/io/data/excel/testmultiindex.xlsb index f5f62d305640f..a693e0c66afc2 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsb and b/pandas/tests/io/data/excel/testmultiindex.xlsb differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsm b/pandas/tests/io/data/excel/testmultiindex.xlsm index 8bd16b016608c..5a2a4ea35f0d9 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsm and b/pandas/tests/io/data/excel/testmultiindex.xlsm differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsx b/pandas/tests/io/data/excel/testmultiindex.xlsx index 56fc6f20b711a..a6174445bb83a 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsx and b/pandas/tests/io/data/excel/testmultiindex.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 9010f978d268d..62f567457c3ab 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,4 +1,5 @@ from pathlib import Path +import re import numpy as np import pytest @@ -109,6 +110,66 @@ def test_write_append_mode(ext, mode, expected): assert wb2.worksheets[index]["A1"].value == cell_value +@pytest.mark.parametrize( + "if_sheet_exists,num_sheets,expected", + [ + ("new", 2, ["apple", "banana"]), + ("replace", 1, ["pear"]), + ], +) +def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected): + # GH 40230 + df1 = DataFrame({"fruit": ["apple", "banana"]}) + df2 = DataFrame({"fruit": ["pear"]}) + + with tm.ensure_clean(ext) as f: + df1.to_excel(f, engine="openpyxl", sheet_name="foo", index=False) + with ExcelWriter( + f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists + ) as writer: + df2.to_excel(writer, sheet_name="foo", index=False) + + wb = openpyxl.load_workbook(f) + assert len(wb.sheetnames) == num_sheets + assert wb.sheetnames[0] == "foo" + result = pd.read_excel(wb, "foo", engine="openpyxl") + assert list(result["fruit"]) == expected + if len(wb.sheetnames) == 2: + result = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl") + tm.assert_frame_equal(result, df2) + wb.close() + + +@pytest.mark.parametrize( + "if_sheet_exists,msg", + [ + ( + "invalid", + "'invalid' is not valid for if_sheet_exists. 
Valid options " + "are 'error', 'new' and 'replace'.", + ), + ( + "error", + "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.", + ), + ( + None, + "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.", + ), + ], +) +def test_if_sheet_exists_raises(ext, if_sheet_exists, msg): + # GH 40230 + df = DataFrame({"fruit": ["pear"]}) + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=re.escape(msg)): + df.to_excel(f, "foo", engine="openpyxl") + with ExcelWriter( + f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists + ) as writer: + df.to_excel(writer, sheet_name="foo") + + def test_to_excel_with_openpyxl_engine(ext): # GH 29854 with tm.ensure_clean(ext) as filename: @@ -175,7 +236,9 @@ def test_append_mode_file(ext): with tm.ensure_clean(ext) as f: df.to_excel(f, engine="openpyxl") - with ExcelWriter(f, mode="a", engine="openpyxl") as writer: + with ExcelWriter( + f, mode="a", engine="openpyxl", if_sheet_exists="new" + ) as writer: df.to_excel(writer) # make sure that zip files are not concatenated by making sure that diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 382c8412ab050..c4b3221e1d3a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1193,6 +1193,17 @@ def test_one_col_noskip_blank_line(self, read_ext): result = pd.read_excel(file_name) tm.assert_frame_equal(result, expected) + def test_multiheader_two_blank_lines(self, read_ext): + # GH 40442 + file_name = "testmultiindex" + read_ext + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] + expected = DataFrame(data, columns=columns) + result = pd.read_excel( + file_name, sheet_name="mi_column_empty_rows", header=[0, 1] + ) + tm.assert_frame_equal(result, expected) + class TestExcelFileRead: @pytest.fixture(autouse=True) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index cce8c3d01025d..67a78f2b1de76 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1325,6 +1325,14 @@ def test_excel_duplicate_columns_with_names(self, path): expected = DataFrame([[0, 10, 0], [1, 11, 1]], columns=["A", "B", "A.1"]) tm.assert_frame_equal(result, expected) + def test_if_sheet_exists_raises(self, ext): + # GH 40230 + msg = "if_sheet_exists is only valid in append mode (mode='a')" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=re.escape(msg)): + ExcelWriter(f, if_sheet_exists="replace") + class TestExcelWriterEngineTests: @pytest.mark.parametrize( diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index eadb90839c74d..3b614be770bc5 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -541,7 +541,8 @@ def f(x): style1 = "foo: bar" - result = self.df.style.where(f, style1)._compute().ctx + with tm.assert_produces_warning(FutureWarning): + result = self.df.style.where(f, style1)._compute().ctx expected = { (r, c): [("foo", "bar")] for r, row in enumerate(self.df.index) @@ -568,14 +569,15 @@ def f(x): style1 = "foo: bar" style2 = "baz: foo" - result = self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + with tm.assert_produces_warning(FutureWarning): + res = self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx expected = { (r, c): [("foo", "bar") if f(self.df.loc[row, col]) else 
("baz", "foo")] for r, row in enumerate(self.df.index) for c, col in enumerate(self.df.columns) if row in self.df.loc[slice_].index and col in self.df.loc[slice_].columns } - assert result == expected + assert res == expected def test_where_subset_compare_with_applymap(self): # GH 17474 @@ -597,9 +599,10 @@ def g(x): ] for slice_ in slices: - result = ( - self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx - ) + with tm.assert_produces_warning(FutureWarning): + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) expected = self.df.style.applymap(g, subset=slice_)._compute().ctx assert result == expected @@ -609,14 +612,15 @@ def test_where_kwargs(self): def f(x, val): return x > val - result = df.style.where(f, "color:green;", "color:red;", val=2)._compute().ctx + with tm.assert_produces_warning(FutureWarning): + res = df.style.where(f, "color:green;", "color:red;", val=2)._compute().ctx expected = { (0, 0): [("color", "red")], (0, 1): [("color", "red")], (1, 0): [("color", "green")], (1, 1): [("color", "green")], } - assert result == expected + assert res == expected def test_empty(self): df = DataFrame({"A": [1, 0]}) diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 24d1973eeda6d..f0d5ef19c4468 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,8 +3,6 @@ import pandas._config.config as cf -import pandas.util._test_decorators as td - import pandas as pd import pandas.io.formats.format as fmt @@ -121,7 +119,6 @@ def test_ambiguous_width(self): assert adjoined == expected -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 8c69ffedf1df4..febeb4d690562 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -7,8 +7,6 @@ import pandas as pd import pandas._testing as tm -pytestmark = td.skip_array_manager_not_yet_implemented - def test_compression_roundtrip(compression): df = pd.DataFrame( diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py index 7367aaefb1c1e..79245bc9d34a8 100644 --- a/pandas/tests/io/json/test_deprecated_kwargs.py +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -2,15 +2,11 @@ Tests for the deprecated keyword arguments for `read_json`. 
""" -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm from pandas.io.json import read_json -pytestmark = td.skip_array_manager_not_yet_implemented - def test_deprecated_kwargs(): df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2]) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 9d955545aede3..71f1d03ea6d1f 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,8 +6,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -26,8 +24,6 @@ set_default_names, ) -pytestmark = td.skip_array_manager_not_yet_implemented - class TestBuildSchema: def setup_method(self, method): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index c01660ab5febe..a428d8c71a793 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -15,8 +15,6 @@ from pandas.io.json._normalize import nested_to_record -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def deep_nested(): @@ -153,6 +151,8 @@ def test_simple_records(self): tm.assert_frame_equal(result, expected) + # TODO(ArrayManager) sanitize S/U numpy dtypes to object + @td.skip_array_manager_not_yet_implemented def test_simple_normalize(self, state_data): result = json_normalize(state_data[0], "counties") expected = DataFrame(state_data[0]["counties"]) @@ -372,6 +372,8 @@ def test_meta_parameter_not_modified(self): for val in ["metafoo", "metabar", "foo", "bar"]: assert val in result + # TODO(ArrayManager) sanitize S/U numpy dtypes to object + @td.skip_array_manager_not_yet_implemented def test_record_prefix(self, state_data): result = json_normalize(state_data[0], "counties") expected = DataFrame(state_data[0]["counties"]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3bd78d44a0b04..3cc77aa723fe9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -12,6 +12,7 @@ from pandas.compat import ( IS64, PY38, + PY310, is_platform_windows, ) import pandas.util._test_decorators as td @@ -27,8 +28,7 @@ ) import pandas._testing as tm -pytestmark = td.skip_array_manager_not_yet_implemented - +pytestmark = pytest.mark.skipif(PY310, reason="timeout with coverage") _seriesd = tm.getSeriesData() @@ -318,7 +318,13 @@ def test_roundtrip_mixed(self, request, orient, convert_axes, numpy): '{"columns":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', - r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)", + "|".join( + [ + r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)", + "Passed arrays should have the same length as the rows Index: " + "3 vs 2 rows", + ] + ), "split", ), # too many columns diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 711addb1ac237..abc65f2f1eda1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -3,8 +3,6 @@ import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -14,8 +12,6 @@ from pandas.io.json._json import JsonReader -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def lines_json_df(): @@ -196,7 +192,7 @@ def 
test_readjson_chunks_multiple_empty_lines(chunksize): def test_readjson_unicode(monkeypatch): with tm.ensure_clean("test.json") as path: - monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + monkeypatch.setattr("locale.getpreferredencoding", lambda l: "cp949") with open(path, "w", encoding="utf-8") as f: f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c0337d1ad3ffe..805f6b8dbe461 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -18,7 +18,6 @@ IS64, is_platform_windows, ) -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -32,8 +31,6 @@ ) import pandas._testing as tm -pytestmark = td.skip_array_manager_not_yet_implemented - def _clean_dict(d): """ @@ -831,65 +828,65 @@ def test_0d_array(self): ( [{}, []], ValueError, - "nesting not supported for object or variable length dtypes", + r"nesting not supported for object or variable length dtypes", {}, ), ( [42, None], TypeError, - "int() argument must be a string, a bytes-like object or a number, " - "not 'NoneType'", + r"int\(\) argument must be a string, a bytes-like object or a( real)? " + r"number, not 'NoneType'", {}, ), ( [["a"], 42], ValueError, - "Cannot decode multidimensional arrays with variable length elements " - "to numpy", + r"Cannot decode multidimensional arrays with variable length elements " + r"to numpy", {}, ), ( [42, {}, "a"], TypeError, - "int() argument must be a string, a bytes-like object or a number, " - "not 'dict'", + r"int\(\) argument must be a string, a bytes-like object or a( real)? " + r"number, not 'dict'", {}, ), ( [42, ["a"], 42], ValueError, - "invalid literal for int() with base 10: 'a'", + r"invalid literal for int\(\) with base 10: 'a'", {}, ), ( ["a", "b", [], "c"], ValueError, - "nesting not supported for object or variable length dtypes", + r"nesting not supported for object or variable length dtypes", {}, ), ( [{"a": "b"}], ValueError, - "Cannot decode multidimensional arrays with variable length elements " - "to numpy", + r"Cannot decode multidimensional arrays with variable length elements " + r"to numpy", {"labelled": True}, ), ( {"a": {"b": {"c": 42}}}, ValueError, - "labels only supported up to 2 dimensions", + r"labels only supported up to 2 dimensions", {"labelled": True}, ), ( [{"a": 42, "b": 23}, {"c": 17}], ValueError, - "cannot reshape array of size 3 into shape (2,1)", + r"cannot reshape array of size 3 into shape \(2,1\)", {"labelled": True}, ), ], ) def test_array_numpy_except(self, bad_input, exc_type, err_msg, kwargs): - with pytest.raises(exc_type, match=re.escape(err_msg)): + with pytest.raises(exc_type, match=err_msg): ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs) def test_array_numpy_labelled(self): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 6d5aeaa713687..ceb770ce72b78 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -162,6 +162,7 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.slow @pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 
f8aff3ad3696a..044af57f49240 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -159,6 +159,7 @@ def test_unsupported_dtype(c_parser_only, match, kwargs): @td.skip_if_32bit +@pytest.mark.slow def test_precise_conversion(c_parser_only): from decimal import Decimal @@ -300,6 +301,7 @@ def test_tokenize_CR_with_quoting(c_parser_only): tm.assert_frame_equal(result, expected) +@pytest.mark.slow def test_grow_boundary_at_cap(c_parser_only): # See gh-12494 # diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 89ece3b1a7300..006438df2a5e0 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -220,3 +220,20 @@ def test_parse_encoded_special_characters(encoding): expected = DataFrame(data=[[":foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) +def test_encoding_memory_map(all_parsers, encoding): + # GH40986 + parser = all_parsers + expected = DataFrame( + { + "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"], + "mask": ["red", "purple", "orange", "blue"], + "weapon": ["sai", "bo staff", "nunchunk", "katana"], + } + ) + with tm.ensure_clean() as file: + expected.to_csv(file, index=False, encoding=encoding) + df = parser.read_csv(file, encoding=encoding, memory_map=True) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index f15fc16fbce38..3b814360d3aa4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -389,6 +389,17 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) +def test_header_multi_index_blank_line(all_parsers): + # GH 40442 + parser = all_parsers + data = [[None, None], [1, 2], [3, 4]] + columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) + expected = DataFrame(data, columns=columns) + data = "a,b\nA,B\n,\n1,2\n3,4" + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 104cf56419bfd..7f84c5e378d16 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -21,6 +21,7 @@ TextFileReader, read_csv, ) +from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs class TestTextReader: @@ -206,6 +207,8 @@ def test_numpy_string_dtype(self): aaaaa,5""" def _make_reader(**kwds): + if "dtype" in kwds: + kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) return TextReader(StringIO(data), delimiter=",", header=None, **kwds) reader = _make_reader(dtype="S5,i4") @@ -233,6 +236,8 @@ def test_pass_dtype(self): 4,d""" def _make_reader(**kwds): + if "dtype" in kwds: + kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) return TextReader(StringIO(data), delimiter=",", **kwds) reader = _make_reader(dtype={"one": "u1", 1: "S1"}) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index e20d78effa931..3b6bfee8f9657 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -39,6 +39,7 @@ def setup_method(self, datapath): df.iloc[:, k] = df.iloc[:, 
k].astype(np.float64) self.data.append(df) + @pytest.mark.slow def test_from_file(self): for j in 0, 1: df0 = self.data[j] @@ -47,6 +48,7 @@ def test_from_file(self): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) + @pytest.mark.slow def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] @@ -61,6 +63,7 @@ def test_from_buffer(self): df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) + @pytest.mark.slow def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] @@ -72,6 +75,7 @@ def test_from_iterator(self): df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) + @pytest.mark.slow def test_path_pathlib(self): for j in 0, 1: df0 = self.data[j] @@ -81,6 +85,7 @@ def test_path_pathlib(self): tm.assert_frame_equal(df, df0) @td.skip_if_no("py.path") + @pytest.mark.slow def test_path_localpath(self): from py.path import local as LocalPath @@ -91,6 +96,7 @@ def test_path_localpath(self): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) + @pytest.mark.slow def test_iterator_loop(self): # github #13654 for j in 0, 1: diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index a8713f5bf36c9..5d3e3b8e23cdb 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -34,6 +34,7 @@ def setup_method(self, datapath): with td.file_leak_context(): yield + @pytest.mark.slow def test1_basic(self): # Tests with DEMO_G.xpt (all numeric file) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d882eb930137b..c918832df7aee 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -282,9 +282,7 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_excel", {"engine": "xlwt"}, "xlwt"), ("to_feather", {}, "pyarrow"), ("to_html", {}, "os"), - pytest.param( - "to_json", {}, "os", marks=td.skip_array_manager_not_yet_implemented - ), + ("to_json", {}, "os"), ("to_latex", {}, "os"), ("to_pickle", {}, "os"), ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), @@ -435,10 +433,7 @@ def test_is_fsspec_url(): @pytest.mark.parametrize("encoding", [None, "utf-8"]) -@pytest.mark.parametrize( - "format", - ["csv", pytest.param("json", marks=td.skip_array_manager_not_yet_implemented)], -) +@pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = tm.makeDataFrame() diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 66c238bbd0962..6c90830639061 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -8,15 +8,11 @@ import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm import pandas.io.common as icom -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c1ba625610f13..eccfab3a31241 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -249,7 +249,6 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON def test_json_options(fsspectest, compression): df = DataFrame({"a": [0]}) df.to_json( diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 2ba4fbe71e244..887889bce1eaa 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -44,16 +44,7 
@@ def ls(self, path, **kwargs): @td.skip_if_no("gcsfs") -@pytest.mark.parametrize( - "format", - [ - "csv", - pytest.param("json", marks=td.skip_array_manager_not_yet_implemented), - "parquet", - "excel", - "markdown", - ], -) +@pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) def test_to_read_gcs(gcs_buffer, format): """ Test that many to/read functions support GCS. diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a1f9c6f6af51a..f34e9b940317d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,7 +9,6 @@ from pandas import read_orc import pandas._testing as tm -pytest.importorskip("pyarrow", minversion="0.13.0") pytest.importorskip("pyarrow.orc") pytestmark = pytest.mark.filterwarnings( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 21ea2bd560060..7cc7acd9007fa 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import get_option + from pandas.compat import ( PY38, is_platform_windows, @@ -41,12 +43,12 @@ _HAVE_FASTPARQUET = False -pytestmark = [ - pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning"), - # TODO(ArrayManager) fastparquet / pyarrow rely on BlockManager internals - td.skip_array_manager_not_yet_implemented, -] +pytestmark = pytest.mark.filterwarnings( + "ignore:RangeIndex.* is deprecated:DeprecationWarning" +) + +# TODO(ArrayManager) fastparquet relies on BlockManager internals # setup engines & skips @pytest.fixture( @@ -54,7 +56,8 @@ pytest.param( "fastparquet", marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET, reason="fastparquet is not installed" + not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array", + reason="fastparquet is not installed or ArrayManager is used", ), ), pytest.param( @@ -80,6 +83,8 @@ def pa(): def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") + elif get_option("mode.data_manager") == "array": + pytest.skip("ArrayManager is not supported with fastparquet") return "fastparquet" @@ -923,6 +928,18 @@ def test_filter_row_groups(self, pa): ) assert len(result) == 1 + def test_read_parquet_manager(self, pa, using_array_manager): + # ensure that read_parquet honors the pandas.options.mode.data_manager option + df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"]) + + with tm.ensure_clean() as path: + df.to_parquet(path, pa) + result = read_parquet(path, pa) + if using_array_manager: + assert isinstance(result._mgr, pd.core.internals.ArrayManager) + else: + assert isinstance(result._mgr, pd.core.internals.BlockManager) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 04ddef57a9621..290e063a59be7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -52,7 +52,9 @@ import pandas.io.sql as sql from pandas.io.sql import ( + SQLAlchemyEngine, _gt14, + get_engine, read_sql_query, read_sql_table, ) @@ -575,6 +577,23 @@ def sample(pd_table, conn, keys, data_iter): # Nuke table self.drop_table("test_frame1") + def _to_sql_with_sql_engine(self, engine="auto", **engine_kwargs): + """`to_sql` with the `engine` param""" + # mostly copied from this class's `_to_sql()` method + self.drop_table("test_frame1") + + self.pandasSQL.to_sql( + self.test_frame1, "test_frame1", engine=engine, **engine_kwargs + ) + assert 
self.pandasSQL.has_table("test_frame1") + + num_entries = len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + assert num_rows == num_entries + + # Nuke table + self.drop_table("test_frame1") + def _roundtrip(self): self.drop_table("test_frame_roundtrip") self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") @@ -2053,6 +2072,41 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) + # -- SQL Engine tests (in the base class for now) + def test_invalid_engine(self): + msg = "engine must be one of 'auto', 'sqlalchemy'" + with pytest.raises(ValueError, match=msg): + self._to_sql_with_sql_engine("bad_engine") + + def test_options_sqlalchemy(self): + # use the set option + + with pd.option_context("io.sql.engine", "sqlalchemy"): + self._to_sql_with_sql_engine() + + def test_options_auto(self): + # use the set option + + with pd.option_context("io.sql.engine", "auto"): + self._to_sql_with_sql_engine() + + def test_options_get_engine(self): + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "auto"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + def test_get_engine_auto_error_message(self): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. are installed but bad version + pass + # TODO fill this in when we add more engines + class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): def test_transactions(self): diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py index d7d3768fe3371..cabdbbdb44830 100644 --- a/pandas/tests/io/test_user_agent.py +++ b/pandas/tests/io/test_user_agent.py @@ -182,12 +182,7 @@ def do_GET(self): "responder, read_method, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, None), - pytest.param( - JSONUserAgentResponder, - pd.read_json, - None, - marks=td.skip_array_manager_not_yet_implemented, - ), + (JSONUserAgentResponder, pd.read_json, None), (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), pytest.param( ParquetFastParquetUserAgentResponder, @@ -199,12 +194,7 @@ def do_GET(self): (PickleUserAgentResponder, pd.read_pickle, None), (StataUserAgentResponder, pd.read_stata, None), (GzippedCSVUserAgentResponder, pd.read_csv, None), - pytest.param( - GzippedJSONUserAgentResponder, - pd.read_json, - None, - marks=td.skip_array_manager_not_yet_implemented, - ), + (GzippedJSONUserAgentResponder, pd.read_json, None), ], ) def test_server_and_default_headers(responder, read_method, parquet_engine): @@ -233,12 +223,7 @@ def test_server_and_default_headers(responder, read_method, parquet_engine): "responder, read_method, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, None), - pytest.param( - JSONUserAgentResponder, - pd.read_json, - None, - marks=td.skip_array_manager_not_yet_implemented, - ), + (JSONUserAgentResponder, pd.read_json, None), (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), pytest.param( ParquetFastParquetUserAgentResponder, @@ -250,12 +235,7 @@ def test_server_and_default_headers(responder, read_method, parquet_engine): (PickleUserAgentResponder, pd.read_pickle, None), (StataUserAgentResponder, pd.read_stata, None), (GzippedCSVUserAgentResponder, pd.read_csv, None), - pytest.param( - 
GzippedJSONUserAgentResponder, - pd.read_json, - None, - marks=td.skip_array_manager_not_yet_implemented, - ), + (GzippedJSONUserAgentResponder, pd.read_json, None), ], ) def test_server_and_custom_headers(responder, read_method, parquet_engine): diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 5f22115362ce4..c588c3c3ca0bd 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1172,7 +1172,6 @@ def test_style_to_string(): assert out_xml == out_str -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON @td.skip_if_no("lxml") def test_style_to_json(): xsl = """\ diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index 6844124d15f9d..a9b691f2a42b9 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -546,7 +546,13 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): df = DataFrame(np.random.randn(5, 5)) bp = df.plot.box(return_type="dict") - _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) + _check_colors( + bp, + default_colors[0], + default_colors[0], + default_colors[2], + default_colors[0], + ) tm.close() dict_colors = { @@ -569,7 +575,7 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): # partial colors dict_colors = {"whiskers": "c", "medians": "m"} bp = df.plot.box(color=dict_colors, return_type="dict") - _check_colors(bp, default_colors[0], "c", "m") + _check_colors(bp, default_colors[0], "c", "m", default_colors[0]) tm.close() from matplotlib import cm @@ -577,12 +583,12 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): # Test str -> colormap functionality bp = df.plot.box(colormap="jet", return_type="dict") jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2], jet_colors[0]) tm.close() # Test colormap functionality bp = df.plot.box(colormap=cm.jet, return_type="dict") - _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2], jet_colors[0]) tm.close() # string color is applied to all artists except fliers diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 448679d562a4a..dbceeae44a493 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -195,6 +195,39 @@ def test_color_kwd(self, colors_kwd, expected): for k, v in expected.items(): assert result[k][0].get_color() == v + @pytest.mark.parametrize( + "scheme,expected", + [ + ( + "dark_background", + { + "boxes": "#8dd3c7", + "whiskers": "#8dd3c7", + "medians": "#bfbbd9", + "caps": "#8dd3c7", + }, + ), + ( + "default", + { + "boxes": "#1f77b4", + "whiskers": "#1f77b4", + "medians": "#2ca02c", + "caps": "#1f77b4", + }, + ), + ], + ) + def test_colors_in_theme(self, scheme, expected): + # GH: 40769 + df = DataFrame(np.random.rand(10, 2)) + import matplotlib.pyplot as plt + + plt.style.use(scheme) + result = df.plot.box(return_type="dict") + for k, v in expected.items(): + assert result[k][0].get_color() == v + @pytest.mark.parametrize( "dict_colors, msg", [({"boxes": "r", "invalid_key": "r"}, "invalid key 'invalid_key'")], diff --git a/pandas/tests/resample/test_datetime_index.py 
b/pandas/tests/resample/test_datetime_index.py index 71e6aa38d60e5..66cb2f2291e98 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -58,14 +58,16 @@ def test_custom_grouper(index): g = s.groupby(b) # check all cython functions work - funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] + g.ohlc() # doesn't use _cython_agg_general + funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) b = Grouper(freq=Minute(5), closed="right", label="right") g = s.groupby(b) # check all cython functions work - funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] + g.ohlc() # doesn't use _cython_agg_general + funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) @@ -79,7 +81,7 @@ def test_custom_grouper(index): idx = DatetimeIndex(idx, freq="5T") expect = Series(arr, index=idx) - # GH2763 - return in put dtype if we can + # GH2763 - return input dtype if we can result = g.agg(np.sum) tm.assert_series_equal(result, expect) @@ -1204,6 +1206,9 @@ def test_resample_median_bug_1688(): result = df.resample("T").apply(lambda x: x.mean()) exp = df.asfreq("T") + if dtype == "float32": + # TODO: Empty groups cause x.mean() to return float64 + exp = exp.astype("float64") tm.assert_frame_equal(result, exp) result = df.resample("T").median() @@ -1684,6 +1689,8 @@ def f(data, add_arg): df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) expected = df.groupby("A").resample("D").mean().multiply(multiplier) + # TODO: GH 41137 + expected = expected.astype("float64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 999d8a6c90ba2..3e78d6ebf4c0c 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -258,6 +258,8 @@ def f(x): return x.resample("2s").apply(lambda y: y.sum()) result = g.apply(f) + # y.sum() results in int64 instead of int32 on 32-bit architectures + expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -289,7 +291,7 @@ def test_apply_columns_multilevel(): agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) expected = DataFrame( - np.array([0] * 4).reshape(2, 2), + 2 * [[0, 0.0]], index=date_range(start="2017-01-01", freq="1H", periods=2), columns=pd.MultiIndex.from_tuples( [("A", "a", "", "one"), ("B", "b", "i", "two")] @@ -354,11 +356,15 @@ def test_apply_to_one_column_of_df(): {"col": range(10), "col1": range(10, 20)}, index=date_range("2012-01-01", periods=10, freq="20min"), ) + + # access "col" via getattr -> make sure we handle AttributeError result = df.resample("H").apply(lambda group: group.col.sum()) expected = Series( [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="H") ) tm.assert_series_equal(result, expected) + + # access "col" via __getitem__ -> make sure we handle KeyError result = df.resample("H").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index c6ee295208607..b1560623cd871 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -162,7 +162,7 @@ def
test_resample_with_timedelta_yields_no_empty_groups(): result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) expected = DataFrame( - [[768.0] * 4] * 12 + [[528.0] * 4], + [[768] * 4] * 12 + [[528] * 4], index=timedelta_range(start="1s", periods=13, freq="3s"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 44299d51a878f..62fd93026d5e2 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -559,6 +559,8 @@ def test_crosstab_with_numpy_size(self): expected = DataFrame( expected_data, index=expected_index, columns=expected_column ) + # aggfunc is np.size, resulting in integers + expected["All"] = expected["All"].astype("int64") tm.assert_frame_equal(result, expected) def test_crosstab_duplicate_names(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3d1c3b81c492f..2276281e3ecf8 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -986,7 +986,6 @@ def test_margins_dtype(self): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)") def test_margins_dtype_len(self): mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 7996c15ae8e64..c12d28f6f1380 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -293,8 +293,8 @@ def test_qcut_bool_coercion_to_int(bins, box, compare): @pytest.mark.parametrize("q", [2, 5, 10]) -def test_qcut_nullable_integer(q, any_nullable_int_dtype): - arr = pd.array(np.arange(100), dtype=any_nullable_int_dtype) +def test_qcut_nullable_integer(q, any_nullable_numeric_dtype): + arr = pd.array(np.arange(100), dtype=any_nullable_numeric_dtype) arr[::2] = pd.NA result = qcut(arr, q) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 663892cefb5e6..2340d154e9e10 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -10,6 +10,7 @@ import pytest import pytz +from pandas.compat import PY310 from pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -223,7 +224,11 @@ def test_constructor_tz_or_tzinfo(self): def test_constructor_positional(self): # see gh-10758 - msg = "an integer is required" + msg = ( + "'NoneType' object cannot be interpreted as an integer" + if PY310 + else "an integer is required" + ) with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) @@ -331,7 +336,9 @@ def test_constructor_fromordinal(self): tz="UTC", ), Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + # error: Argument 9 to "Timestamp" has incompatible type "_UTCclass"; + # expected "Optional[int]" + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), # type: ignore[arg-type] ], ) def test_constructor_nanosecond(self, result): diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 528e95f65c8f4..6185fe6c54fa4 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -40,6 +40,26 @@ def test_clip_types_and_nulls(self): assert list(isna(s)) == list(isna(lower)) assert list(isna(s)) == list(isna(upper)) + def 
test_series_clipping_with_na_values( + self, any_nullable_numeric_dtype, nulls_fixture + ): + # Ensure that clipping method can handle NA values without failing + # GH#40581 + + s = Series([nulls_fixture, 1.0, 3.0], dtype=any_nullable_numeric_dtype) + s_clipped_upper = s.clip(upper=2.0) + s_clipped_lower = s.clip(lower=2.0) + + expected_upper = Series( + [nulls_fixture, 1.0, 2.0], dtype=any_nullable_numeric_dtype + ) + expected_lower = Series( + [nulls_fixture, 2.0, 3.0], dtype=any_nullable_numeric_dtype + ) + + tm.assert_series_equal(s_clipped_upper, expected_upper) + tm.assert_series_equal(s_clipped_lower, expected_lower) + def test_clip_with_na_args(self): """Should process np.nan argument as None """ # GH#17276 @@ -49,8 +69,13 @@ def test_clip_with_na_args(self): tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH#19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) + tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3])) + tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1])) + + # GH#40420 + s = Series([1, 2, 3]) + result = s.clip(0, [np.nan, np.nan, np.nan]) + tm.assert_series_equal(s, result) def test_clip_against_series(self): # GH#6966 diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 0b6939a0997a4..72b6b7527f57f 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -28,6 +28,17 @@ nanops, ops, ) +from pandas.core.computation import expressions as expr + + +@pytest.fixture( + autouse=True, scope="module", params=[0, 1000000], ids=["numexpr", "python"] +) +def switch_numexpr_min_elements(request): + _MIN_ELEMENTS = expr._MIN_ELEMENTS + expr._MIN_ELEMENTS = request.param + yield request.param + expr._MIN_ELEMENTS = _MIN_ELEMENTS def _permute(obj): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 82961a42e4ff0..67649e6e37b35 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1341,7 +1341,7 @@ def test_constructor_dtype_timedelta64(self): # td.astype('m8[%s]' % t) # valid astype - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # astype(int64) deprecated td.astype("int64") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index ab95b2071ae10..0c54042d983ad 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Index, @@ -12,79 +14,118 @@ ) -def test_contains(): +@pytest.fixture( + params=[ + "object", + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def any_string_dtype(request): + """ + Parametrized fixture for string dtypes. 
+ * 'object' + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + return request.param + + +def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) - values = Series(values) + values = Series(values, dtype=any_string_dtype) pat = "mmm[_]+" result = values.str.contains(pat) - expected = Series(np.array([False, np.nan, True, True, False], dtype=np.object_)) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series(np.array([False, np.nan, False, False, True], dtype=np.object_)) + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) - values = Series(np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)) + values = Series( + np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object), + dtype=any_string_dtype, + ) result = values.str.contains(pat) - expected = Series(np.array([False, False, True, True])) - assert result.dtype == np.bool_ + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive using regex - values = Series(np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)) + values = Series( + np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), + dtype=any_string_dtype, + ) result = values.str.contains("FOO|mmm", case=False) - expected = Series(np.array([True, False, True, True])) + expected = Series(np.array([True, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) # case insensitive without regex - result = Series(values).str.contains("foo", regex=False, case=False) - expected = Series(np.array([True, False, True, False])) + result = values.str.contains("foo", regex=False, case=False) + expected = Series(np.array([True, False, True, False]), dtype=expected_dtype) tm.assert_series_equal(result, expected) - # mixed + # unicode + values = Series( + np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_), + dtype=any_string_dtype, + ) + pat = "mmm[_]+" + + result = values.str.contains(pat) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype + ) + tm.assert_series_equal(result, expected) + + result = values.str.contains(pat, na=False) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + values = Series( + np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), + dtype=any_string_dtype, + ) + result = values.str.contains(pat) + expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_contains_object_mixed(): mixed = Series( np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) ) - rs = mixed.str.contains("o") - xp = Series( + result = mixed.str.contains("o") + expected = Series( np.array( 
[False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], dtype=np.object_, ) ) - tm.assert_series_equal(rs, xp) - - rs = mixed.str.contains("o") - xp = Series([False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) - - # unicode - values = Series(np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_)) - pat = "mmm[_]+" - - result = values.str.contains(pat) - expected = Series(np.array([False, np.nan, True, True], dtype=np.object_)) - tm.assert_series_equal(result, expected) - - result = values.str.contains(pat, na=False) - expected = Series(np.array([False, False, True, True])) - tm.assert_series_equal(result, expected) - - values = Series(np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_)) - result = values.str.contains(pat) - expected = Series(np.array([False, False, True, True])) - assert result.dtype == np.bool_ tm.assert_series_equal(result, expected) -def test_contains_for_object_category(): +def test_contains_na_kwarg_for_object_category(): # gh 22158 # na for category @@ -108,6 +149,29 @@ def test_contains_for_object_category(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "na, expected", + [ + (None, pd.NA), + (True, True), + (False, False), + (0, False), + (3, True), + (np.nan, pd.NA), + ], +) +@pytest.mark.parametrize("regex", [True, False]) +def test_contains_na_kwarg_for_nullable_string_dtype( + nullable_string_dtype, na, expected, regex +): + # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 + + values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) + result = values.str.contains("a", na=na, regex=regex) + expected = Series([True, False, False, True, expected], dtype="boolean") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) @@ -136,6 +200,25 @@ def test_startswith(dtype, null_value, na): tm.assert_series_equal(rs, xp) +@pytest.mark.parametrize("na", [None, True, False]) +def test_startswith_nullable_string_dtype(nullable_string_dtype, na): + values = Series( + ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], + dtype=nullable_string_dtype, + ) + result = values.str.startswith("foo", na=na) + exp = Series( + [False, na, True, False, False, na, True, False, False], dtype="boolean" + ) + tm.assert_series_equal(result, exp) + + result = values.str.startswith("rege.", na=na) + exp = Series( + [False, na, False, False, False, na, False, False, True], dtype="boolean" + ) + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) @@ -164,6 +247,25 @@ def test_endswith(dtype, null_value, na): tm.assert_series_equal(rs, xp) +@pytest.mark.parametrize("na", [None, True, False]) +def test_endswith_nullable_string_dtype(nullable_string_dtype, na): + values = Series( + ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], + dtype=nullable_string_dtype, + ) + result = values.str.endswith("foo", na=na) + exp = Series( + [False, na, False, False, True, na, True, False, False], dtype="boolean" + ) + tm.assert_series_equal(result, exp) + + result = values.str.endswith("rege.", na=na) + exp = Series( + [False, na, False, False, False, na, False, False, 
True], dtype="boolean" + ) + tm.assert_series_equal(result, exp) + + def test_replace(): values = Series(["fooBAD__barBAD", np.nan]) @@ -508,59 +610,73 @@ def _check(result, expected): tm.assert_series_equal(result, expected) -def test_contains_moar(): +def test_contains_moar(any_string_dtype): # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) + s = Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype=any_string_dtype, + ) result = s.str.contains("a") + expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True] + [False, False, False, True, True, False, np.nan, False, False, True], + dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True] + [True, False, False, True, True, False, np.nan, True, False, True], + dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False] + [False, False, False, True, False, False, np.nan, False, False, False], + dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False] + [False, False, False, True, False, False, np.nan, False, False, False], + dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False] + [False, False, False, True, True, False, np.nan, True, False, False], + dtype=expected_dtype, ) tm.assert_series_equal(result, expected) -def test_contains_nan(): +def test_contains_nan(any_string_dtype): # PR #14171 - s = Series([np.nan, np.nan, np.nan], dtype=np.object_) + s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected = Series([False, False, False], dtype=np.bool_) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na=True) - expected = Series([True, True, True], dtype=np.bool_) + expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na="foo") - expected = Series(["foo", "foo", "foo"], dtype=np.object_) + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + else: + expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -609,14 +725,14 @@ def test_replace_moar(): tm.assert_series_equal(result, expected) -def test_match_findall_flags(): +def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", "Rob": "rob@gmail.com", "Wes": np.nan, } - data = Series(data) + data = Series(data, dtype=any_string_dtype) pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" diff 
--git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 02ccb3a930557..23c9b14c5a36a 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -1,5 +1,3 @@
-import operator
-
 import numpy as np
 import pytest
 
@@ -119,20 +117,3 @@ def test_str_get_stringarray_multiple_nans(nullable_string_dtype):
     result = s.str.get(2)
     expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"], dtype=nullable_string_dtype))
     tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "input, method",
-    [
-        (["a", "b", "c"], operator.methodcaller("capitalize")),
-        (["a b", "a bc. de"], operator.methodcaller("capitalize")),
-    ],
-)
-def test_capitalize(input, method, nullable_string_dtype):
-    a = Series(input, dtype=nullable_string_dtype)
-    b = Series(input, dtype="object")
-    result = method(a.str)
-    expected = method(b.str)
-
-    assert result.dtype.name == nullable_string_dtype
-    tm.assert_series_equal(result.astype(object), expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 75fc7a782772a..964dd9bdd0e0a 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -95,29 +95,32 @@ def test_basic(self):
         exp = np.array(["a", "b", "c"], dtype=object)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(range(5))))
+        arr = np.arange(5, dtype=np.intp)[::-1]
+
+        codes, uniques = algos.factorize(arr)
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
+        exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
-
+        codes, uniques = algos.factorize(arr, sort=True)
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+        exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
+        arr = np.arange(5.0)[::-1]
+
+        codes, uniques = algos.factorize(arr)
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
+        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
+        codes, uniques = algos.factorize(arr, sort=True)
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
+        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
     def test_mixed(self):
@@ -246,6 +249,17 @@ def test_complex_sorting(self):
         with pytest.raises(TypeError, match=msg):
             algos.factorize(x17[::-1], sort=True)
 
+    def test_numeric_dtype_factorize(self, any_real_dtype):
+        # GH41132
+        dtype = any_real_dtype
+        data = np.array([1, 2, 2, 1], dtype=dtype)
+        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([1, 2], dtype=dtype)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
     def test_float64_factorize(self, writable):
         data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
         data.setflags(write=writable)
@@ -1742,14 +1756,15 @@ def _check(arr):
         _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan]))
         _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan]))
 
-    def test_basic(self, writable):
+    @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
+    def test_basic(self, writable, dtype):
         exp = np.array([1, 2], dtype=np.float64)
 
-        for dtype in np.typecodes["AllInteger"]:
-            data = np.array([1, 100], dtype=dtype)
-            data.setflags(write=writable)
-            s = Series(data)
-            tm.assert_numpy_array_equal(algos.rank(s), exp)
+        data = np.array([1, 100], dtype=dtype)
+        data.setflags(write=writable)
+        ser = Series(data)
+        result = algos.rank(ser)
+        tm.assert_numpy_array_equal(result, exp)
 
     def test_uint64_overflow(self):
         exp = np.array([1, 2], dtype=np.float64)
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index e94cb23b359d0..6ac85f9d36fdc 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -242,7 +242,7 @@ def testit():
     def test_bool_ops_raise_on_arithmetic(self, op_str, opname):
         df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5})
 
-        msg = f"operator {repr(op_str)} not implemented for bool dtypes"
+        msg = f"operator '{opname}' not implemented for bool dtypes"
         f = getattr(operator, opname)
         err_msg = re.escape(msg)
 
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 2fa3acf939c5b..a49b7c2b7f86e 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -67,7 +67,6 @@ def test_int64_overflow(self):
             assert left[k] == v
         assert len(left) == len(right)
 
-    @pytest.mark.arm_slow
     def test_int64_overflow_moar(self):
         # GH9096
diff --git a/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py b/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py
index 3cd4b115c90c7..34d5f686eb853 100644
--- a/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py
+++ b/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py
@@ -152,6 +152,7 @@ def test_center_reindex_series(series, roll_func):
     tm.assert_series_equal(series_xp, series_rs)
 
 
+@pytest.mark.slow
 @pytest.mark.parametrize("roll_func", ["kurt", "skew"])
 def test_center_reindex_frame(frame, roll_func):
     # shifter index
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
index 06b34201e0dba..644081fee0d8f 100644
--- a/pandas/tests/window/test_numba.py
+++ b/pandas/tests/window/test_numba.py
@@ -123,26 +123,64 @@ def func_2(x):
 
 
 @td.skip_if_no("numba", "0.46.0")
-class TestGroupbyEWMMean:
-    def test_invalid_engine(self):
+class TestEWMMean:
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_invalid_engine(self, grouper):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="engine must be either"):
-            df.groupby("A").ewm(com=1.0).mean(engine="foo")
+            grouper(df).ewm(com=1.0).mean(engine="foo")
 
-    def test_invalid_engine_kwargs(self):
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_invalid_engine_kwargs(self, grouper):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="cython engine does not"):
-            df.groupby("A").ewm(com=1.0).mean(
+            grouper(df).ewm(com=1.0).mean(
                 engine="cython", engine_kwargs={"nopython": True}
             )
 
-    def test_cython_vs_numba(self, nogil, parallel, nopython, ignore_na, adjust):
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_cython_vs_numba(
+        self, grouper, nogil, parallel, nopython, ignore_na, adjust
+    ):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
-        gb_ewm = df.groupby("A").ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
+        ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
-        expected = gb_ewm.mean(engine="cython")
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
+        # GH 40951
+        halflife = "23 days"
+        times = to_datetime(
+            [
+                "2020-01-01",
+                "2020-01-01",
+                "2020-01-02",
+                "2020-01-10",
+                "2020-02-23",
+                "2020-01-03",
+            ]
+        )
+        df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]})
+        ewm = grouper(df).ewm(
+            halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
+        )
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
 
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 9abae632e5da3..28465e3a979a7 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -1357,3 +1357,57 @@ def test_rolling_std_small_values():
     result = s.rolling(2).std()
     expected = Series([np.nan, 7.071068e-9, 7.071068e-9])
     tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
+
+
+@pytest.mark.parametrize(
+    "start, exp_values",
+    [
+        (1, [0.03, 0.0155, 0.0155, 0.011, 0.01025]),
+        (2, [0.001, 0.001, 0.0015, 0.00366666]),
+    ],
+)
+def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values):
+    # GH#41053
+    df = DataFrame(
+        [
+            0.03,
+            0.03,
+            0.001,
+            np.NaN,
+            0.002,
+            0.008,
+            np.NaN,
+            np.NaN,
+            np.NaN,
+            np.NaN,
+            np.NaN,
+            np.NaN,
+            0.005,
+            0.2,
+        ]
+    )
+
+    values = exp_values + [
+        0.00366666,
+        0.005,
+        0.005,
+        0.008,
+        np.NaN,
+        np.NaN,
+        0.005,
+        0.102500,
+    ]
+    expected = DataFrame(
+        values,
+        index=list(range(start, len(values) + start)),
+    )
+    result = df.iloc[start:].rolling(5, min_periods=0).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_sum_all_nan_window_floating_artifacts():
+    # GH#41053
+    df = DataFrame([0.002, 0.008, 0.005, np.NaN, np.NaN, np.NaN])
+    result = df.rolling(3, min_periods=0).sum()
+    expected = DataFrame([0.002, 0.010, 0.015, 0.013, 0.005, 0.0])
+    tm.assert_frame_equal(result, expected)
diff --git a/pyproject.toml b/pyproject.toml
index 3ffda4e2149c0..01d28777eb47e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,3 +32,30 @@ exclude = '''
   | setup.py
 )
 '''
+
+[tool.pytest.ini_options]
+# sync minversion with pyproject.toml & install.rst
+minversion = "6.0"
+addopts = "--strict-data-files --strict-markers --capture=no --durations=30 --junitxml=test-data.xml"
+xfail_strict = true
+testpaths = "pandas"
+doctest_optionflags = [
+  "NORMALIZE_WHITESPACE",
+  "IGNORE_EXCEPTION_DETAIL",
+  "ELLIPSIS",
+]
+filterwarnings = [
+  "error:Sparse:FutureWarning",
+  "error:The SparseArray:FutureWarning",
+]
+junit_family = "xunit2"
+markers = [
+  "single: mark a test as single cpu only",
+  "slow: mark a test as slow",
+  "network: mark a test as network",
+  "db: tests requiring a database (mysql or postgres)",
+  "high_memory: mark a test as a high-memory only",
+  "clipboard: mark a pd.read_clipboard test",
+  "arm_slow: mark a test as slow for arm64 architecture",
+  "arraymanager: mark a test to run with ArrayManager enabled",
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index ea7ca43742934..33deeef9f1f82 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -8,7 +8,7 @@ asv
 cython>=0.29.21
 black==20.8b1
 cpplint
-flake8==3.9.1
+flake8==3.9.0
 flake8-bugbear==21.3.2
 flake8-comprehensions==3.1.0
 isort>=5.2.1
diff --git a/test_fast.bat b/test_fast.bat
index 34c61fea08ab4..642e0549f3228 100644
--- a/test_fast.bat
+++ b/test_fast.bat
@@ -1,3 +1,3 @@
 :: test on windows
 set PYTHONHASHSEED=314159265
-pytest --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sXX --strict-markers pandas
+pytest --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sXX pandas
diff --git a/test_fast.sh b/test_fast.sh
index 6444b81b3c6da..9d446964cf501 100755
--- a/test_fast.sh
+++ b/test_fast.sh
@@ -5,4 +5,4 @@
 # https://github.com/pytest-dev/pytest/issues/1075
 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
 
-pytest pandas --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sxX --strict-markers "$@"
+pytest pandas --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sxX "$@"
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 9a178d26659c3..9da7d3bbe8ab6 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -86,6 +86,7 @@ maintainers:
   - dsaxton
   - MarcoGorelli
   - rhshadrach
+  - phofl
 emeritus:
   - Wouter Overmeire
   - Skipper Seabold