diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46de8d466dd11..b391871b18245 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,12 +37,6 @@ jobs: ci/code_checks.sh lint if: always() - - name: Dependencies consistency - run: | - source activate pandas-dev - ci/code_checks.sh dependencies - if: always() - - name: Checks on imported code run: | source activate pandas-dev diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c6d36133f067..315aeb6423254 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,8 +15,7 @@ repos: - id: flake8 name: flake8 (cython template) files: \.pxi\.in$ - types: - - file + types: [text] args: [--append-config=flake8/cython-template.cfg] - repo: https://github.com/PyCQA/isort rev: 5.6.3 @@ -32,16 +31,20 @@ repos: - id: pyupgrade args: [--py37-plus] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.6.0 + rev: v1.7.0 hooks: - id: rst-backticks + - id: rst-directive-colons + types: [text] + - id: rst-inline-touching-normal + types: [text] - repo: local hooks: - id: pip_to_conda name: Generate pip dependency from conda description: This hook checks if the conda environment.yml and requirements-dev.txt are equal language: python - entry: python -m scripts.generate_pip_deps_from_conda + entry: python scripts/generate_pip_deps_from_conda.py files: ^(environment.yml|requirements-dev.txt)$ pass_filenames: false additional_dependencies: [pyyaml] @@ -53,6 +56,61 @@ repos: types: [rst] args: [--filename=*.rst] additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] + - id: non-standard-imports + name: Check for non-standard imports + language: pygrep + entry: | + (?x) + # Check for imports from pandas.core.common instead of `import pandas.core.common as com` + from\ pandas\.core\.common\ import| + from\ pandas\.core\ import\ common| + + # Check for imports from collections.abc instead of `from collections import abc` + from\ collections\.abc\ import| + + from\ numpy\ import\ nan + types: [python] + - id: non-standard-imports-in-tests + name: Check for non-standard imports in test suite + language: pygrep + entry: | + (?x) + # Check for imports from pandas._testing instead of `import pandas._testing as tm` + from\ pandas\._testing\ import| + from\ pandas\ import\ _testing\ as\ tm| + + # No direct imports from conftest + conftest\ import| + import\ conftest + types: [python] + files: ^pandas/tests/ + - id: incorrect-code-directives + name: Check for incorrect code block or IPython directives + language: pygrep + entry: (\.\. code-block ::|\.\. 
ipython ::) + files: \.(py|pyx|rst)$ + - id: unwanted-patterns-strings-to-concatenate + name: Check for use of not concatenated strings + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" + files: \.(py|pyx|pxd|pxi)$ + - id: unwanted-patterns-strings-with-wrong-placed-whitespace + name: Check for strings with wrong placed spaces + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" + files: \.(py|pyx|pxd|pxi)$ + - id: unwanted-patterns-private-import-across-module + name: Check for import of private attributes across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" + types: [python] + exclude: ^(asv_bench|pandas/_vendored|pandas/tests|doc)/ + - id: unwanted-patterns-private-function-across-module + name: Check for use of private functions across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" + types: [python] + exclude: ^(asv_bench|pandas/_vendored|pandas/tests|doc)/ - repo: https://github.com/asottile/yesqa rev: v1.2.2 hooks: @@ -61,4 +119,6 @@ repos: rev: v3.2.0 hooks: - id: end-of-file-fixer - exclude: '.html$|^LICENSES/|.csv$|.txt$|.svg$|.py$' + exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$ + - id: trailing-whitespace + exclude: \.(html|svg)$ diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index bda3ab71d1a00..22f002e6cb79a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -358,6 +358,26 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class FillNA: + def setup(self): + N = 100 + self.df = DataFrame( + {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N} + ).set_index("group") + + def time_df_ffill(self): + self.df.groupby("group").fillna(method="ffill") + + def time_df_bfill(self): + self.df.groupby("group").fillna(method="bfill") + + def time_srs_ffill(self): + self.df.groupby("group")["value"].fillna(method="ffill") + + def time_srs_bfill(self): + self.df.groupby("group")["value"].fillna(method="bfill") + + class GroupByMethods: param_names = ["dtype", "method", "application"] diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 4ca9a82ae4827..656fe2197bc8a 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -24,5 +24,11 @@ def time_read_pickle(self): def time_write_pickle(self): self.df.to_pickle(self.fname) + def peakmem_read_pickle(self): + read_pickle(self.fname) + + def peakmem_write_pickle(self): + self.df.to_pickle(self.fname) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d8b35abb94b9d..7c75ad031e7cd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import DataFrame, Series +from pandas import Categorical, DataFrame, Series from .pandas_vb_common import tm @@ -16,6 +16,10 @@ def setup(self, dtype): self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() + # GH37371. 
Testing construction of string series/frames from ExtensionArrays + self.series_cat_arr = Categorical(self.series_arr) + self.frame_cat_arr = Categorical(self.frame_arr) + def time_series_construction(self, dtype): Series(self.series_arr, dtype=dtype) @@ -28,6 +32,18 @@ def time_frame_construction(self, dtype): def peakmem_frame_construction(self, dtype): DataFrame(self.frame_arr, dtype=dtype) + def time_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def peakmem_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def time_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + + def peakmem_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + class Methods: def setup(self): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 27c904dda5b45..4ed542b3a28e3 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -3,7 +3,14 @@ import dateutil import numpy as np -from pandas import DataFrame, Series, date_range, period_range, to_datetime +from pandas import ( + DataFrame, + Series, + date_range, + period_range, + timedelta_range, + to_datetime, +) from pandas.tseries.frequencies import infer_freq @@ -121,12 +128,15 @@ def time_convert(self): class Iteration: - params = [date_range, period_range] + params = [date_range, period_range, timedelta_range] param_names = ["time_index"] def setup(self, time_index): N = 10 ** 6 - self.idx = time_index(start="20140101", freq="T", periods=N) + if time_index is timedelta_range: + self.idx = time_index(start=0, freq="T", periods=N) + else: + self.idx = time_index(start="20140101", freq="T", periods=N) self.exit = 10000 def time_iter(self, time_index): diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 113ad3e338952..b1091ea7f60e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,3 +26,28 @@ jobs: parameters: name: Windows vmImage: vs2017-win2016 + +- job: py37_32bit + pool: + vmImage: ubuntu-18.04 + + steps: + - script: | + docker pull quay.io/pypa/manylinux2014_i686 + docker run -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ + /bin/bash -xc "cd pandas && \ + /opt/python/cp37-cp37m/bin/python -m venv ~/virtualenvs/pandas-dev && \ + . ~/virtualenvs/pandas-dev/bin/activate && \ + python -m pip install --no-deps -U pip wheel setuptools && \ + pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ + python setup.py build_ext -q -i -j2 && \ + python -m pip install --no-build-isolation -e . 
&& \ + pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" + displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testResultsFiles: '**/test-*.xml' + failTaskOnFailedTests: true + testRunTitle: 'Publish test results for Python 3.7-32 bit full Linux' diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 784d2c9a411ab..a9cce9357b531 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -15,11 +15,10 @@ # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors -# $ ./ci/code_checks.sh dependencies # check that dependencies are consistent # $ ./ci/code_checks.sh typing # run static type analysis -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "dependencies" || "$1" == "typing" ]] || \ - { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|dependencies|typing]"; exit 9999; } +[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ + { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|typing]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -38,6 +37,12 @@ function invgrep { return $((! $EXIT_STATUS)) } +function check_namespace { + local -r CLASS="${1}" + grep -R -l --include "*.py" " ${CLASS}(" pandas/tests | xargs grep -n "pd\.${CLASS}(" + test $? -gt 0 +} + if [[ "$GITHUB_ACTIONS" == "true" ]]; then FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s" INVGREP_PREPEND="##[error]" @@ -48,31 +53,6 @@ fi ### LINTING ### if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - echo "black --version" - black --version - - MSG='Checking black formatting' ; echo $MSG - black . --check - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # `setup.cfg` contains the list of error codes that are being ignored in flake8 - - echo "flake8 --version" - flake8 --version - - # pandas/_libs/src is C code, so no need to search there. - MSG='Linting .py code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" . - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pyx and .pxd code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas --append-config=flake8/cython.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pxi.in' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas/_libs --append-config=flake8/cython-template.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - # Check that cython casting is of the form `obj` as opposed to ` obj`; # it doesn't make a difference, but we want to be internally consistent. # Note: this grep pattern is (intended to be) equivalent to the python @@ -93,81 +73,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of not concatenated strings' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" --format="##[error]{source_path}:{line_number}:{msg}" . 
- else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" . - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for strings with wrong placed spaces' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" --format="##[error]{source_path}:{line_number}:{msg}" . - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" . - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for import of private attributes across modules' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/ - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/ - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for use of private functions across modules' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ --format="##[error]{source_path}:{line_number}:{msg}" pandas/ - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ pandas/ - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - echo "isort --version-number" - isort --version-number - - # Imports - Check formatting using isort see setup.cfg for settings - MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts web" - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) - else - eval $ISORT_CMD - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - fi ### PATTERNS ### if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then - # Check for imports from pandas.core.common instead of `import pandas.core.common as com` - # Check for imports from collections.abc instead of `from collections import abc` - MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas.core import common" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from collections.abc import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from numpy import nan" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Checks for test suite - # Check for imports from pandas._testing instead of `import pandas._testing as tm` - invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas import _testing as tm" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # No direct imports from conftest - invgrep -R --include="*.py*" -E "conftest import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - 
invgrep -R --include="*.py*" -E "import conftest" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of exec' ; echo $MSG invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -180,14 +90,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -r -E --include '*.py' "[[:space:]] pytest.raises" pandas/tests/ RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for python2-style file encodings' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "# -\*- coding: utf-8 -\*-" pandas scripts - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for python2-style super usage' ; echo $MSG - invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of builtin filter function' ; echo $MSG invgrep -R --include="*.py" -P '(? "Tools for Visual Studio 2019". In the installer, select the "C++ build tools" workload. -**Mac OS** +**macOS** Information about compiler installation can be found here: https://devguide.python.org/setup/#macos @@ -299,7 +299,7 @@ Creating a Python environment (pip) If you aren't using conda for your development environment, follow these instructions. You'll need to have at least Python 3.6.1 installed on your system. -**Unix**/**Mac OS with virtualenv** +**Unix**/**macOS with virtualenv** .. code-block:: bash @@ -318,7 +318,7 @@ You'll need to have at least Python 3.6.1 installed on your system. python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 -**Unix**/**Mac OS with pyenv** +**Unix**/**macOS with pyenv** Consult the docs for setting up pyenv `here `__. @@ -598,7 +598,7 @@ Building master branch documentation When pull requests are merged into the pandas ``master`` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted `here -`__, see also +`__, see also the :ref:`Continuous Integration ` section. .. _contributing.code: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 25ca77627ef39..670905f6587bc 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -201,7 +201,7 @@ Jupyter notebooks can be converted to a number of open standard output formats Python) through 'Download As' in the web interface and ``jupyter convert`` in a shell. -pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods +pandas DataFrames implement ``_repr_html_`` and ``_repr_latex`` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be @@ -230,7 +230,7 @@ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. pandas objects can also be renamed, duplicated, new columns added, -copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. +copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. @@ -376,6 +376,23 @@ Dask-ML enables parallel and distributed machine learning using Dask alongside e Koalas provides a familiar pandas DataFrame interface on top of Apache Spark. 
It enables users to leverage multi-cores on one machine or a cluster of machines to speed up or scale their DataFrame code. +`Modin `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement +for pandas. This means that you can use Modin with existing pandas code or write +new code with the existing pandas API. Modin can leverage your entire machine or +cluster to speed up and scale your pandas workloads, including traditionally +time-consuming tasks like ingesting data (``read_csv``, ``read_excel``, +``read_parquet``, etc.). + +.. code:: python + + # import pandas as pd + import modin.pandas as pd + + df = pd.read_csv("big.csv") # use all your cores! + `Odo `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -400,16 +417,6 @@ If also displays progress bars. # df.apply(func) df.parallel_apply(func) -`Ray `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -pandas on Ray is an early stage DataFrame library that wraps pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous pandas notebooks while experiencing a considerable speedup from pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use pandas on Ray just like you would pandas. - -.. code:: python - - # import pandas as pd - import ray.dataframe as pd - `Vaex `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index eb7ee000a9a86..6f6eeada0cfed 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -533,7 +533,7 @@ pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. pandas provides a wide range of functions to cleaning textual data and extract useful information from it. +Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 70d145c54e919..df481e8c986f7 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -28,20 +28,20 @@ Installing pandas Installing with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~ -Installing pandas and the rest of the `NumPy `__ and -`SciPy `__ stack can be a little +Installing pandas and the rest of the `NumPy `__ and +`SciPy `__ stack can be a little difficult for inexperienced users. The simplest way to install not only pandas, but Python and the most popular -packages that make up the `SciPy `__ stack -(`IPython `__, `NumPy `__, +packages that make up the `SciPy `__ stack +(`IPython `__, `NumPy `__, `Matplotlib `__, ...) is with `Anaconda `__, a cross-platform -(Linux, Mac OS X, Windows) Python distribution for data analytics and +(Linux, macOS, Windows) Python distribution for data analytics and scientific computing. After running the installer, the user will have access to pandas and the -rest of the `SciPy `__ stack without needing to install +rest of the `SciPy `__ stack without needing to install anything else, and without needing to wait for any software to be compiled. Installation instructions for `Anaconda `__ @@ -220,7 +220,7 @@ Dependencies Package Minimum supported version ================================================================ ========================== `setuptools `__ 24.2.0 -`NumPy `__ 1.16.5 +`NumPy `__ 1.16.5 `python-dateutil `__ 2.7.3 `pytz `__ 2017.3 ================================================================ ========================== @@ -262,7 +262,7 @@ BeautifulSoup4 4.6.0 HTML parser for read_html (see :ref Jinja2 2.10 Conditional formatting with DataFrame.style PyQt4 Clipboard I/O PyQt5 Clipboard I/O -PyTables 3.4.4 HDF5-based reading / writing +PyTables 3.5.1 HDF5-based reading / writing SQLAlchemy 1.2.8 SQL support for databases other than sqlite SciPy 1.12.0 Miscellaneous statistical functions xlsxwriter 1.0.2 Excel writing @@ -280,7 +280,6 @@ psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading -pytables 3.5.1 HDF5 reading / writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index b7fb99a98d78f..b8db7de5b7b10 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -66,15 +66,15 @@ How to manipulate textual data?
  • -Make all name characters lowercase +Make all name characters lowercase. .. ipython:: python titanic["Name"].str.lower() To make each of the strings in the ``Name`` column lowercase, select the ``Name`` column -(see :ref:`tutorial on selection of data <10min_tut_03_subset>`), add the ``str`` accessor and -apply the ``lower`` method. As such, each of the strings is converted element wise. +(see the :ref:`tutorial on selection of data <10min_tut_03_subset>`), add the ``str`` accessor and +apply the ``lower`` method. As such, each of the strings is converted element-wise. .. raw:: html @@ -86,7 +86,7 @@ having a ``dt`` accessor, a number of specialized string methods are available when using the ``str`` accessor. These methods have in general matching names with the equivalent built-in string methods for single elements, but are applied -element-wise (remember :ref:`element wise calculations <10min_tut_05_columns>`?) +element-wise (remember :ref:`element-wise calculations <10min_tut_05_columns>`?) on each of the values of the columns. .. raw:: html @@ -94,7 +94,7 @@ on each of the values of the columns.
    • -Create a new column ``Surname`` that contains the surname of the Passengers by extracting the part before the comma. +Create a new column ``Surname`` that contains the surname of the passengers by extracting the part before the comma. .. ipython:: python @@ -135,7 +135,7 @@ More information on extracting parts of strings is available in the user guide s
      • -Extract the passenger data about the Countesses on board of the Titanic. +Extract the passenger data about the countesses on board of the Titanic. .. ipython:: python @@ -145,15 +145,15 @@ Extract the passenger data about the Countesses on board of the Titanic. titanic[titanic["Name"].str.contains("Countess")] -(*Interested in her story? See *\ `Wikipedia `__\ *!*) +(*Interested in her story? See* `Wikipedia `__\ *!*) The string method :meth:`Series.str.contains` checks for each of the values in the column ``Name`` if the string contains the word ``Countess`` and returns -for each of the values ``True`` (``Countess`` is part of the name) of +for each of the values ``True`` (``Countess`` is part of the name) or ``False`` (``Countess`` is not part of the name). This output can be used to subselect the data using conditional (boolean) indexing introduced in the :ref:`subsetting of data tutorial <10min_tut_03_subset>`. As there was -only one Countess on the Titanic, we get one row as a result. +only one countess on the Titanic, we get one row as a result. .. raw:: html @@ -220,7 +220,7 @@ we can do a selection using the ``loc`` operator, introduced in the
        • -In the "Sex" column, replace values of "male" by "M" and values of "female" by "F" +In the "Sex" column, replace values of "male" by "M" and values of "female" by "F". .. ipython:: python @@ -256,7 +256,7 @@ a ``dictionary`` to define the mapping ``{from : to}``.

          REMEMBER

          - String methods are available using the ``str`` accessor. -- String methods work element wise and can be used for conditional +- String methods work element-wise and can be used for conditional indexing. - The ``replace`` method is a convenient method to convert values according to a given dictionary. diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 4aba8f709fba0..c6deb4b7ea383 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -17,7 +17,7 @@ pandas documentation `Source Repository `__ | `Issues & Ideas `__ | `Q&A Support `__ | -`Mailing List `__ +`Mailing List `__ :mod:`pandas` is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the `Python `__ diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 5c068d8404cd6..43e2509469488 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -63,7 +63,9 @@ Properties Timestamp.asm8 Timestamp.day Timestamp.dayofweek + Timestamp.day_of_week Timestamp.dayofyear + Timestamp.day_of_year Timestamp.days_in_month Timestamp.daysinmonth Timestamp.fold @@ -233,7 +235,9 @@ Properties Period.day Period.dayofweek + Period.day_of_week Period.dayofyear + Period.day_of_year Period.days_in_month Period.daysinmonth Period.end_time diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index ba12c19763605..d3f9413dae565 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -345,9 +345,11 @@ Time/date components DatetimeIndex.time DatetimeIndex.timetz DatetimeIndex.dayofyear + DatetimeIndex.day_of_year DatetimeIndex.weekofyear DatetimeIndex.week DatetimeIndex.dayofweek + DatetimeIndex.day_of_week DatetimeIndex.weekday DatetimeIndex.quarter DatetimeIndex.tz @@ -461,7 +463,9 @@ Properties PeriodIndex.day PeriodIndex.dayofweek + PeriodIndex.day_of_week PeriodIndex.dayofyear + PeriodIndex.day_of_year PeriodIndex.days_in_month PeriodIndex.daysinmonth PeriodIndex.end_time diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index f1069e46b56cc..8d74c288bf801 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -22,10 +22,6 @@ Attributes :toctree: api/ Series.index - -.. autosummary:: - :toctree: api/ - Series.array Series.values Series.dtype @@ -323,8 +319,10 @@ Datetime properties Series.dt.week Series.dt.weekofyear Series.dt.dayofweek + Series.dt.day_of_week Series.dt.weekday Series.dt.dayofyear + Series.dt.day_of_year Series.dt.quarter Series.dt.is_month_start Series.dt.is_month_end diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index b24020848b363..b9f0683697ba6 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -652,9 +652,9 @@ parameter: :header: "``closed``", "Description", "Default for" :widths: 20, 30, 30 - ``right``, close right endpoint, time-based windows + ``right``, close right endpoint, ``left``, close left endpoint, - ``both``, close both endpoints, fixed windows + ``both``, close both endpoints, ``neither``, open endpoints, For example, having the right endpoint open is useful in many problems that require that there is no contamination @@ -681,9 +681,6 @@ from present information back to past information. This allows the rolling windo df -Currently, this feature is only implemented for time-based windows. 
-For fixed windows, the closed parameter cannot be set and the rolling window will always have both endpoints closed. - .. _stats.iter_rolling_window: Iteration over window: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0b24ff61d87b8..1c271e74aafba 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - ;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` @@ -5686,7 +5686,7 @@ ignored. dtypes: float64(1), int64(1) memory usage: 15.3 MB -Given the next test set: +The following test functions will be used below to compare the performance of several IO methods: .. code-block:: python @@ -5791,7 +5791,7 @@ Given the next test set: def test_parquet_read(): pd.read_parquet("test.parquet") -When writing, the top-three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. +When writing, the top three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. .. code-block:: ipython @@ -5825,7 +5825,7 @@ When writing, the top-three functions in terms of speed are ``test_feather_write In [13]: %timeit test_parquet_write(df) 67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) -When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and +When reading, the top three functions in terms of speed are ``test_feather_read``, ``test_pickle_read`` and ``test_hdf_fixed_read``. @@ -5862,8 +5862,7 @@ When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and 24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) -For this test case ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk. -Space on disk (in bytes) +The files ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk (in bytes). .. code-block:: none diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 9dd4fb68ae26a..9b1c9b8d04270 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -302,10 +302,10 @@ positional argument (a regex object) and return a string. 
return m.group(0)[::-1] - pd.Series( - ["foo 123", "bar baz", np.nan], - dtype="string" - ).str.replace(pat, repl, regex=True) + pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace( + pat, repl, regex=True + ) + # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 9fbd02df50d10..af5e172658386 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -803,9 +803,11 @@ There are several time/date properties that one can access from ``Timestamp`` or time,"Returns datetime.time (does not contain timezone information)" timetz,"Returns datetime.time as local time with timezone information" dayofyear,"The ordinal day of year" + day_of_year,"The ordinal day of year" weekofyear,"The week ordinal of the year" week,"The week ordinal of the year" dayofweek,"The number of the day of the week with Monday=0, Sunday=6" + day_of_week,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." days_in_month,"The number of days in the month of the datetime" diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 96d9fc6077325..c4ee8677a6b0d 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -64,7 +64,7 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column plt.figure(); @savefig frame_plot_basic.png - df.plot() + df.plot(); You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: @@ -119,7 +119,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind="bar") + df.iloc[5].plot(kind="bar"); You can also create these other plots using the methods ``DataFrame.plot.`` instead of providing the ``kind`` keyword argument. This makes it easier to discover plot methods and the specific arguments they use: @@ -180,7 +180,7 @@ bar plot: df2 = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig bar_plot_multi_ex.png - df2.plot.bar() + df2.plot.bar(); To produce a stacked bar plot, pass ``stacked=True``: @@ -193,7 +193,7 @@ To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python @savefig bar_plot_stacked_ex.png - df2.plot.bar(stacked=True) + df2.plot.bar(stacked=True); To get horizontal bar plots, use the ``barh`` method: @@ -206,7 +206,7 @@ To get horizontal bar plots, use the ``barh`` method: .. ipython:: python @savefig barh_plot_stacked_ex.png - df2.plot.barh(stacked=True) + df2.plot.barh(stacked=True); .. _visualization.hist: @@ -414,7 +414,7 @@ groupings. For instance, df = pd.DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) - plt.figure() + plt.figure(); @savefig box_plot_ex2.png bp = df.boxplot(by="X") @@ -518,7 +518,7 @@ When input data contains ``NaN``, it will be automatically filled by 0. If you w df = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig area_plot_stacked.png - df.plot.area() + df.plot.area(); To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 unless otherwise specified: @@ -531,7 +531,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 .. 
ipython:: python @savefig area_plot_unstacked.png - df.plot.area(stacked=False) + df.plot.area(stacked=False); .. _visualization.scatter: @@ -554,7 +554,7 @@ These can be specified by the ``x`` and ``y`` keywords. df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"]) @savefig scatter_plot.png - df.plot.scatter(x="a", y="b") + df.plot.scatter(x="a", y="b"); To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. @@ -563,7 +563,7 @@ It is recommended to specify ``color`` and ``label`` keywords to distinguish eac ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png - df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax) + df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax); .. ipython:: python :suppress: @@ -576,7 +576,7 @@ each point: .. ipython:: python @savefig scatter_plot_colored.png - df.plot.scatter(x="a", y="b", c="c", s=50) + df.plot.scatter(x="a", y="b", c="c", s=50); .. ipython:: python @@ -591,7 +591,7 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x="a", y="b", s=df["c"] * 200) + df.plot.scatter(x="a", y="b", s=df["c"] * 200); .. ipython:: python :suppress: @@ -837,7 +837,7 @@ You can create a scatter plot matrix using the df = pd.DataFrame(np.random.randn(1000, 4), columns=["a", "b", "c", "d"]) @savefig scatter_matrix_kde.png - scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal="kde") + scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal="kde"); .. ipython:: python :suppress: @@ -1086,7 +1086,7 @@ layout and formatting of the returned plot: plt.figure(); @savefig series_plot_basic2.png - ts.plot(style="k--", label="Series") + ts.plot(style="k--", label="Series"); .. ipython:: python :suppress: @@ -1144,7 +1144,7 @@ it empty for ylabel. df.plot(); @savefig plot_xlabel_ylabel.png - df.plot(xlabel="new x", ylabel="new y") + df.plot(xlabel="new x", ylabel="new y"); .. ipython:: python :suppress: @@ -1320,7 +1320,7 @@ with the ``subplots`` keyword: .. ipython:: python @savefig frame_plot_subplots.png - df.plot(subplots=True, figsize=(6, 6)) + df.plot(subplots=True, figsize=(6, 6)); .. ipython:: python :suppress: @@ -1343,7 +1343,7 @@ or columns needed, given the other. .. ipython:: python @savefig frame_plot_subplots_layout.png - df.plot(subplots=True, layout=(2, 3), figsize=(6, 6), sharex=False) + df.plot(subplots=True, layout=(2, 3), figsize=(6, 6), sharex=False); .. ipython:: python :suppress: @@ -1354,7 +1354,7 @@ The above example is identical to using: .. ipython:: python - df.plot(subplots=True, layout=(2, -1), figsize=(6, 6), sharex=False) + df.plot(subplots=True, layout=(2, -1), figsize=(6, 6), sharex=False); .. ipython:: python :suppress: @@ -1379,9 +1379,9 @@ otherwise you will see a warning. target1 = [axes[0][0], axes[1][1], axes[2][2], axes[3][3]] target2 = [axes[3][0], axes[2][1], axes[1][2], axes[0][3]] - df.plot(subplots=True, ax=target1, legend=False, sharex=False, sharey=False) + df.plot(subplots=True, ax=target1, legend=False, sharex=False, sharey=False); @savefig frame_plot_subplots_multi_ax.png - (-df).plot(subplots=True, ax=target2, legend=False, sharex=False, sharey=False) + (-df).plot(subplots=True, ax=target2, legend=False, sharex=False, sharey=False); .. 
ipython:: python :suppress: @@ -1409,15 +1409,15 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a fig, axes = plt.subplots(nrows=2, ncols=2) plt.subplots_adjust(wspace=0.2, hspace=0.5) - df["A"].plot(ax=axes[0, 0]) - axes[0, 0].set_title("A") - df["B"].plot(ax=axes[0, 1]) - axes[0, 1].set_title("B") - df["C"].plot(ax=axes[1, 0]) - axes[1, 0].set_title("C") - df["D"].plot(ax=axes[1, 1]) + df["A"].plot(ax=axes[0, 0]); + axes[0, 0].set_title("A"); + df["B"].plot(ax=axes[0, 1]); + axes[0, 1].set_title("B"); + df["C"].plot(ax=axes[1, 0]); + axes[1, 0].set_title("C"); + df["D"].plot(ax=axes[1, 1]); @savefig series_plot_multi.png - axes[1, 1].set_title("D") + axes[1, 1].set_title("D"); .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index bb2aa166419b4..194bb61f2c1c8 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -147,7 +147,7 @@ Bug fixes - Bug in ``setitem`` where type promotion is applied to the entire block (:issue:`10280`) - Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`) - Bug in ``GroupBy.get_group`` when grouping on multiple keys, one of which is categorical. (:issue:`10132`) -- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) +- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetic ( :issue:`9926`) - Bug in ``DataFrame`` construction from nested ``dict`` with ``datetime64`` (:issue:`10160`) - Bug in ``Series`` construction from ``dict`` with ``datetime64`` keys (:issue:`9456`) - Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 1918a1e8caf6c..dd859dabc9c64 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0241: -Whats new in 0.24.1 (February 3, 2019) --------------------------------------- +What's new in 0.24.1 (February 3, 2019) +--------------------------------------- .. warning:: diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 27e84bf0a7cd7..36684d465373c 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0242: -Whats new in 0.24.2 (March 12, 2019) ------------------------------------- +What's new in 0.24.2 (March 12, 2019) +------------------------------------- .. warning:: diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index aa2c77da4ee6f..6cb728800dc68 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -19,7 +19,16 @@ Fixed regressions - Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`). 
 - Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`)
 - Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`)
+- Fixed regression in :meth:`Series.rank` method failing for read-only data (:issue:`37290`)
 - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
+- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` which raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
+- Fixed regression in ``DataFrame.groupby(..).std()`` with nullable integer dtype (:issue:`37415`)
+- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
+- Fixed regression where slicing :class:`DatetimeIndex` raised :exc:`AssertionError` on irregular time series with ``pd.NaT`` or on unsorted indices (:issue:`36953` and :issue:`35509`)
+- Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
+- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
+- Fixed regression in setitem with :meth:`DataFrame.iloc` which raised an error when trying to set a value while filtering with a boolean list (:issue:`36741`)
+- Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`)

 .. ---------------------------------------------------------------------------

@@ -27,7 +36,11 @@ Fixed regressions
 Bug fixes
 ~~~~~~~~~

--
+- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`)
+- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`)
+- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`)
+- Bug in :meth:`DataFrame.info` raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`)
+- Bug in :meth:`DataFrameGroupBy.apply` would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`)

 .. ---------------------------------------------------------------------------

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index f77a55b95d567..812af544ed9d8 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -96,6 +96,32 @@ For example:

   buffer = io.BytesIO()
   data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")

+Support for short caption and table position in ``to_latex``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.to_latex` now allows one to specify
+a floating table position (:issue:`35281`)
+and a short caption (:issue:`36267`).
+
+New keyword ``position`` is implemented to set the position.
+
+.. ipython:: python
+
+   data = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+   table = data.to_latex(position='ht')
+   print(table)
+
+Usage of keyword ``caption`` is extended.
+Besides taking a single string as an argument,
+one can optionally provide a tuple of ``(full_caption, short_caption)``
+to add a short caption macro.
+
+.. ipython:: python
+
+   data = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+   table = data.to_latex(caption=('the full long caption', 'short caption'))
+   print(table)
+
 .. _whatsnew_120.read_csv_table_precision_default:

 Change in default floating precision for ``read_csv`` and ``read_table``
@@ -154,7 +180,7 @@ Alternatively, you can also use the dtype object:

 .. warning::

     Experimental: the new floating data types are currently experimental, and its
-    behaviour or API may still change without warning. Expecially the behaviour
+    behaviour or API may still change without warning. Especially the behaviour
     regarding NaN (distinct from NA missing values) is subject to change.

 .. _whatsnew_120.index_name_preservation:
@@ -181,6 +207,8 @@ level-by-level basis.
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`)
+- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`)
 - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`)
 - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
 - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
@@ -193,6 +221,10 @@ Other enhancements
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
 - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
+- :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
+- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`)
+- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)
+- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)

 .. _whatsnew_120.api_breaking.python:

@@ -304,7 +336,7 @@ Deprecations

 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

-- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
+- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
@@ -315,6 +347,8 @@ Performance improvements
   avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
 - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
 - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
+- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`34244`)
+- Faster ``dir`` calls when there are many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)

 .. ---------------------------------------------------------------------------

@@ -339,15 +373,16 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`)
 - Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`)
 - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`)
+- Bug in :meth:`DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`)
 - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`)
 - :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`)
 - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`)
+- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`)

 Timedelta
 ^^^^^^^^^
 - Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
--
--
+- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`)

 Timezones
 ^^^^^^^^^
@@ -366,9 +401,13 @@ Numeric
 - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`)
 - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`)
 - Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
+- Bug in :class:`MultiIndex` comparison with tuple incorrectly treating tuple as array-like (:issue:`21517`)
 - Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`)
 - Bug in :class:`DataFrame` arithmetic ops incorrectly accepting keyword arguments (:issue:`36843`)
 - Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`)
+- Bug in :class:`DataFrame` allowing arithmetic operations with a list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`)
+- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`)
+- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`)

 Conversion
 ^^^^^^^^^^
@@ -379,9 +418,8 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`)
+- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype :class:`Series` containing only numeric strings and ``NA`` (:issue:`37262`)
 -
--
-

 Interval
 ^^^^^^^^
@@ -398,6 +436,9 @@ Indexing
 - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`)
 - Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`)
 - Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`)
+- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`)
+- Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` with a level named "0" (:issue:`37194`)

 Missing
 ^^^^^^^
@@ -430,6 +471,9 @@ I/O
 - Bug in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)
 - Bug in :meth:`to_json` with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`)
 - Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)
+- Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`)
+- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
+- Bug in :class:`HDFStore` throwing a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes to a fixed HDF5 store (:issue:`20594`)

 Plotting
 ^^^^^^^^
@@ -446,7 +490,6 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.resample(...)` that would throw a ``ValueError`` when resampling from "D" to "24H" over a transition into daylight savings time (DST) (:issue:`35219`)
 - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`)
 - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
-- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`)
 - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. (:issue:`9959`)
 - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
 - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
@@ -457,6 +500,9 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
 - Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
 - Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`).
+- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with a time-aware window containing ``NaN``. Now raises a ``ValueError`` because such windows are not monotonic (:issue:`34617`)
+- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
+- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)

 Reshaping
 ^^^^^^^^^
@@ -466,6 +512,7 @@ Reshaping
 - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
 - Bug in :meth:`DataFrame.agg` with ``func={'name':<FUNC>}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
 - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
+- Bug in :func:`join` returned a non-deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
 -

 Sparse
@@ -480,16 +527,18 @@ ExtensionArray
 - Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`)
 - Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`284881`)
 - Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`)
-
+- Fixed an inconsistency in :class:`PeriodArray`'s ``__init__`` signature with those of :class:`DatetimeArray` and :class:`TimedeltaArray` (:issue:`37289`)

 Other
 ^^^^^
 - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
 - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
-- Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors (:issue:`28283`)
+- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
+- Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors and the :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack` and :class:`DataFrame.pivot` methods (:issue:`28283`)
 - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)
 - Passing an array with 2 or more dimensions to the
:class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) +- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c4723a5f064c7..f0cf485d9584a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -325,7 +325,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): @cython.boundscheck(False) @cython.wraparound(False) -def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray: +def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result @@ -799,7 +799,7 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - rank_t[:] in_arr, + ndarray[rank_t, ndim=1] in_arr, ties_method="average", bint ascending=True, na_option="keep", @@ -1018,7 +1018,7 @@ def rank_1d( def rank_2d( - rank_t[:, :] in_arr, + ndarray[rank_t, ndim=2] in_arr, int axis=0, ties_method="average", bint ascending=True, @@ -1195,6 +1195,7 @@ ctypedef fused diff_t: ctypedef fused out_t: float32_t float64_t + int64_t @cython.boundscheck(False) @@ -1204,11 +1205,13 @@ def diff_2d( ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, + bint datetimelike=False, ): cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous # bint f_contig = arr.is_f_contig() # TODO(cython 3) + diff_t left, right # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 @@ -1218,6 +1221,9 @@ def diff_2d( elif (out_t is float64_t and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): raise NotImplementedError + elif out_t is int64_t and diff_t is not int64_t: + # We only have out_t of int64_t if we have datetimelike + raise NotImplementedError else: # We put this inside an indented else block to avoid cython build # warnings about unreachable code @@ -1231,7 +1237,15 @@ def diff_2d( start, stop = 0, sx + periods for j in range(sy): for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1239,7 +1253,15 @@ def diff_2d( start, stop = 0, sy + periods for j in range(start, stop): for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if axis == 0: if periods >= 0: @@ -1248,7 +1270,15 @@ def diff_2d( start, stop = 0, sx + periods for i in range(start, stop): for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1256,7 +1286,15 @@ def diff_2d( start, stop = 0, sy + periods for i in range(sx): for j in 
range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right # generated from template diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index fcd081f563f92..4a466ada765ca 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -208,7 +208,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): +def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{endif}} """ Return boolean of values in arr on an diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 922dcd7e74aa0..2cb4df7e054fe 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,6 +651,11 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) + if hasattr(arr, "to_numpy"): + arr = arr.to_numpy() + elif not isinstance(arr, np.ndarray): + arr = np.array(arr, dtype="object") + result = np.asarray(arr, dtype="object") if copy and result is arr: @@ -1414,10 +1419,12 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "time" elif is_decimal(val): - return "decimal" + if is_decimal_array(values): + return "decimal" elif is_complex(val): - return "complex" + if is_complex_array(values): + return "complex" elif util.is_float_object(val): if is_float_array(values): @@ -1702,6 +1709,34 @@ cpdef bint is_float_array(ndarray values): return validator.validate(values) +cdef class ComplexValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return ( + util.is_complex_object(value) + or (util.is_float_object(value) and is_nan(value)) + ) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.complexfloating) + + +cdef bint is_complex_array(ndarray values): + cdef: + ComplexValidator validator = ComplexValidator(len(values), values.dtype) + return validator.validate(values) + + +cdef class DecimalValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return is_decimal(value) + + +cdef bint is_decimal_array(ndarray values): + cdef: + DecimalValidator validator = DecimalValidator(len(values), values.dtype) + return validator.validate(values) + + cdef class StringValidator(Validator): cdef inline bint is_value_typed(self, object value) except -1: return isinstance(value, str) @@ -1989,7 +2024,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, elif util.is_bool_object(val): floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True - elif val is None: + elif val is None or val is C_NA: seen.saw_null() floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: @@ -2546,8 +2581,6 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): # kludge, for Series return np.empty(0, dtype='f8') - keys = getattr(keys, 'values', keys) - for i in range(n): val = keys[i] if val in mapping: diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 5343999c369f7..2af10a5b72d33 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ 
b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1134,7 +1134,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } break; - + } } diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 3a628f997e5d6..88ad008b42c21 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -357,10 +357,12 @@ class NaTType(_NaT): week = property(fget=lambda self: np.nan) dayofyear = property(fget=lambda self: np.nan) + day_of_year = property(fget=lambda self: np.nan) weekofyear = property(fget=lambda self: np.nan) days_in_month = property(fget=lambda self: np.nan) daysinmonth = property(fget=lambda self: np.nan) dayofweek = property(fget=lambda self: np.nan) + day_of_week = property(fget=lambda self: np.nan) # inject Timedelta properties days = property(fget=lambda self: np.nan) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 101e86bb37912..98b2ddbd21ee1 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -791,6 +791,11 @@ cdef class Tick(SingleConstructorOffset): def is_anchored(self) -> bool: return False + # This is identical to BaseOffset.__hash__, but has to be redefined here + # for Python 3, because we've redefined __eq__. + def __hash__(self) -> int: + return hash(self._params) + # -------------------------------------------------------------------- # Comparison and Arithmetic Methods diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 27402c8d255b6..b1f9ff71f5faa 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1373,7 +1373,7 @@ cdef accessor _get_accessor_func(str field): return pweek elif field == "day_of_year": return pday_of_year - elif field == "weekday": + elif field == "weekday" or field == "day_of_week": return pweekday elif field == "days_in_month": return pdays_in_month @@ -1475,6 +1475,9 @@ cdef class _Period(PeriodMixin): PeriodDtypeBase _dtype BaseOffset freq + dayofweek = _Period.day_of_week + dayofyear = _Period.day_of_year + def __cinit__(self, int64_t ordinal, BaseOffset freq): self.ordinal = ordinal self.freq = freq @@ -1882,7 +1885,7 @@ cdef class _Period(PeriodMixin): return self.weekofyear @property - def dayofweek(self) -> int: + def day_of_week(self) -> int: """ Day of the week the period lies in, with Monday=0 and Sunday=6. @@ -1900,33 +1903,33 @@ cdef class _Period(PeriodMixin): See Also -------- - Period.dayofweek : Day of the week the period lies in. - Period.weekday : Alias of Period.dayofweek. + Period.day_of_week : Day of the week the period lies in. + Period.weekday : Alias of Period.day_of_week. Period.day : Day of the month. Period.dayofyear : Day of the year. Examples -------- >>> per = pd.Period('2017-12-31 22:00', 'H') - >>> per.dayofweek + >>> per.day_of_week 6 For periods that span over multiple days, the day at the beginning of the period is returned. >>> per = pd.Period('2017-12-31 22:00', '4H') - >>> per.dayofweek + >>> per.day_of_week 6 - >>> per.start_time.dayofweek + >>> per.start_time.day_of_week 6 For periods with a frequency higher than days, the last day of the period is returned. >>> per = pd.Period('2018-01', 'M') - >>> per.dayofweek + >>> per.day_of_week 2 - >>> per.end_time.dayofweek + >>> per.end_time.day_of_week 2 """ base = self._dtype._dtype_code @@ -1986,7 +1989,7 @@ cdef class _Period(PeriodMixin): return self.dayofweek @property - def dayofyear(self) -> int: + def day_of_year(self) -> int: """ Return the day of the year. 
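The ``Period`` accessor rename above keeps the old spellings as aliases; a short sketch, with values taken from the docstrings in this hunk::

    import pandas as pd

    per = pd.Period("2017-12-31 22:00", "H")
    per.day_of_week                                # 6 (Sunday)
    per.dayofweek                                  # 6, retained alias
    pd.Period("2012-12-31", freq="D").day_of_year  # 366 (leap year)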
@@ -2002,19 +2005,19 @@ cdef class _Period(PeriodMixin): See Also -------- Period.day : Return the day of the month. - Period.dayofweek : Return the day of week. - PeriodIndex.dayofyear : Return the day of year of all indexes. + Period.day_of_week : Return the day of week. + PeriodIndex.day_of_year : Return the day of year of all indexes. Examples -------- >>> period = pd.Period("2015-10-23", freq='H') - >>> period.dayofyear + >>> period.day_of_year 296 >>> period = pd.Period("2012-12-31", freq='D') - >>> period.dayofyear + >>> period.day_of_year 366 >>> period = pd.Period("2013-01-01", freq='D') - >>> period.dayofyear + >>> period.day_of_year 1 """ base = self._dtype._dtype_code diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index f647098140528..8eb995dee645b 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -312,7 +312,7 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) * to convert to UTC time. * - * The following implementation just asks for attributes, and thus + * The following implementation just asks for attributes, and thus * supports datetime duck typing. The tzinfo time zone conversion * requires this style of access as well. * diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index ee32ed53a908b..e21526a8f69e4 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -604,7 +604,7 @@ cdef inline int64_t parse_iso_format_string(str ts) except? -1: for c in ts: # number (ascii codes) - if ord(c) >= 48 and ord(c) <= 57: + if 48 <= ord(c) <= 57: have_value = 1 if have_dot: @@ -620,27 +620,28 @@ cdef inline int64_t parse_iso_format_string(str ts) except? -1: if not len(unit): number.append(c) else: - # if in days, pop trailing T - if unit[-1] == 'T': - unit.pop() - elif 'H' in unit or 'M' in unit: - if len(number) > 2: - raise ValueError(err_msg) r = timedelta_from_spec(number, '0', unit) result += timedelta_as_neg(r, neg) neg = 0 unit, number = [], [c] else: - if c == 'P': - pass # ignore leading character + if c == 'P' or c == 'T': + pass # ignore marking characters P and T elif c == '-': if neg or have_value: raise ValueError(err_msg) else: neg = 1 - elif c in ['D', 'T', 'H', 'M']: + elif c in ['W', 'D', 'H', 'M']: unit.append(c) + if c in ['H', 'M'] and len(number) > 2: + raise ValueError(err_msg) + r = timedelta_from_spec(number, '0', unit) + result += timedelta_as_neg(r, neg) + + neg = 0 + unit, number = [], [] elif c == '.': # append any seconds if len(number): @@ -661,11 +662,8 @@ cdef inline int64_t parse_iso_format_string(str ts) except? -1: r = timedelta_from_spec(number, '0', dec_unit) result += timedelta_as_neg(r, neg) else: # seconds - if len(number) <= 2: - r = timedelta_from_spec(number, '0', 'S') - result += timedelta_as_neg(r, neg) - else: - raise ValueError(err_msg) + r = timedelta_from_spec(number, '0', 'S') + result += timedelta_as_neg(r, neg) else: raise ValueError(err_msg) @@ -1058,7 +1056,8 @@ cdef class _Timedelta(timedelta): See Also -------- - Timestamp.isoformat + Timestamp.isoformat : Function is used to convert the given + Timestamp object into the ISO format. 
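With the reworked ISO-8601 duration parser above, week designators and interior ``T`` markers are handled uniformly; a hedged sketch of inputs this branch is meant to accept::

    import pandas as pd

    pd.Timedelta("P1W1D")      # Timedelta('8 days 00:00:00'): 1 week + 1 day
    pd.Timedelta("P0DT1H30M")  # Timedelta('0 days 01:30:00')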
Notes ----- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a8f6c60bcb300..9076325d01bab 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -230,6 +230,8 @@ cdef class _Timestamp(ABCTimestamp): # higher than np.ndarray and np.matrix __array_priority__ = 100 + dayofweek = _Timestamp.day_of_week + dayofyear = _Timestamp.day_of_year def __hash__(_Timestamp self): if self.nanosecond: @@ -538,14 +540,14 @@ cdef class _Timestamp(ABCTimestamp): return bool(ccalendar.is_leapyear(self.year)) @property - def dayofweek(self) -> int: + def day_of_week(self) -> int: """ Return day of the week. """ return self.weekday() @property - def dayofyear(self) -> int: + def day_of_year(self) -> int: """ Return the day of the year. """ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 131e0154ce8fc..2c315ca13e563 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -3,7 +3,6 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport free, malloc from libcpp.deque cimport deque import numpy as np @@ -59,7 +58,7 @@ cdef: cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -cdef bint is_monotonic_start_end_bounds( +cdef bint is_monotonic_increasing_start_end_bounds( ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end ): return is_monotonic(start, False)[0] and is_monotonic(end, False)[0] @@ -144,9 +143,11 @@ def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, int64_t s, e int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -155,7 +156,7 @@ def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: # setup @@ -174,9 +175,10 @@ def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_sum(minp, nobs, sum_x) - if not is_monotonic_bounds: - for j in range(s, e): - remove_sum(values[j], &nobs, &sum_x, &compensation_remove) + if not is_monotonic_increasing_bounds: + nobs = 0 + sum_x = 0.0 + compensation_remove = 0.0 return output @@ -245,9 +247,11 @@ def roll_mean(ndarray[float64_t] values, ndarray[int64_t] start, int64_t s, e Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -256,7 +260,7 @@ def roll_mean(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: # setup for j in range(s, e): @@ -277,10 +281,11 @@ def roll_mean(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - if not is_monotonic_bounds: - for j in range(s, e): - val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) + if not is_monotonic_increasing_bounds: + nobs = 0 + 
neg_ct = 0 + sum_x = 0.0 + compensation_remove = 0.0 return output # ---------------------------------------------------------------------- @@ -368,9 +373,12 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, int64_t s, e Py_ssize_t i, j, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + minp = max(minp, 1) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -382,7 +390,7 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) @@ -403,10 +411,11 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - if not is_monotonic_bounds: - for j in range(s, e): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x, - &compensation_remove) + if not is_monotonic_increasing_bounds: + nobs = 0.0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_remove = 0.0 return output @@ -486,9 +495,12 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, int64_t nobs = 0, i, j, N = len(values) int64_t s, e ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + minp = max(minp, 3) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -500,7 +512,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): val = values[j] @@ -523,10 +535,11 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_skew(minp, nobs, x, xx, xxx) - if not is_monotonic_bounds: - for j in range(s, e): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + if not is_monotonic_increasing_bounds: + nobs = 0 + x = 0.0 + xx = 0.0 + xxx = 0.0 return output @@ -610,9 +623,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, s, e, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + minp = max(minp, 4) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -624,7 +640,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) @@ -644,9 +660,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - if not is_monotonic_bounds: - for j in range(s, e): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + if not is_monotonic_increasing_bounds: + nobs = 0 + x = 0.0 + xx = 0.0 + xxx = 0.0 + xxxx = 
0.0 return output @@ -656,7 +675,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win=0): + ndarray[int64_t] end, int64_t minp): # GH 32865. win argument kept for compatibility cdef: float64_t val, res, prev @@ -664,7 +683,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N = len(values), s, e + int64_t nobs = 0, N = len(values), s, e, win int midpoint ndarray[float64_t] output @@ -722,6 +741,8 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, else: res = (skiplist_get(sl, midpoint, &ret) + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + if ret == 0: + res = NaN else: res = NaN @@ -906,7 +927,7 @@ interpolation_types = { def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win, + ndarray[int64_t] end, int64_t minp, float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list @@ -933,7 +954,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, # actual skiplist ops outweigh any window computation costs output = np.empty(N, dtype=float) - if win == 0 or (end - start).max() == 0: + if (end - start).max() == 0: output[:] = NaN return output win = (end - start).max() @@ -1009,6 +1030,9 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) output[i] = (vlow + vhigh) / 2 + + if ret == 0: + output[i] = NaN else: output[i] = NaN @@ -1020,7 +1044,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, - object func, bint raw, + object function, bint raw, tuple args, dict kwargs): cdef: ndarray[float64_t] output, counts @@ -1048,9 +1072,9 @@ def roll_apply(object obj, if counts[i] >= minp: if raw: - output[i] = func(arr[s:e], *args, **kwargs) + output[i] = function(arr[s:e], *args, **kwargs) else: - output[i] = func(obj.iloc[s:e], *args, **kwargs) + output[i] = function(obj.iloc[s:e], *args, **kwargs) else: output[i] = NaN @@ -1088,13 +1112,8 @@ cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, if avg: tot_wgt = np.zeros(in_n, dtype=np.float64) - if minp > win_n: - raise ValueError(f"min_periods (minp) must be <= " - f"window (win)") elif minp > in_n: minp = in_n + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') minp = max(minp, 1) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 9af1159a805ec..6a49a5bb34855 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -43,16 +43,14 @@ def calculate_variable_window_bounds( (ndarray[int64], ndarray[int64]) """ cdef: - bint left_closed = False - bint right_closed = False - int index_growth_sign = 1 + bint left_closed = False, right_closed = False ndarray[int64_t, ndim=1] start, end - int64_t start_bound, end_bound + int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j - # if windows is variable, default is 'right', otherwise default is 'both' + # default is 'right' if closed is None: - closed = 'right' if index is not None else 'both' + closed = 'right' if closed in ['right', 'both']: right_closed = True diff --git a/pandas/_testing.py 
b/pandas/_testing.py index cf6272edc4c05..a4fdb390abf42 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -6,6 +6,7 @@ import gzip import operator import os +import re from shutil import rmtree import string import tempfile @@ -2546,10 +2547,11 @@ def wrapper(*args, **kwargs): @contextmanager def assert_produces_warning( - expected_warning=Warning, + expected_warning: Optional[Union[Type[Warning], bool]] = Warning, filter_level="always", - check_stacklevel=True, - raise_on_extra_warnings=True, + check_stacklevel: bool = True, + raise_on_extra_warnings: bool = True, + match: Optional[str] = None, ): """ Context manager for running code expected to either raise a specific @@ -2584,6 +2586,8 @@ class for all warnings. To check that no warning is returned, raise_on_extra_warnings : bool, default True Whether extra warnings not of the type `expected_warning` should cause the test to fail. + match : str, optional + Match warning message. Examples -------- @@ -2610,28 +2614,28 @@ class for all warnings. To check that no warning is returned, with warnings.catch_warnings(record=True) as w: saw_warning = False + matched_message = False + warnings.simplefilter(filter_level) yield w extra_warnings = [] for actual_warning in w: - if expected_warning and issubclass( - actual_warning.category, expected_warning - ): + if not expected_warning: + continue + + expected_warning = cast(Type[Warning], expected_warning) + if issubclass(actual_warning.category, expected_warning): saw_warning = True if check_stacklevel and issubclass( actual_warning.category, (FutureWarning, DeprecationWarning) ): - from inspect import getframeinfo, stack + _assert_raised_with_correct_stacklevel(actual_warning) + + if match is not None and re.search(match, str(actual_warning.message)): + matched_message = True - caller = getframeinfo(stack()[2][0]) - msg = ( - "Warning not set with correct stacklevel. " - f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" - ) - assert actual_warning.filename == caller.filename, msg else: extra_warnings.append( ( @@ -2641,18 +2645,41 @@ class for all warnings. To check that no warning is returned, actual_warning.lineno, ) ) + if expected_warning: - msg = ( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) - assert saw_warning, msg + expected_warning = cast(Type[Warning], expected_warning) + if not saw_warning: + raise AssertionError( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + + if match and not matched_message: + raise AssertionError( + f"Did not see warning {repr(expected_warning.__name__)} " + f"matching {match}" + ) + if raise_on_extra_warnings and extra_warnings: raise AssertionError( f"Caused unexpected warning(s): {repr(extra_warnings)}" ) +def _assert_raised_with_correct_stacklevel( + actual_warning: warnings.WarningMessage, +) -> None: + from inspect import getframeinfo, stack + + caller = getframeinfo(stack()[3][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg + + class RNGContext: """ Context manager to set the numpy random number generator speed. 
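A small usage sketch of the extended helper above (``pandas._testing`` is an internal API; the warning text here is illustrative)::

    import warnings
    import pandas._testing as tm

    # passes only if a FutureWarning whose message matches the regex is emitted
    with tm.assert_produces_warning(FutureWarning, match="deprecated"):
        warnings.warn("this path is deprecated", FutureWarning)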
Returns diff --git a/pandas/_typing.py b/pandas/_typing.py index 7678d1bf12d8b..a9177106535fc 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -38,6 +38,8 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series + from pandas.io.formats.format import EngFormatter + # array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) @@ -127,6 +129,10 @@ EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) +# type of float formatter in DataFrameFormatter +FloatFormatType = Union[str, Callable, "EngFormatter"] + + @dataclass class IOargs(Generic[ModeVar, EncodingVar]): """ diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 40688a3978cfc..d3c7888cac704 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,13 +18,12 @@ "openpyxl": "2.5.7", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", - "pytables": "3.4.4", "pytest": "5.0.1", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", "sqlalchemy": "1.2.8", - "tables": "3.4.4", + "tables": "3.5.1", "tabulate": "0.8.3", "xarray": "0.12.0", "xlrd": "1.2.0", diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c074b06042e26..c2e91c7877d35 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -387,7 +387,7 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int]) -> None: +def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. @@ -395,12 +395,12 @@ def validate_minmax_axis(axis: Optional[int]) -> None: Parameters ---------- axis : int or None + ndim : int, default 1 Raises ------ ValueError """ - ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/pandas/conftest.py b/pandas/conftest.py index ebb24c184d9a4..515d20e8c5781 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -34,7 +34,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame +from pandas import DataFrame, Series import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex @@ -361,6 +361,19 @@ def multiindex_year_month_day_dataframe_random_data(): return ymd +@pytest.fixture +def multiindex_dataframe_random_data(): + """DataFrame with 2 level MultiIndex with random data""" + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) + + def _create_multiindex(): """ MultiIndex used to test the general functionality of this object @@ -516,6 +529,23 @@ def series_with_simple_index(index): return _create_series(index) +@pytest.fixture +def series_with_multilevel_index(): + """ + Fixture with a Series with a 2-level MultiIndex. 
+ """ + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.randn(8) + ser = Series(data, index=index) + ser[3] = np.NaN + return ser + + _narrow_dtypes = [ np.float16, np.float32, diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 2caf1f75f3da1..15c2a4a6c5c04 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -4,7 +4,7 @@ that can be mixed into or pinned onto other pandas classes. """ -from typing import FrozenSet, Set +from typing import FrozenSet, List, Set import warnings from pandas.util._decorators import doc @@ -12,28 +12,21 @@ class DirNamesMixin: _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset() + _hidden_attrs: FrozenSet[str] = frozenset() - def _dir_deletions(self): + def _dir_deletions(self) -> Set[str]: """ Delete unwanted __dir__ for this object. """ - return self._accessors | self._deprecations + return self._accessors | self._hidden_attrs - def _dir_additions(self): + def _dir_additions(self) -> Set[str]: """ Add additional __dir__ for this object. """ - rv = set() - for accessor in self._accessors: - try: - getattr(self, accessor) - rv.add(accessor) - except AttributeError: - pass - return rv + return {accessor for accessor in self._accessors if hasattr(self, accessor)} - def __dir__(self): + def __dir__(self) -> List[str]: """ Provide method name lookup and completion. @@ -41,7 +34,7 @@ def __dir__(self): ----- Only provide 'public' methods. """ - rv = set(dir(type(self))) + rv = set(super().__dir__()) rv = (rv - self._dir_deletions()) | self._dir_additions() return sorted(rv) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index ba7638e269fc0..4ab0c2515f5fa 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -31,7 +31,7 @@ from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import is_dict_like, is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -621,58 +621,27 @@ def aggregate(obj, arg: AggFuncType, *args, **kwargs): # set the final keys keys = list(arg.keys()) - # combine results - - def is_any_series() -> bool: - # return a boolean if we have *any* nested series - return any(isinstance(r, ABCSeries) for r in results.values()) - - def is_any_frame() -> bool: - # return a boolean if we have *any* nested series - return any(isinstance(r, ABCDataFrame) for r in results.values()) - - if isinstance(results, list): - return concat(results, keys=keys, axis=1, sort=True), True - - elif is_any_frame(): - # we have a dict of DataFrames - # return a MI DataFrame + # Avoid making two isinstance calls in all and any below + is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + # combine results + if all(is_ndframe): keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. 
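# A hedged sketch of this dispatch (illustrative data): a dict mixing a
# reducer with a transformer now hits the ``any(is_ndframe)`` branch below
# and raises, e.g.
#   df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
#   df.agg({"A": "min", "B": "cumsum"})
#   ValueError: cannot perform both aggregation and transformation
#   operations simultaneously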
keys_to_use = keys_to_use if keys_to_use != [] else keys - return ( - concat([results[k] for k in keys_to_use], keys=keys_to_use, axis=1), - True, + axis = 0 if isinstance(obj, ABCSeries) else 1 + result = concat({k: results[k] for k in keys_to_use}, axis=axis) + elif any(is_ndframe): + # There is a mix of NDFrames and scalars + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" ) + else: + from pandas import Series - elif isinstance(obj, ABCSeries) and is_any_series(): - - # we have a dict of Series - # return a MI Series - try: - result = concat(results) - except TypeError as err: - # we want to give a nice error here if - # we have non-same sized objects, so - # we don't automatically broadcast - - raise ValueError( - "cannot perform both aggregation " - "and transformation operations " - "simultaneously" - ) from err - - return result, True - - # fall thru - from pandas import DataFrame, Series - - try: - result = DataFrame(results) - except ValueError: # we have a dict of scalars - # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d2005d46bbbf1..9a3144d1ccbaa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -696,6 +696,8 @@ def factorize( # return original tenor if isinstance(original, ABCIndexClass): + if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray): + uniques = type(original._data)._simple_new(uniques, dtype=original.dtype) uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index @@ -1650,7 +1652,8 @@ def take_nd( """ mask_info = None - if is_extension_array_dtype(arr): + if isinstance(arr, ABCExtensionArray): + # Check for EA to catch DatetimeArray, TimedeltaArray return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) arr = extract_array(arr) @@ -1908,6 +1911,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if is_extension_array_dtype(dtype): if hasattr(arr, f"__{op.__name__}__"): + if axis != 0: + raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: warn( @@ -1922,18 +1927,26 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False is_bool = False if needs_i8_conversion(arr.dtype): - dtype = np.float64 + dtype = np.int64 arr = arr.view("i8") na = iNaT is_timedelta = True elif is_bool_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.object_ is_bool = True elif is_integer_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.float64 + orig_ndim = arr.ndim + if orig_ndim == 1: + # reshape so we can always use algos.diff_2d + arr = arr.reshape(-1, 1) + # TODO: require axis == 0 + dtype = np.dtype(dtype) out_arr = np.empty(arr.shape, dtype=dtype) @@ -1944,7 +1957,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if arr.ndim == 2 and arr.dtype.name in _diff_special: # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis) + algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. 
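The ``datetimelike`` flag threaded into ``diff_2d`` above keeps the computation in ``int64`` while explicitly propagating ``NaT``; the user-visible behavior, sketched with illustrative data::

    import pandas as pd

    ser = pd.Series(pd.to_datetime(["2020-01-01", "2020-01-02", None]))
    ser.diff()
    # 0      NaT
    # 1   1 days
    # 2      NaT
    # dtype: timedelta64[ns]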
@@ -1978,8 +1991,10 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] if is_timedelta: - out_arr = out_arr.astype("int64").view("timedelta64[ns]") + out_arr = out_arr.view("timedelta64[ns]") + if orig_ndim == 1: + out_arr = out_arr[:, 0] return out_arr @@ -2043,7 +2058,7 @@ def safe_sort( "Only list-like objects are allowed to be passed to safe_sort as values" ) - if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): + if not isinstance(values, (np.ndarray, ABCExtensionArray)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 95a003efbe1d0..948ffdc1f7c01 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -180,13 +180,10 @@ def _validate_shift_value(self, fill_value): return self._validate_fill_value(fill_value) def __setitem__(self, key, value): - key = self._validate_setitem_key(key) + key = check_array_indexer(self, key) value = self._validate_setitem_value(value) self._ndarray[key] = value - def _validate_setitem_key(self, key): - return check_array_indexer(self, key) - def _validate_setitem_value(self, value): return value @@ -198,7 +195,8 @@ def __getitem__(self, key): return self._box_func(result) return self._from_backing_data(result) - key = self._validate_getitem_key(key) + key = extract_array(key, extract_numpy=True) + key = check_array_indexer(self, key) result = self._ndarray[key] if lib.is_scalar(result): return self._box_func(result) @@ -206,10 +204,6 @@ def __getitem__(self, key): result = self._from_backing_data(result) return result - def _validate_getitem_key(self, key): - key = extract_array(key, extract_numpy=True) - return check_array_indexer(self, key) - @doc(ExtensionArray.fillna) def fillna(self: _T, value=None, method=None, limit=None) -> _T: value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index debfb50caeeaa..57f8f11d4d04c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -232,8 +232,8 @@ def _from_factorized(cls, values, original): See Also -------- - factorize - ExtensionArray.factorize + factorize : Top-level factorize method that dispatches here. + ExtensionArray.factorize : Encode the extension array as an enumerated type. """ raise AbstractMethodError(cls) @@ -501,13 +501,18 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. """ # Note: this is used in `ExtensionArray.argsort`. return np.array(self) def argsort( - self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs + self, + ascending: bool = True, + kind: str = "quicksort", + na_position: str = "last", + *args, + **kwargs, ) -> np.ndarray: """ Return the indices that would sort this array. @@ -538,7 +543,14 @@ def argsort( # 2. argsort : total control over sorting. 
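# Hedged sketch of the pass-through below (hypothetical data): masked arrays
# now forward their mask to nargsort, so missing values honor na_position:
#   pd.array([3, None, 1], dtype="Int64").argsort()  # -> array([2, 0, 1])
#   (missing values are placed last by default)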
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = nargsort(self, kind=kind, ascending=ascending, na_position="last") + values = self._values_for_argsort() + result = nargsort( + values, + kind=kind, + ascending=ascending, + na_position=na_position, + mask=np.asarray(self.isna()), + ) return result def argmin(self): @@ -956,8 +968,8 @@ def take( See Also -------- - numpy.take - api.extensions.take + numpy.take : Take elements from an array along an axis. + api.extensions.take : Take elements from an array. Notes ----- diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9ffe00cf3189a..b46e70f5a936d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -418,7 +418,7 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. """ data = self._data.copy() data[self._mask] = -1 diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 081a363ce03c6..499bb364c48a1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -288,7 +288,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset(["tolist"]) + _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" _can_hold_na = True @@ -2061,7 +2061,7 @@ def unique(self): -------- pandas.unique CategoricalIndex.unique - Series.unique + Series.unique : Return unique values of Series object. Examples -------- diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eb5e5b03fe243..a6c74294c4c75 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -65,7 +65,7 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexers import check_array_indexer, check_setitem_lengths from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -75,6 +75,7 @@ from pandas.core.arrays import DatetimeArray, TimedeltaArray DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] +DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") class InvalidComparison(Exception): @@ -86,7 +87,20 @@ class InvalidComparison(Exception): pass -class AttributesMixin: +class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): + """ + Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray + + Assumes that __new__/__init__ defines: + _data + _freq + + and that the inheriting class has methods: + _generate_range + """ + + _is_recognized_dtype: Callable[[DtypeObj], bool] + _recognized_scalars: Tuple[Type, ...] 
_data: np.ndarray def __init__(self, data, dtype=None, freq=None, copy=False): @@ -184,25 +198,6 @@ def _check_compatible_with( """ raise AbstractMethodError(self) - -DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") - - -class DatetimeLikeArrayMixin(OpsMixin, AttributesMixin, NDArrayBackedExtensionArray): - """ - Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray - - Assumes that __new__/__init__ defines: - _data - _freq - - and that the inheriting class has methods: - _generate_range - """ - - _is_recognized_dtype: Callable[[DtypeObj], bool] - _recognized_scalars: Tuple[Type, ...] - # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -231,7 +226,10 @@ def _box_values(self, values): return lib.map_infer(values, self._box_func) def __iter__(self): - return (self._box_func(v) for v in self.asi8) + if self.ndim > 1: + return (self[n] for n in range(len(self))) + else: + return (self._box_func(v) for v in self.asi8) @property def asi8(self) -> np.ndarray: @@ -294,7 +292,7 @@ def _get_getitem_freq(self, key): elif self.ndim != 1: freq = None else: - key = self._validate_getitem_key(key) # maybe ndarray[bool] -> slice + key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice freq = None if isinstance(key, slice): if self.freq is not None and key.step is not None: @@ -421,7 +419,7 @@ def _from_factorized(cls, values, original): # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior - def _validate_comparison_value(self, other, opname: str): + def _validate_comparison_value(self, other): if isinstance(other, str): try: # GH#18435 strings get a pass from tzawareness compat @@ -431,7 +429,7 @@ def _validate_comparison_value(self, other, opname: str): raise InvalidComparison(other) if isinstance(other, self._recognized_scalars) or other is NaT: - other = self._scalar_type(other) # type: ignore[call-arg] + other = self._scalar_type(other) try: self._check_compatible_with(other) except TypeError as err: @@ -446,7 +444,7 @@ def _validate_comparison_value(self, other, opname: str): else: try: - other = self._validate_listlike(other, opname, allow_object=True) + other = self._validate_listlike(other, allow_object=True) self._check_compatible_with(other) except TypeError as err: if is_object_dtype(getattr(other, "dtype", None)): @@ -479,11 +477,10 @@ def _validate_fill_value(self, fill_value): f"Got '{str(fill_value)}'." ) try: - fill_value = self._validate_scalar(fill_value, msg) + fill_value = self._validate_scalar(fill_value) except TypeError as err: raise ValueError(msg) from err - rv = self._unbox(fill_value) - return self._rebox_native(rv) + return self._unbox(fill_value, setitem=True) def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value @@ -512,17 +509,16 @@ def _validate_shift_value(self, fill_value): return self._unbox(fill_value) - def _validate_scalar(self, value, msg: Optional[str] = None): + def _validate_scalar(self, value, allow_listlike: bool = False): """ Validate that the input value can be cast to our scalar_type. Parameters ---------- value : object - msg : str, optional. - Message to raise in TypeError on invalid input. - If not provided, `value` is cast to a str and used - as the message. + allow_listlike: bool, default False + When raising an exception, whether the message should say + listlike inputs are allowed. 
Returns ------- @@ -533,6 +529,7 @@ def _validate_scalar(self, value, msg: Optional[str] = None): try: value = self._scalar_from_string(value) except ValueError as err: + msg = self._validation_error_message(value, allow_listlike) raise TypeError(msg) from err elif is_valid_nat_for_dtype(value, self.dtype): @@ -544,13 +541,39 @@ def _validate_scalar(self, value, msg: Optional[str] = None): value = self._scalar_type(value) # type: ignore[call-arg] else: - if msg is None: - msg = str(value) + msg = self._validation_error_message(value, allow_listlike) raise TypeError(msg) return value - def _validate_listlike(self, value, opname: str, allow_object: bool = False): + def _validation_error_message(self, value, allow_listlike: bool = False) -> str: + """ + Construct an exception message on validation error. + + Some methods allow only scalar inputs, while others allow either scalar + or listlike. + + Parameters + ---------- + allow_listlike: bool, default False + + Returns + ------- + str + """ + if allow_listlike: + msg = ( + f"value should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." + ) + else: + msg = ( + f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " + f"Got '{type(value).__name__}' instead." + ) + return msg + + def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): return value @@ -580,64 +603,53 @@ def _validate_listlike(self, value, opname: str, allow_object: bool = False): elif not type(self)._is_recognized_dtype(value.dtype): raise TypeError( - f"{opname} requires compatible dtype or scalar, " - f"not {type(value).__name__}" + f"value should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." ) - return value def _validate_searchsorted_value(self, value): - msg = "searchsorted requires compatible dtype or scalar" if not is_list_like(value): - value = self._validate_scalar(value, msg) + value = self._validate_scalar(value, True) else: - value = self._validate_listlike(value, "searchsorted") + value = self._validate_listlike(value) - rv = self._unbox(value) - return self._rebox_native(rv) + return self._unbox(value) def _validate_setitem_value(self, value): - msg = ( - f"'value' should be a '{self._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{type(value).__name__}' instead." - ) if is_list_like(value): - value = self._validate_listlike(value, "setitem") + value = self._validate_listlike(value) else: - value = self._validate_scalar(value, msg) + value = self._validate_scalar(value, True) return self._unbox(value, setitem=True) def _validate_insert_value(self, value): - msg = f"cannot insert {type(self).__name__} with incompatible label" - value = self._validate_scalar(value, msg) + value = self._validate_scalar(value) - self._check_compatible_with(value, setitem=True) - # TODO: if we dont have compat, should we raise or astype(object)? - # PeriodIndex does astype(object) - return value - # Note: we do not unbox here because the caller needs boxed value - # to check for freq. 
+ return self._unbox(value, setitem=True) def _validate_where_value(self, other): - msg = f"Where requires matching dtype, not {type(other)}" if not is_list_like(other): - other = self._validate_scalar(other, msg) + other = self._validate_scalar(other, True) else: - other = self._validate_listlike(other, "where") + other = self._validate_listlike(other) return self._unbox(other, setitem=True) - def _unbox(self, other, setitem: bool = False) -> Union[np.int64, np.ndarray]: + def _unbox( + self, other, setitem: bool = False + ) -> Union[np.int64, np.datetime64, np.timedelta64, np.ndarray]: """ Unbox either a scalar with _unbox_scalar or an instance of our own type. """ if lib.is_scalar(other): other = self._unbox_scalar(other, setitem=setitem) + other = self._rebox_native(other) else: # same type as self self._check_compatible_with(other, setitem=setitem) - other = other.view("i8") + other = other._ndarray return other # ------------------------------------------------------------------ @@ -851,7 +863,7 @@ def _cmp_method(self, other, op): return op(self.ravel(), other.ravel()).reshape(self.shape) try: - other = self._validate_comparison_value(other, f"__{op.__name__}__") + other = self._validate_comparison_value(other) except InvalidComparison: return invalid_comparison(self, other, op) @@ -861,11 +873,13 @@ def _cmp_method(self, other, op): # comparison otherwise it would fail to raise when # comparing tz-aware and tz-naive with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY(op, self.astype(object), other) + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) return result - other_i8 = self._unbox(other) - result = op(self.asi8, other_i8) + other_vals = self._unbox(other) + result = op(self._ndarray, other_vals) o_mask = isna(other) if self._hasnans | np.any(o_mask): @@ -1250,13 +1264,24 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Series.min : Return the minimum value in a Series. """ nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) - if isna(result): - # Period._from_ordinal does not handle np.nan gracefully - return NaT - return self._box_func(result) + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return NaT + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + if lib.is_scalar(result): + return self._box_func(result) + return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): """ @@ -1272,25 +1297,26 @@ def max(self, axis=None, skipna=True, *args, **kwargs): # TODO: skipna is broken with max. 
# See https://github.com/pandas-dev/pandas/issues/24265 nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - mask = self.isna() - if skipna: - values = self[~mask].asi8 - elif mask.any(): - return NaT - else: - values = self.asi8 - - if not len(values): - # short-circuit for empty max / min - return NaT + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return result + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) - result = nanops.nanmax(values, skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + if lib.is_scalar(result): + return self._box_func(result) + return self._from_backing_data(result) - def mean(self, skipna=True): + def mean(self, skipna=True, axis: Optional[int] = 0): """ Return the mean value of the Array. @@ -1300,6 +1326,7 @@ def mean(self, skipna=True): ---------- skipna : bool, default True Whether to ignore any NaT elements. + axis : int, optional, default 0 Returns ------- @@ -1323,21 +1350,12 @@ def mean(self, skipna=True): "obj.to_timestamp(how='start').mean()" ) - mask = self.isna() - if skipna: - values = self[~mask] - elif mask.any(): - return NaT - else: - values = self - - if not len(values): - # short-circuit for empty max / min - return NaT - - result = nanops.nanmean(values.view("i8"), skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + result = nanops.nanmean( + self._ndarray, axis=axis, skipna=skipna, mask=self.isna() + ) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) def median(self, axis: Optional[int] = None, skipna: bool = True, *args, **kwargs): nv.validate_median(args, kwargs) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fb8604a8c87ba..751290d5af9ae 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -177,7 +177,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): "week", "weekday", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "days_in_month", "daysinmonth", @@ -473,7 +475,7 @@ def _check_compatible_with(self, other, setitem: bool = False): if setitem: # Stricter check for setitem vs comparison methods if not timezones.tz_compare(self.tz, other.tz): - raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") + raise ValueError(f"Timezones don't match. 
'{self.tz}' != '{other.tz}'") def _maybe_clear_freq(self): self._freq = None @@ -563,19 +565,21 @@ def __iter__(self): ------ tstamp : Timestamp """ - - # convert in chunks of 10k for efficiency - data = self.asi8 - length = len(self) - chunksize = 10000 - chunks = int(length / chunksize) + 1 - for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, length) - converted = ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" - ) - yield from converted + if self.ndim > 1: + return (self[n] for n in range(len(self))) + else: + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) + yield from converted def astype(self, dtype, copy=True): # We handle @@ -1257,8 +1261,10 @@ def isocalendar(self): See Also -------- - Timestamp.isocalendar - datetime.date.isocalendar + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. Examples -------- @@ -1531,16 +1537,18 @@ def weekofyear(self): 2017-01-08 6 Freq: D, dtype: int64 """ - dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) - weekday = dayofweek + day_of_week = _field_accessor("day_of_week", "dow", _dayofweek_doc) + dayofweek = day_of_week + weekday = day_of_week - dayofyear = _field_accessor( + day_of_year = _field_accessor( "dayofyear", "doy", """ The ordinal day of the year. """, ) + dayofyear = day_of_year quarter = _field_accessor( "quarter", "q", diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 88a5a88efe146..e53276369a46f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -484,7 +484,7 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. """ data = self._data.copy() if self._mask.any(): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index bf2b3a0a1c9ba..ef8093660b413 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -67,7 +67,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. - Users should use :func:`period_array` to create new instances. + Users should use :func:`period_range` to create new instances. + Alternatively, :func:`array` can be used to create new instances + from a sequence of Period scalars. Parameters ---------- @@ -76,14 +78,14 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): converted to ordinals without inference or copy (PeriodArray, ndarray[int64]), or a box around such an array (Series[period], PeriodIndex). + dtype : PeriodDtype, optional + A PeriodDtype instance from which to extract a `freq`. If both + `freq` and `dtype` are specified, then the frequencies must match. freq : str or DateOffset The `freq` to use for the array. Mostly applicable when `values` is an ndarray of integers, when `freq` is required. When `values` is a PeriodArray (or box around), it's checked that ``values.freq`` matches `freq`. - dtype : PeriodDtype, optional - A PeriodDtype instance from which to extract a `freq`. 
If both - `freq` and `dtype` are specified, then the frequencies must match. copy : bool, default False Whether to copy the ordinals before storing. @@ -97,8 +99,10 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): See Also -------- - period_array : Create a new PeriodArray. + Period: Represents a period of time. PeriodIndex : Immutable Index for period data. + period_range: Create a fixed-frequency PeriodArray. + array: Construct a pandas array. Notes ----- @@ -136,7 +140,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): "weekday", "week", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "qyear", "days_in_month", @@ -148,7 +154,7 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): # -------------------------------------------------------------------- # Constructors - def __init__(self, values, freq=None, dtype=None, copy=False): + def __init__(self, values, dtype=None, freq=None, copy=False): freq = validate_dtype_freq(dtype, freq) if freq is not None: @@ -377,12 +383,13 @@ def __arrow_array__(self, type=None): """, ) week = weekofyear - dayofweek = _field_accessor( - "weekday", + day_of_week = _field_accessor( + "day_of_week", """ The day of the week with Monday=0, Sunday=6. """, ) + dayofweek = day_of_week weekday = dayofweek dayofyear = day_of_year = _field_accessor( "day_of_year", @@ -882,7 +889,7 @@ def period_array( if is_datetime64_dtype(data_dtype): return PeriodArray._from_datetime64(data, freq) if is_period_dtype(data_dtype): - return PeriodArray(data, freq) + return PeriodArray(data, freq=freq) # other iterable of some kind if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 2e62fade93dcb..4346e02069667 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -270,7 +270,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ _subtyp = "sparse_array" # register ABCSparseArray - _deprecations = PandasObject._deprecations | frozenset(["get_values"]) + _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"]) _sparse_index: SparseIndex def __init__( diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6ddb3f1b7aa53..17fcd00b7b251 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -17,7 +17,11 @@ ) from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import get_timedelta_field -from pandas._libs.tslibs.timedeltas import array_to_timedelta64, parse_timedelta_unit +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, + ints_to_pytimedelta, + parse_timedelta_unit, +) from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( @@ -346,6 +350,21 @@ def astype(self, dtype, copy: bool = True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + def __iter__(self): + if self.ndim > 1: + return (self[n] for n in range(len(self))) + else: + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pytimedelta(data[start_i:end_i], box=True) + yield from converted + # ---------------------------------------------------------------- # Reductions @@ -362,15 +381,13 @@ def sum( nv.validate_sum( (), dict(dtype=dtype, out=out, 
keepdims=keepdims, initial=initial) ) - if not len(self): - return NaT - if not skipna and self._hasnans: - return NaT result = nanops.nansum( - self._data, axis=axis, skipna=skipna, min_count=min_count + self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) - return Timedelta(result) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) def std( self, @@ -384,13 +401,11 @@ def std( nv.validate_stat_ddof_func( (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) - if not len(self): - return NaT - if not skipna and self._hasnans: - return NaT - result = nanops.nanstd(self._data, axis=axis, skipna=skipna, ddof=ddof) - return Timedelta(result) + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/base.py b/pandas/core/base.py index 10b83116dee58..a22aac926e36e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,12 +4,22 @@ import builtins import textwrap -from typing import Any, Callable, Dict, FrozenSet, Optional, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + FrozenSet, + Optional, + TypeVar, + Union, + cast, +) import numpy as np import pandas._libs.lib as lib -from pandas._typing import IndexLabel +from pandas._typing import DtypeObj, IndexLabel from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -33,6 +43,9 @@ from pandas.core.construction import create_series_with_explicit_dtype import pandas.core.nanops as nanops +if TYPE_CHECKING: + from pandas import Categorical + _shared_docs: Dict[str, str] = dict() _indexops_doc_kwargs = dict( klass="IndexOpsMixin", @@ -238,7 +251,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : str / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -301,10 +314,15 @@ class IndexOpsMixin(OpsMixin): # ndarray compatibility __array_priority__ = 1000 - _deprecations: FrozenSet[str] = frozenset( + _hidden_attrs: FrozenSet[str] = frozenset( ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) + @property + def dtype(self) -> DtypeObj: + # must be defined here as a property for mypy + raise AbstractMethodError(self) + @property def _values(self) -> Union[ExtensionArray, np.ndarray]: # must be defined here as a property for mypy @@ -348,7 +366,7 @@ def ndim(self) -> int: def item(self): """ - Return the first element of the underlying data as a python scalar. + Return the first element of the underlying data as a Python scalar. 
Returns ------- @@ -832,6 +850,7 @@ def _map_values(self, mapper, na_action=None): if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values + self = cast("Categorical", self) return self._values.map(mapper) values = self._values @@ -1182,6 +1201,16 @@ def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1): >>> ser.searchsorted([1, 3], side='right') array([1, 3]) + >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'])) + >>> ser + 0 2000-03-11 + 1 2000-03-12 + 2 2000-03-13 + dtype: datetime64[ns] + + >>> ser.searchsorted('3/14/2000') + 3 + >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True ... ) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 96de54380c7ad..8630867c64f88 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -21,8 +21,9 @@ class ExtensionDtype: See Also -------- - extensions.register_extension_dtype - extensions.ExtensionArray + extensions.register_extension_dtype: Register an ExtensionType + with pandas as class decorator. + extensions.ExtensionArray: Abstract base class for custom 1-D array types. Notes ----- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a7379376c2f78..692da8f8e021e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2,23 +2,37 @@ Routines for casting. """ +from contextlib import suppress from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type +from typing import ( + TYPE_CHECKING, + Any, + List, + Optional, + Sequence, + Set, + Sized, + Tuple, + Type, + Union, +) import numpy as np -from pandas._libs import lib, tslib, tslibs +from pandas._libs import lib, tslib from pandas._libs.tslibs import ( NaT, OutOfBoundsDatetime, Period, Timedelta, Timestamp, + conversion, iNaT, ints_to_pydatetime, + ints_to_pytimedelta, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike, Dtype, DtypeObj +from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -83,6 +97,8 @@ if TYPE_CHECKING: from pandas import Series from pandas.core.arrays import ExtensionArray + from pandas.core.indexes.base import Index + from pandas.core.indexes.datetimes import DatetimeIndex _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -118,7 +134,7 @@ def is_nested_object(obj) -> bool: return False -def maybe_downcast_to_dtype(result, dtype): +def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32 @@ -154,12 +170,20 @@ def maybe_downcast_to_dtype(result, dtype): dtype = np.dtype(dtype) + elif dtype.type is Period: + from pandas.core.arrays import PeriodArray + + with suppress(TypeError): + # e.g. 
TypeError: int() argument must be a string, a + # bytes-like object or a number, not 'Period + return PeriodArray(result, freq=dtype.freq) + converted = maybe_downcast_numeric(result, dtype, do_round) if converted is not result: return converted # a datetimelike - # GH12821, iNaT is casted to float + # GH12821, iNaT is cast to float if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: if hasattr(dtype, "tz"): # not a numpy dtype @@ -172,21 +196,10 @@ def maybe_downcast_to_dtype(result, dtype): else: result = result.astype(dtype) - elif dtype.type is Period: - # TODO(DatetimeArray): merge with previous elif - from pandas.core.arrays import PeriodArray - - try: - return PeriodArray(result, freq=dtype.freq) - except TypeError: - # e.g. TypeError: int() argument must be a string, a - # bytes-like object or a number, not 'Period - pass - return result -def maybe_downcast_numeric(result, dtype, do_round: bool = False): +def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -329,7 +342,9 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: return dtype -def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): +def maybe_cast_to_extension_array( + cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None +) -> ArrayLike: """ Call to `_from_sequence` that returns the object unchanged on Exception. @@ -362,7 +377,9 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): return result -def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): +def maybe_upcast_putmask( + result: np.ndarray, mask: np.ndarray, other: Scalar +) -> Tuple[np.ndarray, bool]: """ A safe version of putmask that potentially upcasts the result. @@ -444,7 +461,9 @@ def changeit(): return result, False -def maybe_casted_values(index, codes=None): +def maybe_casted_values( + index: "Index", codes: Optional[np.ndarray] = None +) -> ArrayLike: """ Convert an index, given directly or as a pair (level, code), to a 1D array. @@ -468,39 +487,25 @@ def maybe_casted_values(index, codes=None): # if we have the codes, extract the values with a mask if codes is not None: - mask = codes == -1 + mask: np.ndarray = codes == -1 - # we can have situations where the whole mask is -1, - # meaning there is nothing found in codes, so make all nan's if mask.size > 0 and mask.all(): + # we can have situations where the whole mask is -1, + # meaning there is nothing found in codes, so make all nan's + dtype = index.dtype fill_value = na_value_for_dtype(dtype) values = construct_1d_arraylike_from_scalar(fill_value, len(mask), dtype) + else: values = values.take(codes) - # TODO(https://github.com/pandas-dev/pandas/issues/24206) - # Push this into maybe_upcast_putmask? - # We can't pass EAs there right now. Looks a bit - # complicated. - # So we unbox the ndarray_values, op, re-box. - values_type = type(values) - values_dtype = values.dtype - - from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin - - if isinstance(values, DatetimeLikeArrayMixin): - values = values._data # TODO: can we de-kludge yet? 
- if mask.any(): if isinstance(values, np.ndarray): values, _ = maybe_upcast_putmask(values, mask, np.nan) else: values[mask] = np.nan - if issubclass(values_type, DatetimeLikeArrayMixin): - values = values_type(values, dtype=values_dtype) - return values @@ -552,7 +557,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fill_value = tslibs.Timestamp(fill_value).to_datetime64() + fill_value = Timestamp(fill_value).to_datetime64() except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): @@ -565,7 +570,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fv = tslibs.Timedelta(fill_value) + fv = Timedelta(fill_value) except ValueError: dtype = np.dtype(np.object_) else: @@ -660,7 +665,7 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _ensure_dtype_type(value, dtype): +def _ensure_dtype_type(value, dtype: DtypeObj): """ Ensure that the given value is an instance of the given dtype. @@ -738,8 +743,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, dtype = np.dtype(object) elif isinstance(val, (np.datetime64, datetime)): - val = tslibs.Timestamp(val) - if val is tslibs.NaT or val.tz is None: + val = Timestamp(val) + if val is NaT or val.tz is None: dtype = np.dtype("M8[ns]") else: if pandas_dtype: @@ -750,7 +755,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, val = val.value elif isinstance(val, (np.timedelta64, timedelta)): - val = tslibs.Timedelta(val).value + val = Timedelta(val).value dtype = np.dtype("m8[ns]") elif is_bool(val): @@ -786,8 +791,9 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, return dtype, val -# TODO: try to make the Any in the return annotation more specific -def infer_dtype_from_array(arr, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: +def infer_dtype_from_array( + arr, pandas_dtype: bool = False +) -> Tuple[DtypeObj, ArrayLike]: """ Infer the dtype from an array. @@ -875,7 +881,12 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): +def maybe_upcast( + values: ArrayLike, + fill_value: Scalar = np.nan, + dtype: Dtype = None, + copy: bool = False, +) -> Tuple[ArrayLike, Scalar]: """ Provide explicit type promotion and coercion. @@ -887,6 +898,13 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): dtype : if None, then use the dtype of the values, else coerce to this type copy : bool, default True If True always make a copy even if no upcast is required. + + Returns + ------- + values: ndarray or ExtensionArray + the original array, possibly upcast + fill_value: + the fill value, possibly upcast """ if not is_scalar(fill_value) and not is_object_dtype(values.dtype): # We allow arbitrary fill values for object dtype @@ -907,7 +925,7 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): return values, fill_value -def invalidate_string_dtypes(dtype_set): +def invalidate_string_dtypes(dtype_set: Set[DtypeObj]): """ Change string like dtypes to object for ``DataFrame.select_dtypes()``. 
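Editorial aside on the cast.py hunk above: the old try/except/pass around PeriodArray construction is replaced by contextlib.suppress(TypeError), which is the idiomatic stdlib form of "attempt this, and fall through silently on exactly this exception". A minimal standalone sketch of the same idiom, on a toy function rather than pandas code:

    from contextlib import suppress

    def to_int_or_none(value):
        # equivalent to: try: return int(value) / except TypeError: pass / return None
        with suppress(TypeError):
            return int(value)
        return None  # reached only if int(value) raised TypeError

    assert to_int_or_none("3") == 3
    assert to_int_or_none(None) is None

Note that suppress only swallows the named exception type; anything else still propagates, which is why the original comment about the Period-specific TypeError is kept next to the call site.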
@@ -929,7 +947,7 @@ def coerce_indexer_dtype(indexer, categories): return ensure_int64(indexer) -def coerce_to_dtypes(result, dtypes): +def coerce_to_dtypes(result: Sequence[Scalar], dtypes: Sequence[Dtype]) -> List[Scalar]: """ given a dtypes and a result set, coerce the result elements to the dtypes @@ -941,9 +959,9 @@ def conv(r, dtype): if np.any(isna(r)): pass elif dtype == DT64NS_DTYPE: - r = tslibs.Timestamp(r) + r = Timestamp(r) elif dtype == TD64NS_DTYPE: - r = tslibs.Timedelta(r) + r = Timedelta(r) elif dtype == np.bool_: # messy. non 0/1 integers do not get converted. if is_integer(r) and r not in [0, 1]: @@ -959,7 +977,9 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): +def astype_nansafe( + arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False +) -> ArrayLike: """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -1006,7 +1026,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): - return tslibs.ints_to_pytimedelta(arr.view(np.int64)) + return ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1063,7 +1083,9 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): return arr.view(dtype) -def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): +def maybe_convert_objects( + values: np.ndarray, convert_numeric: bool = True +) -> Union[np.ndarray, "DatetimeIndex"]: """ If we have an object dtype array, try to coerce dates and/or numbers. @@ -1184,7 +1206,7 @@ def soft_convert_objects( def convert_dtypes( - input_array, + input_array: AnyArrayLike, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, @@ -1195,7 +1217,7 @@ def convert_dtypes( Parameters ---------- - input_array : ExtensionArray or PandasArray + input_array : ExtensionArray, Index, Series or np.ndarray convert_string : bool, default True Whether object dtypes should be converted to ``StringDtype()``. 
convert_integer : bool, default True @@ -1250,9 +1272,11 @@ def convert_dtypes( return inferred_dtype -def maybe_castable(arr) -> bool: +def maybe_castable(arr: np.ndarray) -> bool: # return False to force a non-fastpath + assert isinstance(arr, np.ndarray) # GH 37024 + # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind @@ -1264,7 +1288,9 @@ def maybe_castable(arr) -> bool: return arr.dtype.name not in POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike(value, convert_dates: bool = False): +def maybe_infer_to_datetimelike( + value: Union[ArrayLike, Scalar], convert_dates: bool = False +): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1317,8 +1343,6 @@ def try_datetime(v): # we might have a sequence of the same-datetimes with tz's # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 - from pandas._libs.tslibs import conversion - from pandas import DatetimeIndex try: @@ -1373,7 +1397,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): +def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1492,7 +1516,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): dtype = value.dtype if dtype.kind == "M" and dtype != DT64NS_DTYPE: - value = tslibs.conversion.ensure_datetime64ns(value) + value = conversion.ensure_datetime64ns(value) elif dtype.kind == "m" and dtype != TD64NS_DTYPE: value = to_timedelta(value) @@ -1566,7 +1590,9 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: return np.find_common_type(types, []) -def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.ndarray: +def cast_scalar_to_array( + shape: Tuple, value: Scalar, dtype: Optional[DtypeObj] = None +) -> np.ndarray: """ Create np.ndarray of specified shape and dtype, filled with values. @@ -1594,7 +1620,7 @@ def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.n def construct_1d_arraylike_from_scalar( - value, length: int, dtype: DtypeObj + value: Scalar, length: int, dtype: DtypeObj ) -> ArrayLike: """ create a np.ndarray / pandas type of specified shape and dtype @@ -1638,7 +1664,7 @@ def construct_1d_arraylike_from_scalar( return subarr -def construct_1d_object_array_from_listlike(values) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1664,7 +1690,7 @@ def construct_1d_object_array_from_listlike(values) -> np.ndarray: def construct_1d_ndarray_preserving_na( - values, dtype: Optional[DtypeObj] = None, copy: bool = False + values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False ) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. @@ -1698,7 +1724,7 @@ def construct_1d_ndarray_preserving_na( return subarr -def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): +def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1740,6 +1766,8 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): ... 
ValueError: Trying to coerce float values to integers """ + assert is_integer_dtype(dtype) + try: if not hasattr(arr, "astype"): casted = np.array(arr, dtype=dtype, copy=copy) @@ -1764,11 +1792,11 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): + if is_float_dtype(arr) or is_object_dtype(arr): raise ValueError("Trying to coerce float values to integers") -def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): +def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ Convert datetimelike scalar if we are setting into a datetime64 or timedelta64 ndarray. @@ -1799,7 +1827,7 @@ def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): return scalar -def validate_numeric_casting(dtype: np.dtype, value): +def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None: """ Check that we can losslessly insert the given value into an array with the given dtype. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index bf8d50db8416e..01b34187997cb 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -406,8 +406,6 @@ def __repr__(self) -> str_type: @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: - from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype - from pandas.core.util.hashing import ( combine_hash_arrays, hash_array, @@ -430,9 +428,9 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: hashed = hash((tuple(categories), ordered)) return hashed - if is_datetime64tz_dtype(categories.dtype): + if DatetimeTZDtype.is_dtype(categories.dtype): # Avoid future warning. - categories = categories.astype(DT64NS_DTYPE) + categories = categories.astype("datetime64[ns]") cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: @@ -909,6 +907,9 @@ def __eq__(self, other: Any) -> bool: return isinstance(other, PeriodDtype) and self.freq == other.freq + def __ne__(self, other: Any) -> bool: + return not self.__eq__(other) + def __setstate__(self, state): # for pickle compat. 
__getstate__ is defined in the # PandasExtensionDtype superclass and uses the public properties to @@ -1011,11 +1012,7 @@ class IntervalDtype(PandasExtensionDtype): _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): - from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_string_dtype, - pandas_dtype, - ) + from pandas.core.dtypes.common import is_string_dtype, pandas_dtype if isinstance(subtype, IntervalDtype): return subtype @@ -1038,7 +1035,7 @@ def __new__(cls, subtype=None): except TypeError as err: raise TypeError("could not construct IntervalDtype") from err - if is_categorical_dtype(subtype) or is_string_dtype(subtype): + if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype): # GH 19016 msg = ( "category, object, and string subtypes are not supported " diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 1f1017cfc1929..7d2549713c6bc 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -53,6 +53,7 @@ def _check(cls, inst) -> bool: }, ) +ABCNDFrame = create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba2f11c87369f..a9a58cddaf222 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,6 +34,7 @@ Type, Union, cast, + overload, ) import warnings @@ -147,6 +148,7 @@ ) from pandas.core.reshape.melt import melt from pandas.core.series import Series +from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt @@ -154,6 +156,8 @@ import pandas.plotting if TYPE_CHECKING: + from typing import Literal + from pandas.core.groupby.generic import DataFrameGroupBy from pandas.io.formats.style import Styler @@ -419,7 +423,7 @@ def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series - _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) _accessors: Set[str] = {"sparse"} @property @@ -585,7 +589,7 @@ def shape(self) -> Tuple[int, int]: See Also -------- - ndarray.shape + ndarray.shape : Tuple of array dimensions. Examples -------- @@ -787,10 +791,8 @@ def _repr_html_(self) -> Optional[str]: max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", - table_id=None, - render_links=False, ) - return formatter.to_html(notebook=True) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) else: return None @@ -873,9 +875,12 @@ def to_string( max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, line_width=line_width, ) - return formatter.to_string(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- @@ -969,9 +974,6 @@ def iterrows(self) -> Iterable[Tuple[Label, Series]]: data : Series The data of the row as a Series. - it : generator - A generator that iterates over the rows of the frame. - See Also -------- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. 
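Aside on the dtypes.py hunk above: PeriodDtype gains an explicit __ne__ that simply negates __eq__, ensuring != stays the exact complement of == regardless of what a base class defines. A hedged illustration of that delegation pattern (the class below is a toy stand-in, not PeriodDtype):

    class FreqDtype:
        def __init__(self, freq: str):
            self.freq = freq

        def __eq__(self, other) -> bool:
            return isinstance(other, FreqDtype) and self.freq == other.freq

        def __ne__(self, other) -> bool:
            # delegate so == and != can never drift apart
            return not self.__eq__(other)

    assert FreqDtype("D") == FreqDtype("D")
    assert FreqDtype("D") != FreqDtype("M")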
@@ -2287,14 +2289,14 @@ def to_markdown( @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - path: FilePathOrBuffer[AnyStr], + path: Optional[FilePathOrBuffer] = None, engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, storage_options: StorageOptions = None, **kwargs, - ) -> None: + ) -> Optional[bytes]: """ Write a DataFrame to the binary parquet format. @@ -2305,14 +2307,15 @@ def to_parquet( Parameters ---------- - path : str or file-like object + path : str or file-like object, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. - .. versionchanged:: 1.0.0 + .. versionchanged:: 1.2.0 Previously this was "fname" @@ -2355,6 +2358,10 @@ def to_parquet( Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. + Returns + ------- + bytes if no path argument is provided else None + See Also -------- read_parquet : Read a parquet file. @@ -2390,7 +2397,7 @@ def to_parquet( """ from pandas.io.parquet import to_parquet - to_parquet( + return to_parquet( self, path, engine, @@ -2475,29 +2482,29 @@ def to_html( columns=columns, col_space=col_space, na_rep=na_rep, + header=header, + index=index, formatters=formatters, float_format=float_format, + bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, - index=index, - bold_rows=bold_rows, escape=escape, + decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, - decimal=decimal, - table_id=table_id, - render_links=render_links, ) # TODO: a generic formatter wld b in DataFrameFormatter - return formatter.to_html( + return fmt.DataFrameRenderer(formatter).to_html( buf=buf, classes=classes, notebook=notebook, border=border, encoding=encoding, + table_id=table_id, + render_links=render_links, ) # ---------------------------------------------------------------------- @@ -2603,7 +2610,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(DataFrameInfo.info) + @doc(DataFrameInfo.to_buffer) def info( self, verbose: Optional[bool] = None, @@ -2612,9 +2619,16 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return DataFrameInfo( - self, verbose, buf, max_cols, memory_usage, null_counts - ).info() + info = DataFrameInfo( + data=self, + memory_usage=memory_usage, + ) + info.to_buffer( + buf=buf, + max_cols=max_cols, + verbose=verbose, + show_counts=null_counts, + ) def memory_usage(self, index=True, deep=False) -> Series: """ @@ -2655,16 +2669,16 @@ def memory_usage(self, index=True, deep=False) -> Series: Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000).astype(t)) + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) ... 
for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool - 0 1 1.0 1.000000+0.000000j 1 True - 1 1 1.0 1.000000+0.000000j 1 True - 2 1 1.0 1.000000+0.000000j 1 True - 3 1 1.0 1.000000+0.000000j 1 True - 4 1 1.0 1.000000+0.000000j 1 True + 0 1 1.0 1.0+0.0j 1 True + 1 1 1.0 1.0+0.0j 1 True + 2 1 1.0 1.0+0.0j 1 True + 3 1 1.0 1.0+0.0j 1 True + 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() Index 128 @@ -2690,7 +2704,7 @@ def memory_usage(self, index=True, deep=False) -> Series: int64 40000 float64 40000 complex128 80000 - object 160000 + object 180000 bool 5000 dtype: int64 @@ -2789,7 +2803,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: >>> df2_transposed 0 1 name Alice Bob - score 9.5 8 + score 9.5 8.0 employed False True kids 0 0 @@ -2906,6 +2920,8 @@ def __getitem__(self, key): # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: + if isinstance(indexer, np.ndarray): + indexer = lib.maybe_indices_to_slice(indexer, len(self)) # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._slice(indexer, axis=0) @@ -2945,7 +2961,8 @@ def __getitem__(self, key): # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): - data = data[key] + # GH#26490 using data[key] can cause RecursionError + data = data._get_item_cache(key) return data @@ -3090,7 +3107,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - self.loc._ensure_listlike_indexer(key, axis=1) + self.loc._ensure_listlike_indexer(key, axis=1, value=value) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] @@ -4691,6 +4708,30 @@ def set_index( if not inplace: return frame + @overload + # https://github.com/python/mypy/issues/6580 + # Overloaded function signatures 1 and 2 overlap with incompatible return types + def reset_index( # type: ignore[misc] + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., + drop: bool = ..., + inplace: Literal[False] = ..., + col_level: Hashable = ..., + col_fill: Label = ..., + ) -> DataFrame: + ... + + @overload + def reset_index( + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., + drop: bool = ..., + inplace: Literal[True] = ..., + col_level: Hashable = ..., + col_fill: Label = ..., + ) -> None: + ... 
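The reset_index overloads above use typing.Literal so a type checker can distinguish inplace=True (returns None) from the default (returns a DataFrame); the linked mypy issue (#6580) explains why the overlapping signatures still need a type: ignore. A self-contained sketch of the same idiom on a toy function, Python 3.8+ (all names here are illustrative, not pandas API):

    from typing import Literal, Optional, overload

    @overload
    def strip_lower(text: str) -> str: ...
    @overload
    def strip_lower(text: str, inplace: Literal[False]) -> str: ...
    @overload
    def strip_lower(text: str, inplace: Literal[True]) -> None: ...

    def strip_lower(text: str, inplace: bool = False) -> Optional[str]:
        result = text.strip().lower()
        if inplace:
            # stand-in for "mutate the receiver and return nothing"
            return None
        return result

    assert strip_lower("  Hello ") == "hello"             # checker infers str
    assert strip_lower("  Hello ", inplace=True) is None  # checker infers None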
+ def reset_index( self, level: Optional[Union[Hashable, Sequence[Hashable]]] = None, @@ -5251,8 +5292,6 @@ def duplicated( """ from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 - from pandas.core.sorting import get_group_index - if self.empty: return self._constructor_sliced(dtype=bool) @@ -5286,7 +5325,8 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + return result.__finalize__(self, method="duplicated") # ---------------------------------------------------------------------- # Sorting @@ -5315,7 +5355,6 @@ def sort_values( # type: ignore[override] f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: - from pandas.core.sorting import lexsort_indexer keys = [self._get_label_or_level_values(x, axis=axis) for x in by] @@ -5328,7 +5367,6 @@ def sort_values( # type: ignore[override] ) indexer = ensure_platform_int(indexer) else: - from pandas.core.sorting import nargsort by = by[0] k = self._get_label_or_level_values(by, axis=axis) @@ -5960,6 +5998,18 @@ def _construct_result(self, result) -> DataFrame: out.index = self.index return out + def __divmod__(self, other) -> Tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = self // other + mod = self - div * other + return div, mod + + def __rdivmod__(self, other) -> Tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = other // self + mod = other - div * self + return div, mod + # ---------------------------------------------------------------------- # Combination-Related @@ -7086,9 +7136,11 @@ def stack(self, level=-1, dropna=True): from pandas.core.reshape.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): - return stack_multiple(self, level, dropna=dropna) + result = stack_multiple(self, level, dropna=dropna) else: - return stack(self, level, dropna=dropna) + result = stack(self, level, dropna=dropna) + + return result.__finalize__(self, method="stack") def explode( self, column: Union[str, Tuple], ignore_index: bool = False @@ -7159,8 +7211,6 @@ def explode( raise ValueError("columns must be unique") df = self.reset_index(drop=True) - # TODO: use overload to refine return type of reset_index - assert df is not None # needed for mypy result = df[column].explode() result = df.drop([column], axis=1).join(result) if ignore_index: @@ -7230,7 +7280,9 @@ def unstack(self, level=-1, fill_value=None): """ from pandas.core.reshape.reshape import unstack - return unstack(self, level, fill_value) + result = unstack(self, level, fill_value) + + return result.__finalize__(self, method="unstack") @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt")) def melt( @@ -7416,9 +7468,9 @@ def _gotitem( >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) A B - max NaN 8.0 - min 1.0 2.0 sum 12.0 NaN + min 1.0 2.0 + max NaN 8.0 Aggregate different functions over the columns and rename the index of the resulting DataFrame. @@ -8521,6 +8573,7 @@ def count(self, axis=0, level=None, numeric_only=False): See Also -------- Series.count: Number of non-NA elements in a Series. + DataFrame.value_counts: Count unique combinations of columns. DataFrame.shape: Number of DataFrame rows and columns (including NA elements). 
DataFrame.isna: Boolean same-sized DataFrame showing places of NA @@ -8655,11 +8708,10 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None + own_dtypes = [arr.dtype for arr in self._iter_column_arrays()] + dtype_is_dt = np.array( - [ - is_datetime64_any_dtype(values.dtype) - for values in self._iter_column_arrays() - ], + [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], dtype=bool, ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): @@ -8673,7 +8725,11 @@ def _reduce( cols = self.columns[~dtype_is_dt] self = self[cols] - any_object = self.dtypes.apply(is_object_dtype).any() + any_object = np.array( + [is_object_dtype(dtype) for dtype in own_dtypes], + dtype=bool, + ).any() + # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) labels = self._get_agg_axis(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7d0b0d6683e21..fb5d1ec8fd0db 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4,7 +4,6 @@ from datetime import timedelta import functools import gc -from io import StringIO import json import operator import pickle @@ -34,7 +33,7 @@ from pandas._config import config from pandas._libs import lib -from pandas._libs.tslibs import Tick, Timestamp, to_offset +from pandas._libs.tslibs import Period, Tick, Timestamp, to_offset from pandas._typing import ( Axis, CompressionOptions, @@ -86,17 +85,21 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import missing, nanops +from pandas.core import indexing, missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.flags import Flags from pandas.core.indexes import base as ibase -from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import Period, PeriodIndex -import pandas.core.indexing as indexing +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + RangeIndex, + ensure_index, +) from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME @@ -105,7 +108,11 @@ from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.format import ( + DataFrameFormatter, + DataFrameRenderer, + format_percentiles, +) from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: @@ -193,7 +200,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"]) + _hidden_attrs: FrozenSet[str] = frozenset(["get_values", "tshift"]) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -253,7 +260,7 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: See Also -------- - DataFrame.flags + DataFrame.flags : Global flags applying to this object. 
""" if self._attrs is None: self._attrs = {} @@ -274,8 +281,8 @@ def flags(self) -> Flags: See Also -------- - Flags - DataFrame.attrs + Flags : Flags that apply to pandas objects. + DataFrame.attrs : Global metadata applying to this dataset. Notes ----- @@ -2276,6 +2283,10 @@ def to_json( and the default ``indent=None`` are equivalent in pandas, though this may change in a future release. + ``orient='table'`` contains a 'pandas_version' field under 'schema'. + This stores the version of `pandas` used in the latest revision of the + schema. + Examples -------- >>> import json @@ -2769,7 +2780,7 @@ def to_pickle( protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible - values are 0, 1, 2, 3, 4. A negative value for the protocol + values are 0, 1, 2, 3, 4, 5. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html. @@ -3005,6 +3016,9 @@ def to_latex( .. versionchanged:: 1.0.0 Added caption and label arguments. + .. versionchanged:: 1.2.0 + Added position argument, changed meaning of caption argument. + Parameters ---------- buf : str, Path or StringIO-like, optional, default None @@ -3066,11 +3080,16 @@ def to_latex( centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. - caption : str, optional - The LaTeX caption to be placed inside ``\caption{{}}`` in the output. + caption : str or tuple, optional + Tuple (full_caption, short_caption), + which results in ``\caption[short_caption]{{full_caption}}``; + if a single string is passed, no short caption will be set. .. versionadded:: 1.0.0 + .. versionchanged:: 1.2.0 + Optionally allow caption to be a tuple ``(full_caption, short_caption)``. + label : str, optional The LaTeX label to be placed inside ``\label{{}}`` in the output. This is used with ``\ref{{}}`` in the main ``.tex`` file. @@ -3079,6 +3098,8 @@ def to_latex( position : str, optional The LaTeX positional argument for tables, to be placed after ``\begin{{}}`` in the output. + + .. versionadded:: 1.2.0 {returns} See Also -------- @@ -3089,8 +3110,8 @@ def to_latex( Examples -------- >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], - ... mask=['red', 'purple'], - ... weapon=['sai', 'bo staff'])) + ... mask=['red', 'purple'], + ... 
weapon=['sai', 'bo staff'])) >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE \begin{{tabular}}{{lll}} \toprule @@ -3131,7 +3152,7 @@ def to_latex( escape=escape, decimal=decimal, ) - return formatter.to_latex( + return DataFrameRenderer(formatter).to_latex( buf=buf, column_format=column_format, longtable=longtable, @@ -3164,7 +3185,7 @@ def to_csv( date_format: Optional[str] = None, doublequote: bool_t = True, escapechar: Optional[str] = None, - decimal: Optional[str] = ".", + decimal: str = ".", errors: str = "strict", storage_options: StorageOptions = None, ) -> Optional[str]: @@ -3322,10 +3343,16 @@ def to_csv( """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - from pandas.io.formats.csvs import CSVFormatter + formatter = DataFrameFormatter( + frame=df, + header=header, + index=index, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + ) - formatter = CSVFormatter( - df, + return DataFrameRenderer(formatter).to_csv( path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3333,11 +3360,7 @@ def to_csv( errors=errors, compression=compression, quoting=quoting, - na_rep=na_rep, - float_format=float_format, - cols=columns, - header=header, - index=index, + columns=columns, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3345,16 +3368,8 @@ def to_csv( date_format=date_format, doublequote=doublequote, escapechar=escapechar, - decimal=decimal, storage_options=storage_options, ) - formatter.save() - - if path_or_buf is None: - assert isinstance(formatter.path_or_buf, StringIO) - return formatter.path_or_buf.getvalue() - - return None # ---------------------------------------------------------------------- # Lookup Caching @@ -3669,7 +3684,9 @@ class animal locomotion index = self.index if isinstance(index, MultiIndex): try: - loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + loc, new_index = self.index._get_loc_level( + key, level=0, drop_level=drop_level + ) except TypeError as e: raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: @@ -5335,7 +5352,7 @@ def __finalize__( A passed method name providing context on where ``__finalize__`` was called. - .. warning: + .. warning:: The value passed as `method` are not currently considered stable across pandas releases. @@ -5346,7 +5363,7 @@ def __finalize__( self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. - for name in self._metadata: + for name in set(self._metadata) & set(other._metadata): assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) @@ -5422,12 +5439,10 @@ def _dir_additions(self) -> Set[str]: add the string-like attributes from the info_axis. If info_axis is a MultiIndex, it's first level values are used. 
""" - additions = { - c - for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, str) and c.isidentifier() - } - return super()._dir_additions().union(additions) + additions = super()._dir_additions() + if self._info_axis._can_hold_strings: + additions.update(self._info_axis._dir_additions_for_owner) + return additions # ---------------------------------------------------------------------- # Consolidation of internals @@ -5466,7 +5481,7 @@ def _consolidate(self): @property def _is_mixed_type(self) -> bool_t: - if len(self._mgr.blocks) == 1: + if self._mgr.is_single_block: return False if self._mgr.any_extension_types: @@ -6254,7 +6269,7 @@ def fillna( axis = self._get_axis_number(axis) if value is None: - if len(self._mgr.blocks) > 1 and axis == 1: + if not self._mgr.is_single_block and axis == 1: if inplace: raise NotImplementedError() result = self.T.fillna(method=method, limit=limit).T @@ -6829,6 +6844,8 @@ def interpolate( **kwargs, ) -> Optional[FrameOrSeries]: """ + Fill NaN values using an interpolation method. + Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. @@ -7299,7 +7316,7 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: ------- {klass} Mask of bool values for each element in {klass} that - indicates whether an element is not an NA value. + indicates whether an element is an NA value. See Also -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ab40e25db84e..5d9028adb7372 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -79,8 +79,9 @@ class providing the base-class of operations. _common_see_also = """ See Also -------- - Series.%(name)s - DataFrame.%(name)s + Series.%(name)s : Apply a function %(name)s to a Series. + DataFrame.%(name)s : Apply a function %(name)s + to each row or column of a DataFrame. """ _apply_docs = dict( @@ -318,9 +319,12 @@ class providing the base-class of operations. See Also -------- -%(klass)s.groupby.apply -%(klass)s.groupby.aggregate -%(klass)s.transform +%(klass)s.groupby.apply : Apply function func group-wise + and combine the results together. +%(klass)s.groupby.aggregate : Aggregate using one or more + operations over the specified axis. +%(klass)s.transform : Transforms the Series on each group + based on the given function. Notes ----- @@ -427,9 +431,12 @@ class providing the base-class of operations. See Also -------- -{klass}.groupby.apply -{klass}.groupby.transform -{klass}.aggregate +{klass}.groupby.apply : Apply function func group-wise + and combine the results together. +{klass}.groupby.transform : Aggregate using one or more + operations over the specified axis. +{klass}.aggregate : Transforms the Series on each group + based on the given function. 
Notes ----- @@ -489,6 +496,21 @@ def group_selection_context(groupby: "BaseGroupBy") -> Iterator["BaseGroupBy"]: class BaseGroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): _group_selection: Optional[IndexLabel] = None _apply_allowlist: FrozenSet[str] = frozenset() + _hidden_attrs = PandasObject._hidden_attrs | { + "as_index", + "axis", + "dropna", + "exclusions", + "grouper", + "group_keys", + "keys", + "level", + "mutated", + "obj", + "observed", + "sort", + "squeeze", + } def __init__( self, @@ -1004,8 +1026,9 @@ def _agg_general( ): with group_selection_context(self): # try a cython aggregation if we can + result = None try: - return self._cython_agg_general( + result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, @@ -1024,8 +1047,9 @@ def _agg_general( raise # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + if result is None: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1194,12 +1218,12 @@ def reset_identity(values): # when the ax has duplicates # so we resort to this # GH 14776, 30667 - if ax.has_duplicates: + if ax.has_duplicates and not result.axes[self.axis].equals(ax): indexer, _ = result.index.get_indexer_non_unique(ax.values) indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis, copy=False) elif self.group_keys: @@ -1854,8 +1878,8 @@ def _fill(self, direction, limit=None): See Also -------- - pad - backfill + pad : Returns Series with minimum number of char in object. + backfill : Backward fill the missing values in the dataset. """ # Need int value for Cython if limit is None: @@ -1889,10 +1913,10 @@ def pad(self, limit=None): See Also -------- - Series.pad - DataFrame.pad - Series.fillna - DataFrame.fillna + Series.pad: Returns Series with minimum number of char in object. + DataFrame.pad: Object with missing values filled or None if inplace=True. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. """ return self._fill("ffill", limit=limit) @@ -1915,10 +1939,10 @@ def backfill(self, limit=None): See Also -------- - Series.backfill - DataFrame.backfill - Series.fillna - DataFrame.fillna + Series.backfill : Backward fill the missing values in the dataset. + DataFrame.backfill: Backward fill the missing values in the dataset. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. 
""" return self._fill("bfill", limit=limit) @@ -2571,9 +2595,9 @@ def _get_cythonized_result( except TypeError as e: error_msg = str(e) continue + vals = vals.astype(cython_dtype, copy=False) if needs_2d: vals = vals.reshape((-1, 1)) - vals = vals.astype(cython_dtype, copy=False) func = partial(func, vals) func = partial(func, labels) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index b9b2c4b07d37a..c97778f98387e 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -28,6 +28,11 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): + _hidden_attrs = PandasObject._hidden_attrs | { + "orig", + "name", + } + def __init__(self, data: "Series", orig): if not isinstance(data, ABCSeries): raise TypeError( @@ -236,8 +241,10 @@ def isocalendar(self): See Also -------- - Timestamp.isocalendar - datetime.date.isocalendar + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. Examples -------- @@ -326,7 +333,8 @@ def to_pytimedelta(self) -> np.ndarray: See Also -------- - datetime.timedelta + datetime.timedelta : A duration expressing the difference + between two date, time, or datetime. Examples -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7d3c2c2297d5d..8c9747f551c0a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -9,8 +9,10 @@ FrozenSet, Hashable, List, + NewType, Optional, Sequence, + Set, Tuple, TypeVar, Union, @@ -23,15 +25,13 @@ from pandas._libs import algos as libalgos, index as libindex, lib import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array, no_default -from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp -from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc -from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.cast import ( maybe_cast_to_integer_array, validate_numeric_casting, @@ -66,7 +66,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( - ABCCategorical, ABCDatetimeIndex, ABCMultiIndex, ABCPandasArray, @@ -77,16 +76,16 @@ ) from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.core import ops +from pandas.core import missing, ops from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList -import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ensure_key_mapped, nargsort @@ -121,7 +120,9 @@ _o_dtype = np.dtype(object) 
-_Identity = object + + +_Identity = NewType("_Identity", object) def _new_Index(cls, d): @@ -193,9 +194,9 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations: FrozenSet[str] = ( - PandasObject._deprecations - | IndexOpsMixin._deprecations + _hidden_attrs: FrozenSet[str] = ( + PandasObject._hidden_attrs + | IndexOpsMixin._hidden_attrs | frozenset(["contains", "set_value"]) ) @@ -220,7 +221,7 @@ def _outer_indexer(self, left, right): _typ = "index" _data: Union[ExtensionArray, np.ndarray] - _id = None + _id: Optional[_Identity] = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We # don't allow this anymore, and raise if it happens rather than @@ -230,6 +231,7 @@ def _outer_indexer(self, left, right): _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True + _can_hold_strings = True # would we like our indexing holder to defer to us _defer_to_indexing = False @@ -435,8 +437,9 @@ def _simple_new(cls, values, name: Label = None): result._index_data = values result._name = name result._cache = {} + result._reset_identity() - return result._reset_identity() + return result @cache_readonly def _constructor(self): @@ -540,15 +543,20 @@ def is_(self, other) -> bool: -------- Index.identical : Works like ``Index.is_`` but also checks metadata. """ - # use something other than None to be clearer - return self._id is getattr(other, "_id", Ellipsis) and self._id is not None + if self is other: + return True + elif not hasattr(other, "_id"): + return False + elif self._id is None or other._id is None: + return False + else: + return self._id is other._id - def _reset_identity(self): + def _reset_identity(self) -> None: """ Initializes or resets ``_id`` attribute with new object. """ - self._id = _Identity() - return self + self._id = _Identity(object()) def _cleanup(self): self._engine.clear_mapping() @@ -562,6 +570,19 @@ def _engine(self): target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) + @cache_readonly + def _dir_additions_for_owner(self) -> Set[str_t]: + """ + Add the string-like labels to the owner dataframe/series dir output. + + If this is a MultiIndex, it's first level values are used. + """ + return { + c + for c in self.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } + # -------------------------------------------------------------------- # Array-Like Methods @@ -607,7 +628,7 @@ def ravel(self, order="C"): See Also -------- - numpy.ndarray.ravel + numpy.ndarray.ravel : Return a flattened array. """ warnings.warn( "Index.ravel returning ndarray is deprecated; in a future version " @@ -703,7 +724,8 @@ def astype(self, dtype, copy=True): See Also -------- - numpy.ndarray.take + numpy.ndarray.take: Return an array formed from the + elements of a at the given indices. """ @Appender(_index_shared_docs["take"] % _index_doc_kwargs) @@ -1550,12 +1572,19 @@ def droplevel(self, level=0): levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - if len(level) == 0: + return self._drop_level_numbers(levnums) + + def _drop_level_numbers(self, levnums: List[int]): + """ + Drop MultiIndex levels by level _number_, not name. + """ + + if len(levnums) == 0: return self - if len(level) >= self.nlevels: + if len(levnums) >= self.nlevels: raise ValueError( - f"Cannot remove {len(level)} levels from an index with {self.nlevels} " - "levels: at least one level must be left." 
+ f"Cannot remove {len(levnums)} levels from an index with " + f"{self.nlevels} levels: at least one level must be left." ) # The two checks above guarantee that here self is a MultiIndex self = cast("MultiIndex", self) @@ -2296,8 +2325,8 @@ def unique(self, level=None): See Also -------- - unique - Series.unique + unique : Numpy array of unique values in that column. + Series.unique : Return unique values of Series object. """ if level is not None: self._validate_index_level(level) @@ -2662,6 +2691,11 @@ def _union(self, other, sort): return self._shallow_copy(result) def _wrap_setop_result(self, other, result): + if isinstance(self, (ABCDatetimeIndex, ABCTimedeltaIndex)) and isinstance( + result, np.ndarray + ): + result = type(self._data)._simple_new(result, dtype=self.dtype) + name = get_op_result_name(self, other) if isinstance(result, Index): if result.name != name: @@ -2738,10 +2772,10 @@ def intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - result = other.take(indexer) + result = other.take(indexer)._values if sort is None: - result = algos.safe_sort(result.values) + result = algos.safe_sort(result) return self._wrap_setop_result(other, result) @@ -2798,7 +2832,7 @@ def difference(self, other, sort=None): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this.values.take(label_diff) + the_diff = this._values.take(label_diff) if sort is None: try: the_diff = algos.safe_sort(the_diff) @@ -3402,6 +3436,10 @@ def _reindex_non_unique(self, target): """ target = ensure_index(target) + if len(target) == 0: + # GH#13691 + return self[:0], np.array([], dtype=np.intp), None + indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 new_labels = self.take(indexer[check]) @@ -3575,8 +3613,12 @@ def _join_multi(self, other, how, return_indexers=True): from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names - self_names = set(com.not_none(*self.names)) - other_names = set(com.not_none(*other.names)) + self_names_list = list(com.not_none(*self.names)) + other_names_list = list(com.not_none(*other.names)) + self_names_order = self_names_list.index + other_names_order = other_names_list.index + self_names = set(self_names_list) + other_names = set(other_names_list) overlap = self_names & other_names # need at least 1 in common @@ -3586,8 +3628,8 @@ def _join_multi(self, other, how, return_indexers=True): if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): # Drop the non-matching levels from left and right respectively - ldrop_names = list(self_names - overlap) - rdrop_names = list(other_names - overlap) + ldrop_names = sorted(self_names - overlap, key=self_names_order) + rdrop_names = sorted(other_names - overlap, key=other_names_order) # if only the order differs if not len(ldrop_names + rdrop_names): @@ -3912,7 +3954,7 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: This is an ndarray or ExtensionArray. - ``_values`` are consistent between``Series`` and ``Index``. + ``_values`` are consistent between ``Series`` and ``Index``. It may differ from the public '.values' method. 
@@ -3927,7 +3969,7 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: See Also -------- - values + values : Values """ return self._data @@ -4205,7 +4247,7 @@ def _concat(self, to_concat, name): """ to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] - result = _concat.concat_compat(to_concat) + result = concat_compat(to_concat) return Index(result, name=name) def putmask(self, mask, value): @@ -4218,7 +4260,8 @@ def putmask(self, mask, value): See Also -------- - numpy.ndarray.putmask + numpy.ndarray.putmask : Changes elements of an array + based on conditional and input values. """ values = self.values.copy() try: @@ -4983,7 +5026,7 @@ def isin(self, values, level=None): self._validate_index_level(level) return algos.isin(self, values) - def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = True): + def _get_string_slice(self, key: str_t): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError @@ -5365,11 +5408,13 @@ def _cmp_method(self, other, op): if len(self) != len(other): raise ValueError("Lengths must match to compare") - if is_object_dtype(self.dtype) and isinstance(other, ABCCategorical): - left = type(other)(self._values, dtype=other.dtype) - return op(left, other) - elif is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): - # e.g. PeriodArray + if not isinstance(other, ABCMultiIndex): + other = extract_array(other, extract_numpy=True) + else: + other = np.asarray(other) + + if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): + # e.g. PeriodArray, Categorical with np.errstate(all="ignore"): result = op(self._values, other) @@ -5384,7 +5429,7 @@ def _cmp_method(self, other, op): else: with np.errstate(all="ignore"): - result = ops.comparison_op(self._values, np.asarray(other), op) + result = ops.comparison_op(self._values, other, op) return result diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 8038bc6bf1c72..6b64fd6a20e9a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -161,9 +161,14 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): _typ = "categoricalindex" + @property + def _can_hold_strings(self): + return self.categories._can_hold_strings + codes: np.ndarray categories: Index _data: Categorical + _values: Categorical @property def _engine_type(self): @@ -234,20 +239,23 @@ def _shallow_copy(self, values=None, name: Label = no_default): return super()._shallow_copy(values=values, name=name) - def _is_dtype_compat(self, other) -> bool: + def _is_dtype_compat(self, other) -> Categorical: """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) + Returns + ------- + Categorical + Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): - if isinstance(other, CategoricalIndex): - other = other._values + other = extract_array(other) if not other.is_dtype_equal(self): raise TypeError( "categories must match existing categories when appending" @@ -262,6 +270,7 @@ def _is_dtype_compat(self, other) -> bool: raise TypeError( "cannot append a non-category item to a CategoricalIndex" ) + other = other._values return other @@ -372,11 +381,6 @@ def astype(self, dtype, copy=True): return Index.astype(self, dtype=dtype, copy=copy) - @cache_readonly - def _isnan(self): - """ return if each value is nan""" - return self._data.codes == -1 - 
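Editor's note: both `CategoricalIndex._is_dtype_compat` and the rewritten `Index._cmp_method` above now lean on `extract_array` to peel pandas wrappers off their operands, so one code path handles ndarray- and ExtensionArray-backed data. A quick probe of that helper; it is internal API, shown only to make the normalization concrete:

import pandas as pd
from pandas.core.construction import extract_array  # internal helper

print(type(extract_array(pd.Series([1, 2]), extract_numpy=True)))
# <class 'numpy.ndarray'>: the Series wrapper is peeled off
print(type(extract_array(pd.Series(pd.Categorical(["a", "b"])))))
# <class 'pandas.core.arrays.categorical.Categorical'>: EAs stay EAs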
@doc(Index.fillna) def fillna(self, value, downcast=None): value = self._validate_scalar(value) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 3845a601000a0..08ebae7d5c9aa 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -1,13 +1,13 @@ """ Base and utility classes for tseries type pandas objects. """ -from datetime import datetime, tzinfo -from typing import TYPE_CHECKING, Any, List, Optional, TypeVar, Union, cast +from datetime import datetime +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, TypeVar, Union, cast import numpy as np from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib -from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones +from pandas._libs.tslibs import BaseOffset, Resolution, Tick from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -78,7 +78,7 @@ def wrapper(left, right): @inherit_names( - ["inferred_freq", "_isnan", "_resolution_obj", "resolution"], + ["inferred_freq", "_resolution_obj", "resolution"], DatetimeLikeArrayMixin, cache=True, ) @@ -88,6 +88,7 @@ class DatetimeIndexOpsMixin(ExtensionIndex): Common ops mixin to support a unified interface datetimelike Index. """ + _can_hold_strings = False _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] freq: Optional[BaseOffset] freqstr: Optional[str] @@ -398,16 +399,12 @@ def _partial_date_slice( self, reso: Resolution, parsed: datetime, - use_lhs: bool = True, - use_rhs: bool = True, ): """ Parameters ---------- reso : Resolution parsed : datetime - use_lhs : bool, default True - use_rhs : bool, default True Returns ------- @@ -416,14 +413,13 @@ def _partial_date_slice( self._validate_partial_date_slice(reso) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - i8vals = self.asi8 - unbox = self._data._unbox_scalar + vals = self._data._ndarray + unbox = self._data._unbox if self.is_monotonic: if len(self) and ( - (use_lhs and t1 < self[0] and t2 < self[0]) - or (use_rhs and t1 > self[-1] and t2 > self[-1]) + (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) ): # we are out of range raise KeyError @@ -431,14 +427,13 @@ def _partial_date_slice( # TODO: does this depend on being monotonic _increasing_? 
# a monotonic (sorted) series can be sliced - # Use asi8.searchsorted to avoid re-validating Periods/Timestamps - left = i8vals.searchsorted(unbox(t1), side="left") if use_lhs else None - right = i8vals.searchsorted(unbox(t2), side="right") if use_rhs else None + left = vals.searchsorted(unbox(t1), side="left") + right = vals.searchsorted(unbox(t2), side="right") return slice(left, right) else: - lhs_mask = (i8vals >= unbox(t1)) if use_lhs else True - rhs_mask = (i8vals <= unbox(t2)) if use_rhs else True + lhs_mask = vals >= unbox(t1) + rhs_mask = vals <= unbox(t2) # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] @@ -489,7 +484,7 @@ def isin(self, values, level=None): @Appender(Index.where.__doc__) def where(self, cond, other=None): - values = self.view("i8") + values = self._data._ndarray try: other = self._data._validate_where_value(other) @@ -498,7 +493,7 @@ def where(self, cond, other=None): oth = getattr(other, "dtype", other) raise TypeError(f"Where requires matching dtype, not {oth}") from err - result = np.where(cond, values, other).astype("i8") + result = np.where(cond, values, other) arr = self._data._from_backing_data(result) return type(self)._simple_new(arr, name=self.name) @@ -615,7 +610,8 @@ def insert(self, loc: int, item): ------- new_index : Index """ - item = self._data._validate_insert_value(item) + value = self._data._validate_insert_value(item) + item = self._data._box_func(value) freq = None if is_period_dtype(self.dtype): @@ -635,10 +631,8 @@ def insert(self, loc: int, item): freq = self.freq arr = self._data - item = arr._unbox_scalar(item) - item = arr._rebox_native(item) - new_values = np.concatenate([arr._ndarray[:loc], [item], arr._ndarray[loc:]]) + new_values = np.concatenate([arr._ndarray[:loc], [value], arr._ndarray[loc:]]) new_arr = self._data._from_backing_data(new_values) new_arr._freq = freq @@ -668,9 +662,7 @@ def _wrap_joined_index(self, joined: np.ndarray, other): @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): try: - return self._data._validate_listlike( - keyarr, "convert_arr_indexer", allow_object=True - ) + return self._data._validate_listlike(keyarr, allow_object=True) except (ValueError, TypeError): return com.asarray_tuplesafe(keyarr) @@ -681,8 +673,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): but not PeriodIndex """ - tz: Optional[tzinfo] - # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing @@ -696,9 +686,6 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name if values is not None: - # TODO: We would rather not get here - if isinstance(values, np.ndarray): - values = type(self._data)(values, dtype=self.dtype) return self._simple_new(values, name=name) result = self._simple_new(self._data, name=name) @@ -943,22 +930,9 @@ def join( sort=sort, ) - def _maybe_utc_convert(self, other): - this = self - if not hasattr(self, "tz"): - return this, other - - if isinstance(other, type(self)): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other + def _maybe_utc_convert(self: _T, other: Index) -> Tuple[_T, Index]: 
+ # Overridden by DatetimeIndex + return self, other # -------------------------------------------------------------------- # List-Like Methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 479e2023a00cb..005272750997e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,12 +1,18 @@ from datetime import date, datetime, time, timedelta, tzinfo import operator -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Tuple import warnings import numpy as np from pandas._libs import NaT, Period, Timestamp, index as libindex, lib -from pandas._libs.tslibs import Resolution, ints_to_pydatetime, parsing, to_offset +from pandas._libs.tslibs import ( + Resolution, + ints_to_pydatetime, + parsing, + timezones, + to_offset, +) from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -158,9 +164,11 @@ class DatetimeIndex(DatetimeTimedeltaMixin): time timetz dayofyear + day_of_year weekofyear week dayofweek + day_of_week weekday quarter tz @@ -411,6 +419,21 @@ def union_many(self, others): return this.rename(res_name) return this + def _maybe_utc_convert(self, other: Index) -> Tuple["DatetimeIndex", Index]: + this = self + + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + # -------------------------------------------------------------------- def _get_time_micros(self): @@ -729,11 +752,11 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): self._deprecate_mismatched_indexing(label) return self._maybe_cast_for_get_loc(label) - def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + def _get_string_slice(self, key: str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) parsed, reso = parsing.parse_time_string(key, freq) reso = Resolution.from_attrname(reso) - loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) + loc = self._partial_date_slice(reso, parsed) return loc def slice_indexer(self, start=None, end=None, step=None, kind=None): diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index c9367b7e2ee1d..4da1a43468b57 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -277,3 +277,7 @@ def astype(self, dtype, copy=True): # pass copy=False because any copying will be done in the # _data.astype call above return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) + + @cache_readonly + def _isnan(self) -> np.ndarray: + return self._data.isna() diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index cb25ef1241ce0..d8fab1283dfdc 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -37,7 +37,6 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs @@ -192,11 +191,9 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): # we would like our indexing holder to defer to us _defer_to_indexing = True - # 
Immutable, so we are able to cache computations like isna in '_mask' - _mask = None - _data: IntervalArray _values: IntervalArray + _can_hold_strings = False # -------------------------------------------------------------------- # Constructors @@ -342,15 +339,6 @@ def _shallow_copy( result._cache = self._cache return result - @cache_readonly - def _isnan(self): - """ - Return a mask indicating if each value is NA. - """ - if self._mask is None: - self._mask = isna(self.left) - return self._mask - @cache_readonly def _engine(self): left = self._maybe_convert_i8(self.left) @@ -1302,10 +1290,8 @@ def interval_range( else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): - range_func = date_range + breaks = date_range(start=start, end=end, periods=periods, freq=freq) else: - range_func = timedelta_range - - breaks = range_func(start=start, end=end, periods=periods, freq=freq) + breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2ce4538a63d25..cbc0617ae96d3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -258,7 +258,7 @@ class MultiIndex(Index): of the mentioned helper methods. """ - _deprecations = Index._deprecations | frozenset() + _hidden_attrs = Index._hidden_attrs | frozenset() # initialize to zero-length tuples to make everything work _typ = "multiindex" @@ -316,7 +316,9 @@ def __new__( new_codes = result._verify_integrity() result._codes = new_codes - return result._reset_identity() + result._reset_identity() + + return result def _validate_codes(self, level: List, code: List): """ @@ -461,7 +463,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiInde if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] - return MultiIndex( + return cls( levels=levels, codes=codes, sortorder=sortorder, @@ -532,7 +534,7 @@ def from_tuples( else: arrays = zip(*tuples) - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + return cls.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod def from_product(cls, iterables, sortorder=None, names=lib.no_default): @@ -591,7 +593,7 @@ def from_product(cls, iterables, sortorder=None, names=lib.no_default): # codes are all ndarrays, so cartesian_product is lossless codes = cartesian_product(codes) - return MultiIndex(levels, codes, sortorder=sortorder, names=names) + return cls(levels, codes, sortorder=sortorder, names=names) @classmethod def from_frame(cls, df, sortorder=None, names=None): @@ -1537,7 +1539,10 @@ def is_monotonic_increasing(self) -> bool: return if the index is monotonic increasing (only equal or increasing) values. """ - if all(x.is_monotonic for x in self.levels): + if any(-1 in code for code in self.codes): + return False + + if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. 
GH27495 return libalgos.is_lexsorted( [x.astype("int64", copy=False) for x in self.codes] @@ -2862,16 +2867,29 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): >>> mi.get_loc_level(['b', 'e']) (1, None) """ + if not isinstance(level, (list, tuple)): + level = self._get_level_number(level) + else: + level = [self._get_level_number(lev) for lev in level] + return self._get_loc_level(key, level=level, drop_level=drop_level) + + def _get_loc_level( + self, key, level: Union[int, List[int]] = 0, drop_level: bool = True + ): + """ + get_loc_level but with `level` known to be positional, not name-based. + """ + # different name to distinguish from maybe_droplevels def maybe_mi_droplevels(indexer, levels, drop_level: bool): if not drop_level: return self[indexer] # kludge around orig_index = new_index = self[indexer] - levels = [self._get_level_number(i) for i in levels] + for i in sorted(levels, reverse=True): try: - new_index = new_index.droplevel(i) + new_index = new_index._drop_level_numbers([i]) except ValueError: # no dropping here @@ -2885,7 +2903,7 @@ def maybe_mi_droplevels(indexer, levels, drop_level: bool): ) result = None for lev, k in zip(level, key): - loc, new_index = self.get_loc_level(k, level=lev) + loc, new_index = self._get_loc_level(k, level=lev) if isinstance(loc, slice): mask = np.zeros(len(self), dtype=bool) mask[loc] = True @@ -2895,8 +2913,6 @@ def maybe_mi_droplevels(indexer, levels, drop_level: bool): return result, maybe_mi_droplevels(result, level, drop_level) - level = self._get_level_number(level) - # kludge for #1796 if isinstance(key, list): key = tuple(key) @@ -2961,7 +2977,8 @@ def partial_selection(key, indexer=None): indexer = self._get_level_indexer(key, level=level) return indexer, maybe_mi_droplevels(indexer, [level], drop_level) - def _get_level_indexer(self, key, level=0, indexer=None): + def _get_level_indexer(self, key, level: int = 0, indexer=None): + # `level` kwarg is _always_ positional, never name # return an indexer, boolean array or a slice showing where the key is # in the totality of values # if the indexer is provided, then use this @@ -3093,7 +3110,6 @@ def get_locs(self, seq): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) """ - from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -3766,13 +3782,13 @@ def maybe_droplevels(index, key): if isinstance(key, tuple): for _ in key: try: - index = index.droplevel(0) + index = index._drop_level_numbers([0]) except ValueError: # we have dropped too much, so back out return original_index else: try: - index = index.droplevel(0) + index = index._drop_level_numbers([0]) except ValueError: pass diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 385bffa4bc315..facaae3a65f16 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -42,6 +42,7 @@ class NumericIndex(Index): _default_dtype: np.dtype _is_numeric_dtype = True + _can_hold_strings = False def __new__(cls, data=None, dtype=None, copy=False, name=None): cls._validate_dtype(dtype) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ce2839ab9a8e1..e90d31c6957b7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -95,7 +95,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): ---------- day dayofweek + day_of_week dayofyear + day_of_year 
days_in_month daysinmonth end_time @@ -148,7 +150,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _supports_partial_string_indexing = True # -------------------------------------------------------------------- - # methods that dispatch to array and wrap result in PeriodIndex + # methods that dispatch to array and wrap result in Index # These are defined here instead of via inherit_names for mypy @doc(PeriodArray.asfreq) @@ -161,6 +163,24 @@ def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: arr = self._data.to_timestamp(freq, how) return DatetimeIndex._simple_new(arr, name=self.name) + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.hour.fget) + def hour(self) -> Int64Index: + return Int64Index(self._data.hour, name=self.name) + + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.minute.fget) + def minute(self) -> Int64Index: + return Int64Index(self._data.minute, name=self.name) + + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.second.fget) + def second(self) -> Int64Index: + return Int64Index(self._data.second, name=self.name) + # ------------------------------------------------------------------------ # Index Constructors @@ -214,7 +234,7 @@ def __new__( if data is None and ordinal is not None: # we strangely ignore `ordinal` if data is passed. ordinal = np.asarray(ordinal, dtype=np.int64) - data = PeriodArray(ordinal, freq) + data = PeriodArray(ordinal, freq=freq) else: # don't pass copy here, since we copy later. data = period_array(data=data, freq=freq) @@ -622,12 +642,11 @@ def _validate_partial_date_slice(self, reso: Resolution): # why is that check not needed? raise ValueError - def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): - # TODO: Check for non-True use_lhs/use_rhs + def _get_string_slice(self, key: str): parsed, reso = parse_time_string(key, self.freq) reso = Resolution.from_attrname(reso) try: - return self._partial_date_slice(reso, parsed, use_lhs, use_rhs) + return self._partial_date_slice(reso, parsed) except KeyError as err: raise KeyError(key) from err diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 14098ddadb8e2..74ec892d5f8a0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -811,6 +811,15 @@ def any(self, *args, **kwargs) -> bool: # -------------------------------------------------------------------- + def _cmp_method(self, other, op): + if isinstance(other, RangeIndex) and self._range == other._range: + if op in {operator.eq, operator.le, operator.ge}: + return np.ones(len(self), dtype=bool) + elif op in {operator.ne, operator.lt, operator.gt}: + return np.zeros(len(self), dtype=bool) + + return super()._cmp_method(other, op) + def _arith_method(self, other, op): """ Parameters diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a49c0741b64fe..dad9ba446941f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -633,7 +633,7 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e - def _ensure_listlike_indexer(self, key, axis=None): + def _ensure_listlike_indexer(self, key, axis=None, value=None): """ Ensure that a list-like of column labels are all present by adding them if they do not already exist. 
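Editor's note: the new `RangeIndex._cmp_method` above short-circuits comparisons between indexes backed by an identical range: equality-like operators return all-True, strict orderings all-False, with no elementwise work. Roughly, on a pandas build containing this change:

import pandas as pd

ri = pd.RangeIndex(4)
print(ri == pd.RangeIndex(4))     # [ True  True  True  True] via the fast path
print(ri < pd.RangeIndex(4))      # [False False False False]
print(ri == pd.RangeIndex(1, 5))  # different range: falls back to elementwise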
@@ -663,9 +663,14 @@ def _ensure_listlike_indexer(self, key, axis=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for k in key: + for i, k in enumerate(key): if k not in self.obj: - self.obj[k] = np.nan + if value is None: + self.obj[k] = np.nan + elif is_list_like(value): + self.obj[k] = value[i] + else: + self.obj[k] = value def __setitem__(self, key, value): if isinstance(key, tuple): @@ -1535,14 +1540,15 @@ def _setitem_with_indexer(self, indexer, value): info_axis = self.obj._info_axis_number # maybe partial set - take_split_path = len(self.obj._mgr.blocks) > 1 + take_split_path = not self.obj._mgr.is_single_block # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._mgr.blocks: - (blk,) = self.obj._mgr.blocks - if 1 < blk.ndim: # in case of dict, keys are indices + if self.ndim > 1: + # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value + blk = self.obj._mgr.blocks[0] take_split_path = not blk._can_hold_element(val) # if we have any multi-indexes that have non-trivial slices @@ -1576,10 +1582,7 @@ def _setitem_with_indexer(self, indexer, value): # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like - len_non_info_axes = ( - len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i - ) - if any(not l for l in len_non_info_axes): + if not len(self.obj): if not is_list_like_indexer(value): raise ValueError( "cannot set a frame with no " @@ -1645,10 +1648,7 @@ def _setitem_with_indexer(self, indexer, value): labels = item_labels[info_idx] # Ensure we have something we can iterate over - ilocs = info_idx - if isinstance(info_idx, slice): - ri = Index(range(len(self.obj.columns))) - ilocs = ri[info_idx] + ilocs = self._ensure_iterable_column_indexer(indexer[1]) plane_indexer = indexer[:1] lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) @@ -1667,8 +1667,6 @@ def _setitem_with_indexer(self, indexer, value): "length than the value" ) - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: @@ -1695,7 +1693,7 @@ def _setitem_with_indexer(self, indexer, value): else: v = np.nan - self._setitem_single_column(loc, v, pi) + self._setitem_single_column(loc, v, plane_indexer) elif not unique_cols: raise ValueError( @@ -1713,7 +1711,7 @@ def _setitem_with_indexer(self, indexer, value): else: v = np.nan - self._setitem_single_column(loc, v, pi) + self._setitem_single_column(loc, v, plane_indexer) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. 
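Editor's note: the `_ensure_listlike_indexer` change above seeds each missing column from the supplied value (one entry per new label, or a broadcast scalar) instead of always pre-filling NaN, which lets newly created columns keep an appropriate dtype. A standalone sketch of that seeding rule, with a plain dict standing in for the DataFrame:

import numpy as np

def seed_missing_columns(obj: dict, keys, value=None):
    for i, k in enumerate(keys):
        if k not in obj:
            if value is None:
                obj[k] = np.nan
            elif isinstance(value, (list, tuple, np.ndarray)):
                obj[k] = value[i]  # one entry per new column
            else:
                obj[k] = value     # scalar: same seed for every new column

cols = {"a": [1, 2]}
seed_missing_columns(cols, ["b", "c"], value=[10, 20])
print(cols)  # {'a': [1, 2], 'b': 10, 'c': 20}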
@@ -1732,7 +1730,9 @@ def _setitem_with_indexer(self, indexer, value): for i, loc in enumerate(ilocs): # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), pi) + self._setitem_single_column( + loc, value[:, i].tolist(), plane_indexer + ) elif ( len(labels) == 1 @@ -1741,7 +1741,7 @@ def _setitem_with_indexer(self, indexer, value): ): # we have an equal len list/ndarray # We only get here with len(labels) == len(ilocs) == 1 - self._setitem_single_column(ilocs[0], value, pi) + self._setitem_single_column(ilocs[0], value, plane_indexer) elif lplane_indexer == 0 and len(value) == len(self.obj.index): # We get here in one case via .loc with a all-False mask @@ -1756,15 +1756,15 @@ def _setitem_with_indexer(self, indexer, value): ) for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) + self._setitem_single_column(loc, v, plane_indexer) else: # scalar value for loc in ilocs: - self._setitem_single_column(loc, value, pi) + self._setitem_single_column(loc, value, plane_indexer) else: - self._setitem_single_block_inplace(indexer, value) + self._setitem_single_block(indexer, value) def _setitem_single_column(self, loc: int, value, plane_indexer): # positional setting on column loc @@ -1791,10 +1791,9 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # reset the sliced object if unique self.obj._iset_item(loc, ser) - def _setitem_single_block_inplace(self, indexer, value): + def _setitem_single_block(self, indexer, value): """ - _setitem_with_indexer for the case when we have a single Block - and the value can be set into it without casting. + _setitem_with_indexer for the case when we have a single Block. """ from pandas import Series @@ -1898,6 +1897,20 @@ def _setitem_with_indexer_missing(self, indexer, value): self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) + def _ensure_iterable_column_indexer(self, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. + """ + # Ensure we have something we can iterate over + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[column_indexer] + else: + ilocs = column_indexer + return ilocs + def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False): """ Parameters diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 61d9833a50a9f..24b00199611bf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,8 +6,7 @@ import numpy as np -from pandas._libs import NaT, algos as libalgos, lib, writers -import pandas._libs.internals as libinternals +from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare @@ -341,7 +340,7 @@ def dtype(self): def iget(self, i): return self.values[i] - def set(self, locs, values): + def set_inplace(self, locs, values): """ Modify block values in-place with new item value. 
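Editor's note: `_ensure_iterable_column_indexer`, factored out above, replaces the ad-hoc slice handling that previously lived at the call site. A standalone rendition of the normalization it performs (not the method itself):

import numpy as np
import pandas as pd

def ensure_iterable_column_indexer(column_indexer, ncols: int):
    if isinstance(column_indexer, (int, np.integer)):
        return [column_indexer]                        # single position
    if isinstance(column_indexer, slice):
        return pd.Index(range(ncols))[column_indexer]  # materialize the slice
    return column_indexer                              # already iterable

print(list(ensure_iterable_column_indexer(2, 5)))            # [2]
print(list(ensure_iterable_column_indexer(slice(1, 4), 5)))  # [1, 2, 3]
print(list(ensure_iterable_column_indexer([0, 3], 5)))       # [0, 3]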
@@ -1677,7 +1676,9 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def set(self, locs, values): + def set_inplace(self, locs, values): + # NB: This is a misnomer, is supposed to be inplace but is not, + # see GH#33457 assert locs.tolist() == [0] self.values = values @@ -2232,7 +2233,7 @@ def _can_hold_element(self, element: Any) -> bool: return is_valid_nat_for_dtype(element, self.dtype) - def set(self, locs, values): + def set_inplace(self, locs, values): """ See Block.set.__doc__ """ diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 8d54f88558066..8559fe72972b8 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -115,7 +115,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: - if mgr._is_single_block: + if mgr.is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] @@ -217,9 +217,7 @@ def is_na(self) -> bool: # a block is NOT null, chunks should help in such cases. 1000 value # was chosen rather arbitrarily. values = self.block.values - if self.block.is_categorical: - values_flat = values.categories - elif is_sparse(self.block.values.dtype): + if is_sparse(self.block.values.dtype): return False elif self.block.is_extension: # TODO(EA2D): no need for special case with 2D EAs diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a1b255640a37d..8b67788b1a1a1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -225,7 +225,7 @@ def set_axis(self, axis: int, new_labels: Index) -> None: self.axes[axis] = new_labels @property - def _is_single_block(self) -> bool: + def is_single_block(self) -> bool: # Assumes we are 2D; overridden by SingleBlockManager return len(self.blocks) == 1 @@ -833,7 +833,7 @@ def as_array( # mutating the original object copy = copy or na_value is not lib.no_default - if self._is_single_block: + if self.is_single_block: blk = self.blocks[0] if blk.is_extension: # Avoid implicit conversion of extension blocks to object @@ -1083,7 +1083,7 @@ def value_getitem(placement): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): - blk.set(blk_locs, value_getitem(val_locs)) + blk.set_inplace(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1313,7 +1313,7 @@ def _slice_take_blocks_ax0( slice_or_indexer, self.shape[0], allow_fill=allow_fill ) - if self._is_single_block: + if self.is_single_block: blk = self.blocks[0] if sl_type in ("slice", "mask"): @@ -1503,7 +1503,7 @@ class SingleBlockManager(BlockManager): _is_consolidated = True _known_consolidated = True __slots__ = () - _is_single_block = True + is_single_block = True def __init__( self, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f2354f649b1e3..3ab08ab2cda56 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -228,7 +228,7 @@ def _maybe_get_mask( # Boolean data cannot contain nulls, so signal via mask being None return None - if skipna: + if skipna or needs_i8_conversion(values.dtype): mask = isna(values) return mask @@ -327,7 +327,10 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool: def _wrap_results(result, dtype: DtypeObj, fill_value=None): """ wrap our results if needed """ - if is_datetime64_any_dtype(dtype): + if result is NaT: + pass + + elif 
is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT @@ -336,7 +339,13 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan - result = Timestamp(result, tz=tz) + if tz is not None: + # we get here e.g. via nanmean when we call it on a DTA[tz] + result = Timestamp(result, tz=tz) + elif isna(result): + result = np.datetime64("NaT", "ns") + else: + result = np.int64(result).view("datetime64[ns]") else: # If we have float dtype, taking a view will give the wrong result result = result.astype(dtype) @@ -378,18 +387,17 @@ def _na_for_min_count( if is_numeric_dtype(values): values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) + if fill_value is NaT: + fill_value = values.dtype.type("NaT", "ns") if values.ndim == 1: return fill_value + elif axis is None: + return fill_value else: - assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] - # calling np.full with dtype parameter throws an ValueError when called - # with dtype=np.datetime64 and and fill_value=pd.NaT - try: - result = np.full(result_shape, fill_value, dtype=values.dtype) - except ValueError: - result = np.full(result_shape, fill_value) + + result = np.full(result_shape, fill_value, dtype=values.dtype) return result @@ -498,18 +506,43 @@ def nansum( >>> nanops.nansum(s) 3.0 """ + orig_values = values + values, mask, dtype, dtype_max, _ = _get_values( values, skipna, fill_value=0, mask=mask ) dtype_sum = dtype_max + datetimelike = False if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): + datetimelike = True dtype_sum = np.float64 + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) - return _wrap_results(the_sum, dtype) + the_sum = _wrap_results(the_sum, dtype) + if datetimelike and not skipna: + the_sum = _mask_datetimelike_result(the_sum, axis, mask, orig_values) + return the_sum + + +def _mask_datetimelike_result( + result: Union[np.ndarray, np.datetime64, np.timedelta64], + axis: Optional[int], + mask: np.ndarray, + orig_values: np.ndarray, +): + if isinstance(result, np.ndarray): + # we need to apply the mask + result = result.astype("i8").view(orig_values.dtype) + axis_mask = mask.any(axis=axis) + result[axis_mask] = iNaT + else: + if mask.any(): + result = NaT + return result @disallow(PeriodDtype) @@ -544,21 +577,25 @@ def nanmean( >>> nanops.nanmean(s) 1.5 """ + orig_values = values + values, mask, dtype, dtype_max, _ = _get_values( values, skipna, fill_value=0, mask=mask ) dtype_sum = dtype_max dtype_count = np.float64 + # not using needs_i8_conversion because that includes period - if ( - is_integer_dtype(dtype) - or is_datetime64_any_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + datetimelike = False + if dtype.kind in ["m", "M"]: + datetimelike = True + dtype_sum = np.float64 + elif is_integer_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype + count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) @@ -573,7 +610,10 @@ def nanmean( else: the_mean = the_sum / count if count > 0 else np.nan - return _wrap_results(the_mean, dtype) + the_mean = _wrap_results(the_mean, dtype) + if datetimelike and not skipna: + the_mean = _mask_datetimelike_result(the_mean, axis, mask, 
orig_values) + return the_mean @bottleneck_switch() @@ -639,16 +679,37 @@ def get_median(x): # empty set so return nans of shape "everything but the passed axis" # since "axis" is where the reduction would occur if we had a nonempty # array - shp = np.array(values.shape) - dims = np.arange(values.ndim) - ret = np.empty(shp[dims != axis]) - ret.fill(np.nan) + ret = get_empty_reduction_result(values.shape, axis, np.float_, np.nan) return _wrap_results(ret, dtype) # otherwise return a scalar value return _wrap_results(get_median(values) if notempty else np.nan, dtype) +def get_empty_reduction_result( + shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any +) -> np.ndarray: + """ + The result from a reduction on an empty ndarray. + + Parameters + ---------- + shape : Tuple[int] + axis : int + dtype : np.dtype + fill_value : Any + + Returns + ------- + np.ndarray + """ + shp = np.array(shape) + dims = np.arange(len(shape)) + ret = np.empty(shp[dims != axis], dtype=dtype) + ret.fill(fill_value) + return ret + + def _get_counts_nanvar( value_counts: Tuple[int], mask: Optional[np.ndarray], @@ -859,10 +920,13 @@ def reduction( mask: Optional[np.ndarray] = None, ) -> Dtype: + orig_values = values values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) + datetimelike = orig_values.dtype.kind in ["m", "M"] + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) @@ -873,7 +937,12 @@ def reduction( result = getattr(values, meth)(axis) result = _wrap_results(result, dtype, fill_value) - return _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out(result, axis, mask, values.shape) + + if datetimelike and not skipna: + result = _mask_datetimelike_result(result, axis, mask, orig_values) + + return result return reduction diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 69b5abe65057a..2b159c607b0a0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -13,8 +13,8 @@ from pandas._typing import Level from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -25,11 +25,14 @@ get_array_op, logical_op, ) -from pandas.core.ops.common import unpack_zerodim_and_defer # noqa:F401 +from pandas.core.ops.common import ( # noqa:F401 + get_op_result_name, + unpack_zerodim_and_defer, +) from pandas.core.ops.docstrings import ( _flex_comp_doc_FRAME, - _make_flex_doc, _op_descriptions, + make_flex_doc, ) from pandas.core.ops.invalid import invalid_comparison # noqa:F401 from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 @@ -76,67 +79,6 @@ COMPARISON_BINOPS: Set[str] = {"eq", "ne", "lt", "gt", "le", "ge"} -# ----------------------------------------------------------------------------- -# Ops Wrapping Utilities - - -def get_op_result_name(left, right): - """ - Find the appropriate name to pin to an operation result. This result - should always be either an Index or a Series. 
- - Parameters - ---------- - left : {Series, Index} - right : object - - Returns - ------- - name : object - Usually a string - """ - # `left` is always a Series when called from within ops - if isinstance(right, (ABCSeries, ABCIndexClass)): - name = _maybe_match_name(left, right) - else: - name = left.name - return name - - -def _maybe_match_name(a, b): - """ - Try to find a name to attach to the result of an operation between - a and b. If only one of these has a `name` attribute, return that - name. Otherwise return a consensus name if they match of None if - they have different names. - - Parameters - ---------- - a : object - b : object - - Returns - ------- - name : str or None - - See Also - -------- - pandas.core.common.consensus_name_attr - """ - a_has = hasattr(a, "name") - b_has = hasattr(b, "name") - if a_has and b_has: - if a.name == b.name: - return a.name - else: - # TODO: what if they both have np.nan for their names? - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support @@ -209,7 +151,7 @@ def align_method_SERIES(left: "Series", right, align_asobject: bool = False): def flex_method_SERIES(op): name = op.__name__.strip("_") - doc = _make_flex_doc(name, "series") + doc = make_flex_doc(name, "series") @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -311,6 +253,11 @@ def to_series(right): ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): + # GH 36702. Raise when attempting arithmetic with list of array-like. + if any(is_array_like(el) for el in right): + raise ValueError( + f"Unable to coerce list of {type(right[0])} to Series/DataFrame" + ) # GH17901 right = to_series(right) @@ -445,7 +392,7 @@ def flex_arith_method_FRAME(op): default_axis = "columns" na_op = get_array_op(op) - doc = _make_flex_doc(op_name, "dataframe") + doc = make_flex_doc(op_name, "dataframe") @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index fd5f126051c53..97fa7988c1774 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -57,7 +57,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): return result.reshape(x.shape) -def masked_arith_op(x: np.ndarray, y, op): +def _masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). @@ -116,7 +116,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def na_arithmetic_op(left, right, op, is_cmp: bool = False): +def _na_arithmetic_op(left, right, op, is_cmp: bool = False): """ Return the result of evaluating op on the passed in values. @@ -147,7 +147,7 @@ def na_arithmetic_op(left, right, op, is_cmp: bool = False): # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = masked_arith_op(left, right, op) + result = _masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise @@ -179,7 +179,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # on `left` and `right`. 
     lvalues = maybe_upcast_datetimelike_array(left)
     rvalues = maybe_upcast_datetimelike_array(right)
-    rvalues = maybe_upcast_for_op(rvalues, lvalues.shape)
+    rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape)
 
     if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta):
         # Timedelta is included because numexpr will fail on it, see GH#31457
@@ -187,7 +187,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op):
 
     else:
         with np.errstate(all="ignore"):
-            res_values = na_arithmetic_op(lvalues, rvalues, op)
+            res_values = _na_arithmetic_op(lvalues, rvalues, op)
 
     return res_values
 
@@ -248,7 +248,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         # suppress warnings from numpy about element-wise comparison
         warnings.simplefilter("ignore", DeprecationWarning)
         with np.errstate(all="ignore"):
-            res_values = na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
+            res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
 
     return res_values
 
@@ -427,7 +427,7 @@ def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike:
     return obj
 
 
-def maybe_upcast_for_op(obj, shape: Tuple[int, ...]):
+def _maybe_upcast_for_op(obj, shape: Tuple[int, ...]):
     """
     Cast non-pandas objects to pandas types to unify behavior of arithmetic
    and comparison operations.
diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py
index 515a0a5198d74..a6bcab44e5519 100644
--- a/pandas/core/ops/common.py
+++ b/pandas/core/ops/common.py
@@ -65,3 +65,60 @@ def new_method(self, other):
         return method(self, other)
 
     return new_method
+
+
+def get_op_result_name(left, right):
+    """
+    Find the appropriate name to pin to an operation result. This result
+    should always be either an Index or a Series.
+
+    Parameters
+    ----------
+    left : {Series, Index}
+    right : object
+
+    Returns
+    -------
+    name : object
+        Usually a string
+    """
+    if isinstance(right, (ABCSeries, ABCIndexClass)):
+        name = _maybe_match_name(left, right)
+    else:
+        name = left.name
+    return name
+
+
+def _maybe_match_name(a, b):
+    """
+    Try to find a name to attach to the result of an operation between
+    a and b. If only one of these has a `name` attribute, return that
+    name. Otherwise return a consensus name if they match, or None if
+    they have different names.
+
+    Parameters
+    ----------
+    a : object
+    b : object
+
+    Returns
+    -------
+    name : str or None
+
+    See Also
+    --------
+    pandas.core.common.consensus_name_attr
+    """
+    a_has = hasattr(a, "name")
+    b_has = hasattr(b, "name")
+    if a_has and b_has:
+        if a.name == b.name:
+            return a.name
+        else:
+            # TODO: what if they both have np.nan for their names?
+ return None + elif a_has: + return a.name + elif b_has: + return b.name + return None diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 839bdbfb2444a..06ed321327e06 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -4,7 +4,7 @@ from typing import Dict, Optional -def _make_flex_doc(op_name, typ: str): +def make_flex_doc(op_name: str, typ: str) -> str: """ Make the appropriate substitutions for the given operation and class-typ into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring @@ -26,6 +26,8 @@ def _make_flex_doc(op_name, typ: str): assert op_desc_op is not None # for mypy if op_name.startswith("r"): equiv = "other " + op_desc_op + " " + typ + elif op_name == "divmod": + equiv = f"{op_name}({typ}, other)" else: equiv = typ + " " + op_desc_op + " other" @@ -162,6 +164,25 @@ def _make_flex_doc(op_name, typ: str): """ ) +_divmod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.divmod(b, fill_value=0) +(a 1.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64, + a 0.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64) +""" +) + _mod_example_SERIES = ( _common_examples_algebra_SERIES + """ @@ -332,7 +353,7 @@ def _make_flex_doc(op_name, typ: str): "op": "divmod", "desc": "Integer division and modulo", "reverse": "rdivmod", - "series_examples": None, + "series_examples": _divmod_example_SERIES, "series_returns": _returns_tuple, "df_examples": None, }, @@ -427,33 +448,6 @@ def _make_flex_doc(op_name, typ: str): Series.{reverse} : {see_also_desc}. """ -_arith_doc_FRAME = """ -Binary operator %s with support to substitute a fill_value for missing data in -one of the inputs - -Parameters ----------- -other : Series, DataFrame, or constant -axis : {0, 1, 'index', 'columns'} - For Series input, axis to match Series index on -fill_value : None or float value, default None - Fill existing missing (NaN) values, and any new element needed for - successful DataFrame alignment, with this value before computation. - If data in both corresponding DataFrame locations is missing - the result will be missing -level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - -Returns -------- -result : DataFrame - -Notes ------ -Mismatched indices will be unioned together -""" - _flex_doc_FRAME = """ Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3f1b1dac080a7..78d217c4688b6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -131,7 +131,7 @@ def __iter__(self): See Also -------- - GroupBy.__iter__ + GroupBy.__iter__ : Generator yielding sequence for each group. """ self._set_binner() return super().__iter__() @@ -235,9 +235,12 @@ def pipe(self, func, *args, **kwargs): """ See Also -------- - DataFrame.groupby.aggregate - DataFrame.resample.transform - DataFrame.aggregate + DataFrame.groupby.aggregate : Aggregate using callable, string, dict, + or list of string/callables. + DataFrame.resample.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate: Aggregate using one or more + operations over the specified axis. 
""" ) @@ -369,8 +372,9 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except DataError: + except (DataError, AttributeError, KeyError): # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -453,8 +457,8 @@ def pad(self, limit=None): See Also -------- - Series.fillna - DataFrame.fillna + Series.fillna: Fill NA/NaN values using the specified method. + DataFrame.fillna: Fill NA/NaN values using the specified method. """ return self._upsample("pad", limit=limit) @@ -828,8 +832,8 @@ def asfreq(self, fill_value=None): See Also -------- - Series.asfreq - DataFrame.asfreq + Series.asfreq: Convert TimeSeries to specified frequency. + DataFrame.asfreq: Convert TimeSeries to specified frequency. """ return self._upsample("asfreq", fill_value=fill_value) @@ -915,8 +919,13 @@ def quantile(self, q=0.5, **kwargs): See Also -------- Series.quantile + Return a series, where the index is q and the values are the quantiles. DataFrame.quantile + Return a DataFrame, where the columns are the columns of self, + and the values are the quantiles. DataFrameGroupBy.quantile + Return a DataFrame, where the coulmns are groupby columns, + and the values are its quantiles. """ return self._downsample("quantile", q=q, **kwargs) @@ -1072,7 +1081,7 @@ def _upsample(self, method, limit=None, fill_value=None): See Also -------- - .fillna + .fillna: Fill NA/NaN values using the specified method. """ self._set_binner() @@ -1208,7 +1217,7 @@ def _upsample(self, method, limit=None, fill_value=None): See Also -------- - .fillna + .fillna: Fill NA/NaN values using the specified method. 
""" # we may need to actually resample as if we are timestamps diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 13052c23a9f11..77b1076920f20 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,7 +3,7 @@ """ from collections import abc -from typing import TYPE_CHECKING, Iterable, List, Mapping, Union, overload +from typing import TYPE_CHECKING, Iterable, List, Mapping, Type, Union, cast, overload import numpy as np @@ -30,7 +30,7 @@ from pandas.core.internals import concatenate_block_managers if TYPE_CHECKING: - from pandas import DataFrame + from pandas import DataFrame, Series from pandas.core.generic import NDFrame # --------------------------------------------------------------------- @@ -455,14 +455,17 @@ def __init__( self.new_axes = self._get_new_axes() def get_result(self): + cons: Type[FrameOrSeriesUnion] + sample: FrameOrSeriesUnion # series only if self._is_series: + sample = cast("Series", self.objs[0]) # stack blocks if self.bm_axis == 0: name = com.consensus_name_attr(self.objs) - cons = self.objs[0]._constructor + cons = sample._constructor arrs = [ser._values for ser in self.objs] @@ -475,7 +478,7 @@ def get_result(self): data = dict(zip(range(len(self.objs)), self.objs)) # GH28330 Preserves subclassed objects through concat - cons = self.objs[0]._constructor_expanddim + cons = sample._constructor_expanddim index, columns = self.new_axes df = cons(data, index=index) @@ -484,6 +487,8 @@ def get_result(self): # combine block managers else: + sample = cast("DataFrame", self.objs[0]) + mgrs_indexers = [] for obj in self.objs: indexers = {} @@ -506,7 +511,7 @@ def get_result(self): if not self.copy: new_data._consolidate_inplace() - cons = self.objs[0]._constructor + cons = sample._constructor return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 493ba87565220..5012be593820e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -11,8 +11,7 @@ import numpy as np -from pandas._libs import Timedelta, hashtable as libhashtable, lib -import pandas._libs.join as libjoin +from pandas._libs import Timedelta, hashtable as libhashtable, join as libjoin, lib from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution diff --git a/pandas/core/series.py b/pandas/core/series.py index fb684da20bac8..379c4ac1c9526 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -67,7 +67,6 @@ remove_na_arraylike, ) -import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import aggregate, transform @@ -76,6 +75,7 @@ from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.construction import ( + array as pd_array, create_series_with_explicit_dtype, extract_array, is_empty_data, @@ -181,9 +181,9 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _metadata: List[str] = ["name"] _internal_names_set = {"index"} | generic.NDFrame._internal_names_set _accessors = {"dt", "cat", "str", "sparse"} - _deprecations = ( - base.IndexOpsMixin._deprecations - | generic.NDFrame._deprecations + _hidden_attrs = ( + base.IndexOpsMixin._hidden_attrs + | generic.NDFrame._hidden_attrs | frozenset(["compress", "ptp"]) ) @@ -1472,6 +1472,10 @@ def 
to_markdown( str {klass} in Markdown-friendly format. + Notes + ----- + Requires the `tabulate <https://pypi.org/project/tabulate>`_ package. + Examples -------- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") @@ -4193,7 +4197,7 @@ def f(x): if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used - return self._constructor_expanddim(pd.array(mapped), index=self.index) + return self._constructor_expanddim(pd_array(mapped), index=self.index) else: return self._constructor(mapped, index=self.index).__finalize__( self, method="apply" diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index d595b03a535ed..38e36c8ff8d01 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -4,7 +4,7 @@ _shared_docs[ "aggregate" -] = """\ +] = """ Aggregate using one or more operations over the specified axis. Parameters @@ -46,7 +46,7 @@ _shared_docs[ "compare" -] = """\ +] = """ Compare to another %(klass)s and show the differences. .. versionadded:: 1.1.0 @@ -75,7 +75,7 @@ _shared_docs[ "groupby" -] = """\ +] = """ Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the @@ -144,7 +144,7 @@ _shared_docs[ "melt" -] = """\ +] = """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. This function is useful to massage a DataFrame into a format where one @@ -258,7 +258,7 @@ _shared_docs[ "transform" -] = """\ +] = """ Call ``func`` on self producing a {klass} with transformed values. Produced {klass} will have same axis length as self. diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e02b565ed5d7b..1132234ae7f8d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -327,6 +327,7 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, + mask: Optional[np.ndarray] = None, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. @@ -341,19 +342,27 @@ def nargsort( ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None + mask : Optional[np.ndarray], default None + Passed when called by ExtensionArray.argsort. 
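The mask parameter documented above pairs with the dispatch change just below: extension-backed values are now handed to their own ExtensionArray.argsort rather than being converted via _values_for_argsort. A sketch against the private helper (private API, subject to change):

import pandas as pd
from pandas.core.sorting import nargsort

arr = pd.array([2, None, 1], dtype="Int64")
# The nullable array sorts itself and receives na_position directly.
print(nargsort(arr, na_position="first"))  # [1 2 0]: the missing value leads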
""" if key is not None: items = ensure_key_mapped(items, key) return nargsort( - items, kind=kind, ascending=ascending, na_position=na_position, key=None + items, + kind=kind, + ascending=ascending, + na_position=na_position, + key=None, + mask=mask, ) items = extract_array(items) - mask = np.asarray(isna(items)) + if mask is None: + mask = np.asarray(isna(items)) if is_extension_array_dtype(items): - items = items._values_for_argsort() + return items.argsort(ascending=ascending, kind=kind, na_position=na_position) else: items = np.asanyarray(items) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7b384c9bbb47d..1553deeef4059 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -16,8 +16,16 @@ import numpy as np -from pandas._libs import tslib, tslibs -from pandas._libs.tslibs import Timestamp, conversion, parsing +from pandas._libs import tslib +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + Timedelta, + Timestamp, + conversion, + iNaT, + nat_strings, + parsing, +) from pandas._libs.tslibs.parsing import ( # noqa DateParseError, format_is_iso, @@ -404,7 +412,7 @@ def _convert_listlike_datetimes( # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, tslibs.OutOfBoundsDatetime) as err: + except (ValueError, TypeError, OutOfBoundsDatetime) as err: raise ValueError( "cannot convert the input to '%Y%m%d' date format" ) from err @@ -419,13 +427,13 @@ def _convert_listlike_datetimes( return _return_parsed_timezone_results( result, timezones, tz, name ) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: if errors == "raise": raise elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(tslibs.iNaT) + iresult.fill(iNaT) else: result = arg except ValueError: @@ -438,7 +446,7 @@ def _convert_listlike_datetimes( elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(tslibs.iNaT) + iresult.fill(iNaT) else: result = arg except ValueError as e: @@ -508,7 +516,7 @@ def _adjust_to_origin(arg, origin, unit): j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): - raise tslibs.OutOfBoundsDatetime( + raise OutOfBoundsDatetime( f"{original} is Out of Bounds for origin='julian'" ) else: @@ -525,10 +533,8 @@ def _adjust_to_origin(arg, origin, unit): # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) - except tslibs.OutOfBoundsDatetime as err: - raise tslibs.OutOfBoundsDatetime( - f"origin {origin} is Out of Bounds" - ) from err + except OutOfBoundsDatetime as err: + raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err except ValueError as err: raise ValueError( f"origin {origin} cannot be converted to a Timestamp" @@ -540,7 +546,7 @@ def _adjust_to_origin(arg, origin, unit): # convert the offset to the unit of the arg # this should be lossless in terms of precision - offset = offset // tslibs.Timedelta(1, unit=unit) + offset = offset // Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): @@ -809,7 +815,7 @@ def to_datetime( elif is_list_like(arg): try: cache_array = _maybe_cache(arg, format, cache, convert_listlike) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: # caching 
attempts to create a DatetimeIndex, which may raise # an OOB. If that's the desired behavior, then just reraise... if errors == "raise": @@ -965,7 +971,7 @@ def calc(carg): def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult[~mask] = tslibs.iNaT + iresult[~mask] = iNaT masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) result[mask] = masked_result.astype("M8[ns]") @@ -986,7 +992,7 @@ def calc_with_mask(carg, mask): # string with NaN-like try: - mask = ~algorithms.isin(arg, list(tslibs.nat_strings)) + mask = ~algorithms.isin(arg, list(nat_strings)) return calc_with_mask(arg, mask) except (ValueError, OverflowError, TypeError): pass diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index cff4695603d06..32ca83787c4c1 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -186,7 +186,7 @@ def to_numeric(arg, errors="raise", downcast=None): break if is_series: - return pd.Series(values, index=arg.index, name=arg.name) + return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index 3bac4cf0edb63..643c1165180b4 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -5,11 +5,9 @@ from pandas._libs.lib import is_list_like -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import notna -from pandas.core.indexes.base import Index - def to_time(arg, format=None, infer_time_format=False, errors="raise"): """ @@ -105,7 +103,7 @@ def _convert_listlike(arg, format): elif isinstance(arg, ABCSeries): values = _convert_listlike(arg._values, format) return arg._constructor(values, index=arg.index, name=arg.name) - elif isinstance(arg, Index): + elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 25938b57d9720..9f7040943d9a3 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1,7 +1,7 @@ import datetime from functools import partial from textwrap import dedent -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -17,6 +17,10 @@ from pandas.core.window.common import _doc_template, _shared_docs, zsqrt from pandas.core.window.rolling import BaseWindow, flex_binary_moment +if TYPE_CHECKING: + from pandas import Series + + _bias_template = """ Parameters ---------- @@ -60,6 +64,15 @@ def get_center_of_mass( return float(comass) +def wrap_result(obj: "Series", result: np.ndarray) -> "Series": + """ + Wrap a single 1D result. + """ + obj = obj._selected_obj + + return obj._constructor(result, obj.index, name=obj.name) + + class ExponentialMovingWindow(BaseWindow): r""" Provide exponential weighted (EW) functions. 
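The module-level wrap_result defined above takes over the single-Series wrapping that the removed BaseWindow._wrap_result method used to do for the EWM cov/corr paths; public behavior is unchanged. A quick illustration with arbitrary values:

import pandas as pd

s1 = pd.Series([1.0, 2.0, 3.0, 4.0])
s2 = pd.Series([4.0, 3.0, 2.0, 1.0])
# cov/corr between two Series still return Series; the wrapping of the raw
# 1D result now happens in the module-level helper.
ewm_cov = s1.ewm(com=0.5).cov(s2)
ewm_corr = s1.ewm(com=0.5).corr(s2)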
@@ -413,7 +426,7 @@ def _get_cov(X, Y): self.min_periods, bias, ) - return X._wrap_result(cov) + return wrap_result(X, cov) return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) @@ -467,7 +480,7 @@ def _cov(x, y): x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) corr = cov / zsqrt(x_var * y_var) - return X._wrap_result(corr) + return wrap_result(X, corr) return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 44e0e36b61d46..94875ba86db65 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,11 +1,14 @@ from textwrap import dedent -from typing import Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union +import numpy as np + +from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc from pandas.core.window.common import _doc_template, _shared_docs -from pandas.core.window.indexers import ExpandingIndexer, GroupbyIndexer +from pandas.core.window.indexers import BaseIndexer, ExpandingIndexer, GroupbyIndexer from pandas.core.window.rolling import BaseWindowGroupby, RollingAndExpandingMixin @@ -65,9 +68,17 @@ def __init__(self, obj, min_periods=1, center=None, axis=0, **kwargs): def _constructor(self): return Expanding - def _get_window(self, other=None, **kwargs): + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds """ - Get the window length over which to perform some operation. + return ExpandingIndexer() + + def _get_cov_corr_window( + self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None, **kwargs + ) -> int: + """ + Get the window length over which to perform cov and corr operations. 
Parameters ---------- @@ -128,19 +139,19 @@ def aggregate(self, func, *args, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["count"]) - def count(self, **kwargs): - return super().count(**kwargs) + def count(self): + return super().count() @Substitution(name="expanding") @Appender(_shared_docs["apply"]) def apply( self, - func, + func: Callable[..., Any], raw: bool = False, engine: Optional[str] = None, engine_kwargs: Optional[Dict[str, bool]] = None, - args=None, - kwargs=None, + args: Optional[Tuple[Any, ...]] = None, + kwargs: Optional[Dict[str, Any]] = None, ): return super().apply( func, @@ -183,19 +194,19 @@ def median(self, **kwargs): @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) @Substitution(name="expanding") @Appender(_shared_docs["sem"]) - def sem(self, ddof=1, *args, **kwargs): + def sem(self, ddof: int = 1, *args, **kwargs): return super().sem(ddof=ddof, **kwargs) @Substitution(name="expanding", func_name="skew") @@ -245,12 +256,23 @@ def quantile(self, quantile, interpolation="linear", **kwargs): @Substitution(name="expanding", func_name="cov") @Appender(_doc_template) @Appender(_shared_docs["cov"]) - def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + def cov( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + ddof: int = 1, + **kwargs, + ): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) @Substitution(name="expanding") @Appender(_shared_docs["corr"]) - def corr(self, other=None, pairwise=None, **kwargs): + def corr( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + **kwargs, + ): return super().corr(other=other, pairwise=pairwise, **kwargs) @@ -259,15 +281,10 @@ class ExpandingGroupby(BaseWindowGroupby, Expanding): Provide a expanding groupby implementation. 
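A sketch of the bounds produced by the ExpandingIndexer that the new Expanding._get_window_indexer above returns (private API; the import path follows the diff):

import numpy as np
from pandas.core.window.indexers import ExpandingIndexer

start, end = ExpandingIndexer().get_window_bounds(num_values=4)
# start stays pinned at the first row while end grows one row per window:
print(start)  # [0 0 0 0]
print(end)    # [1 2 3 4]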
""" - def _get_window_indexer(self, window: int) -> GroupbyIndexer: + def _get_window_indexer(self) -> GroupbyIndexer: """ Return an indexer class that will compute the window start and end bounds - Parameters - ---------- - window : int - window size for FixedWindowIndexer (unused) - Returns ------- GroupbyIndexer diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 71e77f97d8797..a8229257bb7bb 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -85,6 +85,10 @@ def get_window_bounds( end = np.arange(1 + offset, num_values + 1 + offset, dtype="int64") start = end - self.window_size + if closed in ["left", "both"]: + start -= 1 + if closed in ["left", "neither"]: + end -= 1 end = np.clip(end, 0, num_values) start = np.clip(start, 0, num_values) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index cc0927437ad1d..bfc31021a8f87 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -8,6 +8,7 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, + Any, Callable, Dict, List, @@ -73,80 +74,6 @@ from pandas.core.internals import Block # noqa:F401 -def calculate_center_offset(window: np.ndarray) -> int: - """ - Calculate an offset necessary to have the window label to be centered - for weighted windows. - - Parameters - ---------- - window: ndarray - window weights - - Returns - ------- - int - """ - return (len(window) - 1) // 2 - - -def calculate_min_periods( - window: int, - min_periods: Optional[int], - num_values: int, - required_min_periods: int, - floor: int, -) -> int: - """ - Calculate final minimum periods value for rolling aggregations. - - Parameters - ---------- - window : passed window value - min_periods : passed min periods value - num_values : total number of values - required_min_periods : required min periods per aggregation function - floor : required min periods per aggregation function - - Returns - ------- - min_periods : int - """ - if min_periods is None: - min_periods = window - else: - min_periods = max(required_min_periods, min_periods) - if min_periods > window: - raise ValueError(f"min_periods {min_periods} must be <= window {window}") - elif min_periods > num_values: - min_periods = num_values + 1 - elif min_periods < 0: - raise ValueError("min_periods must be >= 0") - return max(min_periods, floor) - - -def get_weighted_roll_func(cfunc: Callable) -> Callable: - """ - Wrap weighted rolling cython function with min periods argument. 
- - Parameters - ---------- - cfunc : function - Cython weighted rolling function - - Returns - ------- - function - """ - - def func(arg, window, min_periods=None): - if min_periods is None: - min_periods = len(window) - return cfunc(arg, window, min_periods) - - return func - - class BaseWindow(ShallowMixin, SelectionMixin): """Provides utilities for performing windowing operations.""" @@ -201,8 +128,15 @@ def is_freq_type(self) -> bool: def validate(self) -> None: if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if self.min_periods is not None and not is_integer(self.min_periods): - raise ValueError("min_periods must be an integer") + if self.min_periods is not None: + if not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") + elif self.min_periods < 0: + raise ValueError("min_periods must be >= 0") + elif is_integer(self.window) and self.min_periods > self.window: + raise ValueError( + f"min_periods {self.min_periods} must be <= window {self.window}" + ) if self.closed is not None and self.closed not in [ "right", "both", @@ -213,25 +147,19 @@ def validate(self) -> None: if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): raise TypeError(f"invalid type: {type(self)}") if isinstance(self.window, BaseIndexer): - self._validate_get_window_bounds_signature(self.window) - - @staticmethod - def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: - """ - Validate that the passed BaseIndexer subclass has - a get_window_bounds with the correct signature. - """ - get_window_bounds_signature = inspect.signature( - window.get_window_bounds - ).parameters.keys() - expected_signature = inspect.signature( - BaseIndexer().get_window_bounds - ).parameters.keys() - if get_window_bounds_signature != expected_signature: - raise ValueError( - f"{type(window).__name__} does not implement the correct signature for " - f"get_window_bounds" - ) + # Validate that the passed BaseIndexer subclass has + # a get_window_bounds with the correct signature. + get_window_bounds_signature = inspect.signature( + self.window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(self.window).__name__} does not implement " + f"the correct signature for get_window_bounds" + ) def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ @@ -285,38 +213,21 @@ def __getattr__(self, attr: str): def _dir_additions(self): return self.obj._dir_additions() - def _get_win_type(self, kwargs: Dict): - """ - Exists for compatibility, overridden by subclass Window. - - Parameters - ---------- - kwargs : dict - ignored, exists for compatibility - - Returns - ------- - None - """ - return None - - def _get_window(self, other=None, win_type: Optional[str] = None) -> int: + def _get_cov_corr_window( + self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None + ) -> Optional[Union[int, timedelta, BaseOffset, BaseIndexer]]: """ Return window length. 
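The reworked validate above moves the min_periods checks from execution time into window construction; a small sketch of the two error paths, with the messages taken from the hunk:

import pandas as pd

s = pd.Series(range(5))
try:
    s.rolling(2, min_periods=3)
except ValueError as err:
    print(err)  # min_periods 3 must be <= window 2
try:
    s.rolling(2, min_periods=-1)
except ValueError as err:
    print(err)  # min_periods must be >= 0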
Parameters ---------- other : - ignored, exists for compatibility - win_type : - ignored, exists for compatibility + Used in Expanding Returns ------- window : int """ - if isinstance(self.window, BaseIndexer): - return self.min_periods or 0 return self.window @property @@ -336,11 +247,10 @@ def __repr__(self) -> str: return f"{self._window_type} [{attrs}]" def __iter__(self): - window = self._get_window(win_type=None) obj = self._create_data(self._selected_obj) - index = self._get_window_indexer(window=window) + indexer = self._get_window_indexer() - start, end = index.get_window_bounds( + start, end = indexer.get_window_bounds( num_values=len(obj), min_periods=self.min_periods, center=self.center, @@ -382,14 +292,6 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: return values - def _wrap_result(self, result: np.ndarray) -> "Series": - """ - Wrap a single 1D result. - """ - obj = self._selected_obj - - return obj._constructor(result, obj.index, name=obj.name) - def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # if we have an 'on' column we want to put it back into # the results in the same location @@ -415,21 +317,7 @@ def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # insert at the end result[name] = extra_col - def _center_window(self, result: np.ndarray, window: np.ndarray) -> np.ndarray: - """ - Center the result in the window for weighted rolling aggregations. - """ - if self.axis > result.ndim - 1: - raise ValueError("Requested axis is larger then no. of argument dimensions") - - offset = calculate_center_offset(window) - if offset > 0: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) - return result - - def _get_roll_func(self, func_name: str) -> Callable: + def _get_roll_func(self, func_name: str) -> Callable[..., Any]: """ Wrap rolling function to check values passed. @@ -449,15 +337,17 @@ def _get_roll_func(self, func_name: str) -> Callable: ) return window_func - def _get_window_indexer(self, window: int) -> BaseIndexer: + def _get_window_indexer(self) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) - return FixedWindowIndexer(window_size=window) + return VariableWindowIndexer( + index_array=self._on.asi8, window_size=self.window + ) + return FixedWindowIndexer(window_size=self.window) def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None @@ -513,10 +403,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def _apply( self, - func: Callable, - require_min_periods: int = 0, - floor: int = 1, - is_weighted: bool = False, + func: Callable[..., Any], name: Optional[str] = None, use_numba_cache: bool = False, **kwargs, @@ -529,11 +416,7 @@ def _apply( Parameters ---------- func : callable function to apply - require_min_periods : int - floor : int - is_weighted : bool name : str, - compatibility with groupby.rolling use_numba_cache : bool whether to cache a numba compiled function. 
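__iter__ above now asks the window indexer for its bounds instead of computing a window length first; iteration itself is public API and unchanged in shape:

import pandas as pd

s = pd.Series([1, 2, 3])
# Each step yields the current (possibly not yet full) window as a Series.
for window in s.rolling(2):
    print(window.tolist())  # [1], then [1, 2], then [2, 3]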
Only available for numba enabled methods (so far only apply) @@ -544,9 +427,12 @@ ------- y : type of input """ - win_type = self._get_win_type(kwargs) - window = self._get_window(win_type=win_type) - window_indexer = self._get_window_indexer(window) + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) def homogeneous_func(values: np.ndarray): # calculation function @@ -554,36 +440,14 @@ def homogeneous_func(values: np.ndarray): if values.size == 0: return values.copy() - if not is_weighted: - - def calc(x): - if not isinstance(self.window, BaseIndexer): - min_periods = calculate_min_periods( - window, self.min_periods, len(x), require_min_periods, floor - ) - else: - min_periods = calculate_min_periods( - window_indexer.window_size, - self.min_periods, - len(x), - require_min_periods, - floor, - ) - start, end = window_indexer.get_window_bounds( - num_values=len(x), - min_periods=self.min_periods, - center=self.center, - closed=self.closed, - ) - return func(x, start, end, min_periods) - - else: - - def calc(x): - offset = calculate_center_offset(window) if self.center else 0 - additional_nans = np.array([np.nan] * offset) - x = np.concatenate((x, additional_nans)) - return func(x, window, self.min_periods) + def calc(x): + start, end = window_indexer.get_window_bounds( + num_values=len(x), + min_periods=min_periods, + center=self.center, + closed=self.closed, + ) + return func(x, start, end, min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -595,9 +459,6 @@ def calc(x): if use_numba_cache: NUMBA_FUNC_CACHE[(kwargs["original_func"], "rolling_apply")] = func - if self.center and is_weighted: - result = self._center_window(result, window) - return result return self._apply_blockwise(homogeneous_func, name) @@ -890,19 +751,13 @@ def __init__(self, obj, *args, **kwargs): def _apply( self, - func: Callable, - require_min_periods: int = 0, - floor: int = 1, - is_weighted: bool = False, + func: Callable[..., Any], name: Optional[str] = None, use_numba_cache: bool = False, **kwargs, - ): + ) -> FrameOrSeries: result = super()._apply( func, - require_min_periods, - floor, - is_weighted, name, use_numba_cache, **kwargs, @@ -995,10 +850,11 @@ class Window(BaseWindow): axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or - 'neither' endpoints. - For offset-based windows, it defaults to 'right'. - For fixed windows, defaults to 'both'. Remaining cases not implemented - for fixed windows. + 'neither' endpoints. Defaults to 'right'. + + .. versionchanged:: 1.2.0 + + The closed parameter with fixed windows is now supported. Returns ------- @@ -1166,7 +1022,7 @@ def validate(self): else: raise ValueError(f"Invalid window {window}") - def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]: + def _get_win_type(self, kwargs: Dict[str, Any]) -> Union[str, Tuple]: """ Extract arguments for the window type, provide validation for it and return the validated window type. @@ -1210,16 +1066,27 @@ def _pop_args(win_type, arg_names, kwargs): return _validate_win_type(self.win_type, kwargs) - def _get_window( - self, other=None, win_type: Optional[Union[str, Tuple]] = None + def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray: + """ + Center the result in the window for weighted rolling aggregations. + """ + if self.axis > result.ndim - 1: + raise ValueError("Requested axis is larger than no. 
of argument dimensions") + + if offset > 0: + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) + return result + + def _get_window_weights( + self, win_type: Optional[Union[str, Tuple]] = None ) -> np.ndarray: """ Get the window, weights. Parameters ---------- - other : - ignored, exists for compatibility win_type : str, or tuple type of window to create @@ -1237,6 +1104,61 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) + def _apply( + self, + func: Callable[[np.ndarray, int, int], np.ndarray], + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, + ): + """ + Rolling with weights statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. + + Parameters + ---------- + func : callable function to apply + name : str, + use_numba_cache : bool + whether to cache a numba compiled function. Only available for numba + enabled methods (so far only apply) + **kwargs + additional arguments for rolling function and window function + + Returns + ------- + y : type of input + """ + win_type = self._get_win_type(kwargs) + window = self._get_window_weights(win_type=win_type) + offset = (len(window) - 1) // 2 if self.center else 0 + + def homogeneous_func(values: np.ndarray): + # calculation function + + if values.size == 0: + return values.copy() + + def calc(x): + additional_nans = np.array([np.nan] * offset) + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods or len(window)) + + with np.errstate(all="ignore"): + if values.ndim > 1: + result = np.apply_along_axis(calc, self.axis, values) + else: + result = calc(values) + result = np.asarray(result) + + if self.center: + result = self._center_window(result, offset) + + return result + + return self._apply_blockwise(homogeneous_func, name) + _agg_see_also_doc = dedent( """ See Also @@ -1288,29 +1210,26 @@ def aggregate(self, func, *args, **kwargs): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_weighted_sum") - window_func = get_weighted_roll_func(window_func) - return self._apply(window_func, is_weighted=True, name="sum", **kwargs) + return self._apply(window_func, name="sum", **kwargs) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = self._get_roll_func("roll_weighted_mean") - window_func = get_weighted_roll_func(window_func) - return self._apply(window_func, is_weighted=True, name="mean", **kwargs) + return self._apply(window_func, name="mean", **kwargs) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) - window_func = get_weighted_roll_func(window_func) kwargs.pop("name", None) - return self._apply(window_func, is_weighted=True, name="var", **kwargs) + return self._apply(window_func, name="var", **kwargs) @Substitution(name="window", versionadded="\n.. 
versionadded:: 1.0.0\n") @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) @@ -1425,12 +1344,12 @@ def count(self): def apply( self, - func, + func: Callable[..., Any], raw: bool = False, engine: Optional[str] = None, - engine_kwargs: Optional[Dict] = None, - args: Optional[Tuple] = None, - kwargs: Optional[Dict] = None, + engine_kwargs: Optional[Dict[str, bool]] = None, + args: Optional[Tuple[Any, ...]] = None, + kwargs: Optional[Dict[str, Any]] = None, ): if args is None: args = () @@ -1453,14 +1372,19 @@ def apply( return self._apply( apply_func, - floor=0, use_numba_cache=maybe_use_numba(engine), original_func=func, args=args, kwargs=kwargs, ) - def _generate_cython_apply_func(self, args, kwargs, raw, func): + def _generate_cython_apply_func( + self, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + raw: bool, + function: Callable[..., Any], + ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]: from pandas import Series window_func = partial( @@ -1468,7 +1392,7 @@ def _generate_cython_apply_func(self, args, kwargs, raw, func): args=args, kwargs=kwargs, raw=raw, - func=func, + function=function, ) def apply_func(values, begin, end, min_periods, raw=raw): @@ -1481,7 +1405,7 @@ def apply_func(values, begin, end, min_periods, raw=raw): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_sum") - return self._apply(window_func, floor=0, name="sum", **kwargs) + return self._apply(window_func, name="sum", **kwargs) _shared_docs["max"] = dedent( """ @@ -1589,7 +1513,7 @@ def median(self, **kwargs): # the median function implementation return self._apply(window_func, name="median", **kwargs) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) window_func = self._get_roll_func("roll_var") @@ -1598,17 +1522,15 @@ def zsqrt_func(values, begin, end, min_periods): return self._apply( zsqrt_func, - require_min_periods=1, name="std", **kwargs, ) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_var"), ddof=ddof) return self._apply( window_func, - require_min_periods=1, name="var", **kwargs, ) @@ -1628,7 +1550,6 @@ def skew(self, **kwargs): window_func = self._get_roll_func("roll_skew") return self._apply( window_func, - require_min_periods=3, name="skew", **kwargs, ) @@ -1665,7 +1586,7 @@ def skew(self, **kwargs): """ ) - def sem(self, ddof=1, *args, **kwargs): + def sem(self, ddof: int = 1, *args, **kwargs): return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) _shared_docs["sem"] = dedent( @@ -1722,7 +1643,6 @@ def kurt(self, **kwargs): window_func = self._get_roll_func("roll_kurt") return self._apply( window_func, - require_min_periods=4, name="kurt", **kwargs, ) @@ -1781,7 +1701,7 @@ def kurt(self, **kwargs): """ ) - def quantile(self, quantile, interpolation="linear", **kwargs): + def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): if quantile == 1.0: window_func = self._get_roll_func("roll_max") elif quantile == 0.0: @@ -1789,14 +1709,10 @@ def quantile(self, quantile, interpolation="linear", **kwargs): else: window_func = partial( self._get_roll_func("roll_quantile"), - 
win=self._get_window(), quantile=quantile, interpolation=interpolation, ) - # Pass through for groupby.rolling - kwargs["quantile"] = quantile - kwargs["interpolation"] = interpolation return self._apply(window_func, name="quantile", **kwargs) _shared_docs[ @@ -1833,14 +1749,10 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): # GH 32865. We leverage rolling.mean, so we pass # to the rolling constructors the data used when constructing self: # window width, frequency data, or a BaseIndexer subclass - if isinstance(self.window, BaseIndexer): - window = self.window - else: - # GH 16058: offset window - if self.is_freq_type: - window = self.win_freq - else: - window = self._get_window(other) + # GH 16058: offset window + window = ( + self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq + ) def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data @@ -1982,10 +1894,10 @@ def corr(self, other=None, pairwise=None, **kwargs): # GH 32865. We leverage rolling.cov and rolling.std here, so we pass # to the rolling constructors the data used when constructing self: # window width, frequency data, or a BaseIndexer subclass - if isinstance(self.window, BaseIndexer): - window = self.window - else: - window = self._get_window(other) if not self.is_freq_type else self.win_freq + # GH 16058: offset window + window = ( + self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq + ) def _get_corr(a, b): a = a.rolling( @@ -1994,8 +1906,10 @@ def _get_corr(a, b): b = b.rolling( window=window, min_periods=self.min_periods, center=self.center ) - - return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) + # GH 31286: By using var instead of std we can avoid numerical + # issues when the result of var is within floating point precision + # while std is not. + return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5 return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) @@ -2065,11 +1979,6 @@ def validate(self): elif self.window < 0: raise ValueError("window must be non-negative") - if not self.is_datetimelike and self.closed is not None: - raise ValueError( - "closed only implemented for datetimelike and offset based windows" - ) - def _determine_window_length(self) -> Union[int, float]: """ Calculate freq for PeriodIndexes based on Index freq. Can not use @@ -2085,10 +1994,13 @@ def _validate_monotonic(self): Validate monotonic (increasing or decreasing). """ if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): - formatted = self.on - if self.on is None: - formatted = "index" - raise ValueError(f"{formatted} must be monotonic") + self._raise_monotonic_error() + + def _raise_monotonic_error(self): + formatted = self.on + if self.on is None: + formatted = "index" + raise ValueError(f"{formatted} must be monotonic") def _validate_freq(self): """ @@ -2288,28 +2200,25 @@ class RollingGroupby(BaseWindowGroupby, Rolling): Provide a rolling groupby implementation. 
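The corr hunk above leans on the identity cov / (std_a * std_b) == cov / sqrt(var_a * var_b); routing through var sidesteps the case the comment describes, where var is still within floating point precision but std no longer is. Checking the identity with plain numpy on arbitrary data:

import numpy as np

a = np.array([1.0, 2.0, 4.0, 7.0])
b = np.array([2.0, 2.5, 4.5, 8.0])
cov = ((a - a.mean()) * (b - b.mean())).mean()
# Same quantity computed both ways, up to floating point noise.
assert np.isclose(cov / (a.std() * b.std()), cov / (a.var() * b.var()) ** 0.5)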
""" - def _get_window_indexer(self, window: int) -> GroupbyIndexer: + def _get_window_indexer(self) -> GroupbyIndexer: """ Return an indexer class that will compute the window start and end bounds - Parameters - ---------- - window : int - window size for FixedWindowIndexer - Returns ------- GroupbyIndexer """ rolling_indexer: Type[BaseIndexer] - indexer_kwargs: Optional[Dict] = None + indexer_kwargs: Optional[Dict[str, Any]] = None index_array = self._on.asi8 + window = self.window if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ assert isinstance(indexer_kwargs, dict) # for mypy # We'll be using the index of each group later indexer_kwargs.pop("index_array", None) + window = 0 elif self.is_freq_type: rolling_indexer = VariableWindowIndexer else: @@ -2323,3 +2232,12 @@ def _get_window_indexer(self, window: int) -> GroupbyIndexer: indexer_kwargs=indexer_kwargs, ) return window_indexer + + def _validate_monotonic(self): + """ + Validate that on is monotonic; + in this case we have to check only for nans, because + monotonicy was already validated at a higher level. + """ + if self._on.hasnans: + self._raise_monotonic_error() diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 604b7e12ec243..3461652f4ea24 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -785,14 +785,19 @@ def _value_with_fmt(self, val): return val, fmt @classmethod - def check_extension(cls, ext): + def check_extension(cls, ext: str): """ checks that path's extension against the Writer's supported extensions. If it isn't supported, raises UnsupportedFiletypeError. """ if ext.startswith("."): ext = ext[1:] - if not any(ext in extension for extension in cls.supported_extensions): + # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__" + # (not iterable) [attr-defined] + if not any( + ext in extension + for extension in cls.supported_extensions # type: ignore[attr-defined] + ): raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") else: return True diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d0e9163fc5f11..6c62d6825bc84 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Any, Dict, Hashable, Iterator, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np @@ -13,6 +13,7 @@ from pandas._typing import ( CompressionOptions, FilePathOrBuffer, + FloatFormatType, IndexLabel, Label, StorageOptions, @@ -30,18 +31,17 @@ from pandas.io.common import get_filepath_or_buffer, get_handle +if TYPE_CHECKING: + from pandas.io.formats.format import DataFrameFormatter + class CSVFormatter: def __init__( self, - obj, + formatter: "DataFrameFormatter", path_or_buf: Optional[FilePathOrBuffer[str]] = None, sep: str = ",", - na_rep: str = "", - float_format: Optional[str] = None, cols: Optional[Sequence[Label]] = None, - header: Union[bool, Sequence[Hashable]] = True, - index: bool = True, index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, @@ -54,10 +54,11 @@ def __init__( date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, - decimal=".", storage_options: StorageOptions = None, ): - self.obj = obj + self.fmt = formatter + + self.obj = self.fmt.frame self.encoding = encoding or "utf-8" @@ -79,35 
+80,45 @@ def __init__( self.mode = ioargs.mode self.sep = sep - self.na_rep = na_rep - self.float_format = float_format - self.decimal = decimal - self.header = header - self.index = index - self.index_label = index_label + self.index_label = self._initialize_index_label(index_label) self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL - self.quotechar = quotechar + self.quotechar = self._initialize_quotechar(quotechar) self.doublequote = doublequote self.escapechar = escapechar self.line_terminator = line_terminator or os.linesep self.date_format = date_format - self.cols = cols # type: ignore[assignment] - self.chunksize = chunksize # type: ignore[assignment] + self.cols = self._initialize_columns(cols) + self.chunksize = self._initialize_chunksize(chunksize) + + @property + def na_rep(self) -> str: + return self.fmt.na_rep + + @property + def float_format(self) -> Optional["FloatFormatType"]: + return self.fmt.float_format @property - def index_label(self) -> IndexLabel: - return self._index_label + def decimal(self) -> str: + return self.fmt.decimal - @index_label.setter - def index_label(self, index_label: Optional[IndexLabel]) -> None: + @property + def header(self) -> Union[bool, Sequence[str]]: + return self.fmt.header + + @property + def index(self) -> bool: + return self.fmt.index + + def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: if index_label is not False: if index_label is None: - index_label = self._get_index_label_from_obj() + return self._get_index_label_from_obj() elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): # given a string for a DF with Index - index_label = [index_label] - self._index_label = index_label + return [index_label] + return index_label def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): @@ -122,30 +133,17 @@ def _get_index_label_flat(self) -> List[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] - @property - def quotechar(self) -> Optional[str]: + def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv - return self._quotechar + return quotechar return None - @quotechar.setter - def quotechar(self, quotechar: Optional[str]) -> None: - self._quotechar = quotechar - @property def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - @property - def cols(self) -> Sequence[Label]: - return self._cols - - @cols.setter - def cols(self, cols: Optional[Sequence[Label]]) -> None: - self._cols = self._refine_cols(cols) - - def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: + def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -161,12 +159,16 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # update columns to include possible multiplicity of dupes # and make sure sure cols is just a list of labels - cols = self.obj.columns - if isinstance(cols, ABCIndexClass): - return cols._format_native_types(**self._number_format) + new_cols = self.obj.columns + if isinstance(new_cols, ABCIndexClass): + return new_cols._format_native_types(**self._number_format) else: - assert isinstance(cols, Sequence) - return list(cols) + return list(new_cols) + + def _initialize_chunksize(self, chunksize: Optional[int]) -> int: + if chunksize is None: + 
return (100000 // (len(self.cols) or 1)) or 1 + return int(chunksize) @property def _number_format(self) -> Dict[str, Any]: @@ -179,17 +181,6 @@ def _number_format(self) -> Dict[str, Any]: decimal=self.decimal, ) - @property - def chunksize(self) -> int: - return self._chunksize - - @chunksize.setter - def chunksize(self, chunksize: Optional[int]) -> None: - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - assert chunksize is not None - self._chunksize = int(chunksize) - @property def data_index(self) -> Index: data_index = self.obj.index diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3e4780ec21378..3c759f477899b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -39,8 +39,15 @@ from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import FilePathOrBuffer, Label -from pandas.errors import AbstractMethodError +from pandas._typing import ( + ArrayLike, + CompressionOptions, + FilePathOrBuffer, + FloatFormatType, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -75,10 +82,10 @@ if TYPE_CHECKING: from pandas import Categorical, DataFrame, Series + FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] -FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] @@ -98,7 +105,7 @@ index : bool, optional, default True Whether to print index (row) labels. na_rep : str, optional, default 'NaN' - String representation of NAN to use. + String representation of ``NaN`` to use. formatters : list, tuple or dict of one-param. functions, optional Formatter functions to apply to columns' elements by position or name. @@ -106,7 +113,12 @@ List/tuple must be of length equal to the number of columns. float_format : one-parameter function, optional, default None Formatter function to apply to columns' elements if they are - floats. The result of this function must be a unicode string. + floats. This function must return a unicode string and will be + applied only to the non-``NaN`` elements, with ``NaN`` being + handled by ``na_rep``. + + .. versionchanged:: 1.2.0 + sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. 
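A short sketch of the 1.2.0 behavior documented above: the float_format callable now only ever sees real floats, while missing cells are rendered through na_rep:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.23456, np.nan]})
# "{:.2f}".format is never handed the NaN, so it cannot choke on it.
print(df.to_string(float_format="{:.2f}".format, na_rep="<missing>"))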
@@ -449,95 +461,8 @@ def get_adjustment() -> TextAdjustment: return TextAdjustment() -class TableFormatter: - - show_dimensions: Union[bool, str] - formatters: FormattersType - columns: Index - _is_truncated: bool - - @property - def is_truncated(self) -> bool: - return self._is_truncated - - @property - def should_show_dimensions(self) -> bool: - return self.show_dimensions is True or ( - self.show_dimensions == "truncate" and self.is_truncated - ) - - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) - - @contextmanager - def get_buffer( - self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. - """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") - - def write_result(self, buf: IO[str]) -> None: - """ - Write the result of serialization to buf. - """ - raise AbstractMethodError(self) - - def get_result( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. 
- """ - with self.get_buffer(buf, encoding=encoding) as f: - self.write_result(buf=f) - if buf is None: - return f.getvalue() - return None - - -class DataFrameFormatter(TableFormatter): - """ - Render a DataFrame - - self.to_string() : console-friendly tabular output - self.to_html() : html table - self.to_latex() : LaTeX tabular environment table - - """ +class DataFrameFormatter: + """Class for processing dataframe formatting options and data.""" __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -555,46 +480,94 @@ def __init__( float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, - line_width: Optional[int] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, show_dimensions: Union[bool, str] = False, decimal: str = ".", - table_id: Optional[str] = None, - render_links: bool = False, bold_rows: bool = False, escape: bool = True, ): self.frame = frame - self.show_index_names = index_names - self.sparsify = self._initialize_sparsify(sparsify) - self.float_format = float_format - self.formatters = self._initialize_formatters(formatters) - self.na_rep = na_rep - self.decimal = decimal + self.columns = self._initialize_columns(columns) self.col_space = self._initialize_colspace(col_space) self.header = header self.index = index - self.line_width = line_width + self.na_rep = na_rep + self.formatters = self._initialize_formatters(formatters) + self.justify = self._initialize_justify(justify) + self.float_format = float_format + self.sparsify = self._initialize_sparsify(sparsify) + self.show_index_names = index_names + self.decimal = decimal + self.bold_rows = bold_rows + self.escape = escape self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols self.show_dimensions = show_dimensions - self.table_id = table_id - self.render_links = render_links - self.justify = self._initialize_justify(justify) - self.bold_rows = bold_rows - self.escape = escape - self.columns = self._initialize_columns(columns) self.max_cols_fitted = self._calc_max_cols_fitted() self.max_rows_fitted = self._calc_max_rows_fitted() self.tr_frame = self.frame - self._truncate() + self.truncate() self.adj = get_adjustment() + def get_strcols(self) -> List[List[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). 
+ """ + strcols = self._get_strcols_without_index() + + if self.index: + str_index = self._get_formatted_index(self.tr_frame) + strcols.insert(0, str_index) + + return strcols + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + + @property + def dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) + def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: if sparsify is None: return get_option("display.multi_sparse") @@ -653,10 +626,6 @@ def _initialize_colspace( result = dict(zip(self.frame.columns, col_space)) return result - @property - def max_rows_displayed(self) -> int: - return min(self.max_rows or len(self.frame), len(self.frame)) - def _calc_max_cols_fitted(self) -> Optional[int]: """Number of columns fitting the screen.""" if not self._is_in_terminal(): @@ -707,26 +676,14 @@ def _get_number_of_auxillary_rows(self) -> int: num_rows = dot_row + prompt_row if self.show_dimensions: - num_rows += len(self._dimensions_info.splitlines()) + num_rows += len(self.dimensions_info.splitlines()) if self.header: num_rows += 1 return num_rows - @property - def is_truncated_horizontally(self) -> bool: - return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) - - @property - def is_truncated_vertically(self) -> bool: - return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) - - @property - def is_truncated(self) -> bool: - return bool(self.is_truncated_horizontally or self.is_truncated_vertically) - - def _truncate(self) -> None: + def truncate(self) -> None: """ Check whether the frame should be truncated. If so, slice the frame up. 
""" @@ -785,7 +742,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: if not is_list_like(self.header) and not self.header: for i, c in enumerate(self.tr_frame): - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( strings=fmt_values, justify=self.justify, @@ -816,7 +773,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: header_colwidth = max( int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) ) - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( fmt_values, self.justify, minimum=header_colwidth, adj=self.adj ) @@ -827,223 +784,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: return strcols - def _get_strcols(self) -> List[List[str]]: - strcols = self._get_strcols_without_index() - - str_index = self._get_formatted_index(self.tr_frame) - if self.index: - strcols.insert(0, str_index) - - return strcols - - def _to_str_columns(self) -> List[List[str]]: - """ - Render a DataFrame to a list of columns (as lists of strings). - """ - strcols = self._get_strcols() - - if self.is_truncated: - strcols = self._insert_dot_separators(strcols) - - return strcols - - def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: - str_index = self._get_formatted_index(self.tr_frame) - index_length = len(str_index) - - if self.is_truncated_horizontally: - strcols = self._insert_dot_separator_horizontal(strcols, index_length) - - if self.is_truncated_vertically: - strcols = self._insert_dot_separator_vertical(strcols, index_length) - - return strcols - - def _insert_dot_separator_horizontal( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - strcols.insert(self.tr_col_num + 1, [" ..."] * index_length) - return strcols - - def _insert_dot_separator_vertical( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - n_header_rows = index_length - len(self.tr_frame) - row_num = self.tr_row_num - for ix, col in enumerate(strcols): - cwidth = self.adj.len(col[row_num]) - - if self.is_truncated_horizontally: - is_dot_col = ix == self.tr_col_num + 1 - else: - is_dot_col = False - - if cwidth > 3 or is_dot_col: - dots = "..." - else: - dots = ".." - - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - - dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] - col.insert(row_num + n_header_rows, dot_str) - return strcols - - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a console-friendly tabular output. - """ - text = self._get_string_representation() - - buf.writelines(text) - - if self.should_show_dimensions: - buf.write(self._dimensions_info) - - @property - def _dimensions_info(self) -> str: - return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" - - def _get_string_representation(self) -> str: - if self.frame.empty: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - return info_line - - strcols = self._to_str_columns() - - if self.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self.max_cols is None or self.max_cols > 0: - # need to wrap around - return self._join_multiline(*strcols) - - # max_cols == 0. 
Try to fit frame to terminal - return self._fit_strcols_to_terminal_width(strcols) - - def _fit_strcols_to_terminal_width(self, strcols) -> str: - from pandas import Series - - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - width, _ = get_terminal_size() - dif = max_len - width - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - - # subtract index column - max_cols_fitted = n_cols - self.index - # GH-21180. Ensure that we print at least two. - max_cols_fitted = max(max_cols_fitted, 2) - self.max_cols_fitted = max_cols_fitted - - # Call again _truncate to cut frame appropriately - # and then generate string representation - self._truncate() - strcols = self._to_str_columns() - return self.adj.adjoin(1, *strcols) - - def _join_multiline(self, *args) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(args) - if self.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] - - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) - - if self.is_truncated_vertically: - assert self.max_rows_fitted is not None - nrows = self.max_rows_fitted + 1 - else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) - - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - return self.get_result(buf=buf, encoding=encoding) - - def to_latex( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - column_format: Optional[str] = None, - longtable: bool = False, - encoding: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a LaTeX tabular/longtable environment output. 
- """ - from pandas.io.formats.latex import LatexFormatter - - latex_formatter = LatexFormatter( - self, - longtable=longtable, - column_format=column_format, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - position=position, - ) - return latex_formatter.get_result(buf=buf, encoding=encoding) - - def _format_col(self, i: int) -> List[str]: + def format_col(self, i: int) -> List[str]: frame = self.tr_frame formatter = self._get_formatter(i) return format_array( @@ -1056,34 +797,17 @@ def _format_col(self, i: int) -> List[str]: leading_space=self.index, ) - def to_html( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - classes: Optional[Union[str, List, Tuple]] = None, - notebook: bool = False, - border: Optional[int] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a html table. - - Parameters - ---------- - classes : str or list-like - classes to include in the `class` attribute of the opening - ```` tag, in addition to the default "dataframe". - notebook : {True, False}, optional, default False - Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``
          `` tag. Default ``pd.options.display.html.border``. - """ - from pandas.io.formats.html import HTMLFormatter, NotebookFormatter - - Klass = NotebookFormatter if notebook else HTMLFormatter - return Klass(self, classes=classes, border=border).get_result( - buf=buf, encoding=encoding - ) + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: from pandas.core.indexes.multi import sparsify_labels @@ -1126,22 +850,6 @@ def space_format(x, y): # self.str_columns = str_columns return str_columns - @property - def has_index_names(self) -> bool: - return _has_names(self.frame.index) - - @property - def has_column_names(self) -> bool: - return _has_names(self.frame.columns) - - @property - def show_row_idx_names(self) -> bool: - return all((self.has_index_names, self.index, self.show_index_names)) - - @property - def show_col_idx_names(self) -> bool: - return all((self.has_column_names, self.show_index_names, self.header)) - def _get_formatted_index(self, frame: "DataFrame") -> List[str]: # Note: this is only used by to_string() and to_latex(), not by # to_html(). so safe to cast col_space here. @@ -1192,6 +900,224 @@ def _get_column_name_list(self) -> List[str]: return names +class DataFrameRenderer: + """Class for creating dataframe output in multiple formats. + + Called in pandas.core.generic.NDFrame: + - to_csv + - to_latex + + Called in pandas.core.frame.DataFrame: + - to_html + - to_string + + Parameters + ---------- + fmt : DataFrameFormatter + Formatter with the formating options. + """ + + def __init__(self, fmt: DataFrameFormatter): + self.fmt = fmt + + def to_latex( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + column_format: Optional[str] = None, + longtable: bool = False, + encoding: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. + """ + from pandas.io.formats.latex import LatexFormatter + + latex_formatter = LatexFormatter( + self.fmt, + longtable=longtable, + column_format=column_format, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + position=position, + ) + string = latex_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_html( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + classes: Optional[Union[str, List, Tuple]] = None, + notebook: bool = False, + border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, + ) -> Optional[str]: + """ + Render a DataFrame to a html table. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding : str, default “utf-8” + Set character encoding. + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
            <table>
          `` tag, in addition to the default "dataframe". + notebook : {True, False}, optional, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``
            <table>
          `` tag. Default ``pd.options.display.html.border``. + table_id : str, optional + A css id is included in the opening `
            <table>
          ` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + """ + from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + + Klass = NotebookFormatter if notebook else HTMLFormatter + + html_formatter = Klass( + self.fmt, + classes=classes, + border=border, + table_id=table_id, + render_links=render_links, + ) + string = html_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + line_width: Optional[int] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a console-friendly tabular output. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding: str, default “utf-8” + Set character encoding. + line_width : int, optional + Width to wrap a line in characters. + """ + from pandas.io.formats.string import StringFormatter + + string_formatter = StringFormatter(self.fmt, line_width=line_width) + string = string_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_csv( + self, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + sep: str = ",", + columns: Optional[Sequence[Label]] = None, + index_label: Optional[IndexLabel] = None, + mode: str = "w", + compression: CompressionOptions = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, + errors: str = "strict", + storage_options: StorageOptions = None, + ) -> Optional[str]: + """ + Render dataframe as comma-separated file. + """ + from pandas.io.formats.csvs import CSVFormatter + + csv_formatter = CSVFormatter( + path_or_buf=path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + errors=errors, + compression=compression, + quoting=quoting, + cols=columns, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + storage_options=storage_options, + formatter=self.fmt, + ) + csv_formatter.save() + + if path_or_buf is None: + assert isinstance(csv_formatter.path_or_buf, StringIO) + return csv_formatter.path_or_buf.getvalue() + + return None + + +def save_to_buffer( + string: str, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, +) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with get_buffer(buf, encoding=encoding) as f: + f.write(string) + if buf is None: + return f.getvalue() + return None + + +@contextmanager +def get_buffer(buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. 
+ """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + + # ---------------------------------------------------------------------- # Array formatters @@ -1311,7 +1237,7 @@ def _format_strings(self) -> List[str]: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}g}" + float_format = lambda x: f"{x: .{precision:d}f}" else: float_format = self.float_format @@ -1372,6 +1298,8 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) + fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") + return fmt_values @@ -1442,8 +1370,19 @@ def get_result_as_array(self) -> np.ndarray: Returns the float values converted into strings using the parameters given at initialisation, as a numpy array """ + + def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): + mask = isna(values) + formatted = np.array( + [ + formatter(val) if not m else na_rep + for val, m in zip(values.ravel(), mask.ravel()) + ] + ).reshape(values.shape) + return formatted + if self.formatter is not None: - return np.array([self.formatter(x) for x in self.values]) + return format_with_na_rep(self.values, self.formatter, self.na_rep) if self.fixed_width: threshold = get_option("display.chop_threshold") @@ -1464,19 +1403,13 @@ def format_values_with(float_format): # separate the wheat from the chaff values = self.values is_complex = is_complex_dtype(values) - mask = isna(values) - values = np.array(values, dtype="object") - values[mask] = na_rep - imask = (~mask).ravel() - values.flat[imask] = np.array( - [formatter(val) for val in values.ravel()[imask]] - ) + values = format_with_na_rep(values, formatter, na_rep) if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, self.decimal, na_rep) + result = _trim_zeros_complex(values, self.decimal) else: - result = _trim_zeros_float(values, self.decimal, na_rep) + result = _trim_zeros_float(values, self.decimal) return np.asarray(result, dtype="object") return values @@ -1532,10 +1465,6 @@ def format_values_with(float_format): return formatted_values def _format_strings(self) -> List[str]: - # shortcut - if self.formatter is not None: - return [self.formatter(x) for x in self.values] - return list(self.get_result_as_array()) @@ -1835,9 +1764,11 @@ def _make_fixed_width( return strings if adj is None: - adj = get_adjustment() + adjustment = get_adjustment() + else: + adjustment = adj - max_len = max(adj.len(x) for x in strings) + max_len = max(adjustment.len(x) for x in strings) if minimum is not None: max_len = max(minimum, max_len) @@ -1846,57 +1777,74 @@ def _make_fixed_width( if conf_max is not None and max_len > conf_max: max_len = conf_max - def just(x): + def just(x: str) -> str: if conf_max is not None: - if (conf_max > 3) & (adj.len(x) > max_len): + if (conf_max > 3) & (adjustment.len(x) > max_len): x = x[: max_len - 3] + 
"..." return x strings = [just(x) for x in strings] - result = adj.justify(strings, max_len, mode=justify) + result = adjustment.justify(strings, max_len, mode=justify) return result -def _trim_zeros_complex( - str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" -) -> List[str]: +def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. """ - return [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) + trimmed = [ + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) for x in str_complexes ] + # pad strings to the length of the longest trimmed string for alignment + lengths = [len(s) for s in trimmed] + max_length = max(lengths) + padded = [ + s[: -((k - 1) // 2 + 1)] # real part + + (max_length - k) // 2 * "0" + + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / - + + s[-((k - 1) // 2) : -1] # imaginary part + + (max_length - k) // 2 * "0" + + s[-1] + for s, k in zip(trimmed, lengths) + ] + return padded + def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" + str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats + number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") - def _is_number(x): - return x != na_rep and not x.endswith("inf") + def is_number_with_decimal(x): + return re.match(number_regex, x) is not None - def _cond(values): - finite = [x for x in values if _is_number(x)] - has_decimal = [decimal in x for x in finite] + def should_trim(values: Union[np.ndarray, List[str]]) -> bool: + """ + Determine if an array of strings should be trimmed. - return ( - len(finite) > 0 - and all(has_decimal) - and all(x.endswith("0") for x in finite) - and not (any(("e" in x) or ("E" in x) for x in finite)) - ) + Returns True if all numbers containing decimals (defined by the + above regular expression) within the array end in a zero, otherwise + returns False. + """ + numbers = [x for x in values if is_number_with_decimal(x)] + return len(numbers) > 0 and all(x.endswith("0") for x in numbers) - while _cond(trimmed): - trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] + while should_trim(trimmed): + trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. 
- return [x + "0" if x.endswith(decimal) and _is_number(x) else x for x in trimmed] + result = [ + x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x + for x in trimmed + ] + return result def _has_names(index: Index) -> bool: @@ -2015,26 +1963,6 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non set_option("display.column_space", max(12, accuracy + 9)) -def _binify(cols: List[int], line_width: int) -> List[int]: - adjoin_width = 1 - bins = [] - curr_width = 0 - i_last_column = len(cols) - 1 - for i, w in enumerate(cols): - w_adjoined = w + adjoin_width - curr_width += w_adjoined - if i_last_column == i: - wrap = curr_width + 1 > line_width and i > 0 - else: - wrap = curr_width + 2 > line_width and i > 0 - if wrap: - bins.append(i) - curr_width = w_adjoined - - bins.append(len(cols)) - return bins - - def get_level_lengths( levels: Any, sentinel: Union[bool, object, str] = "" ) -> List[Dict[int, int]]: diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index c8eb89afdd849..b4f7e3922f02f 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -3,7 +3,7 @@ """ from textwrap import dedent -from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option @@ -12,16 +12,11 @@ from pandas import MultiIndex, option_context from pandas.io.common import is_url -from pandas.io.formats.format import ( - DataFrameFormatter, - TableFormatter, - buffer_put_lines, - get_level_lengths, -) +from pandas.io.formats.format import DataFrameFormatter, get_level_lengths from pandas.io.formats.printing import pprint_thing -class HTMLFormatter(TableFormatter): +class HTMLFormatter: """ Internal class for formatting output data in html. This class is intended for shared functionality between @@ -38,6 +33,8 @@ def __init__( formatter: DataFrameFormatter, classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, ) -> None: self.fmt = formatter self.classes = classes @@ -51,14 +48,35 @@ def __init__( if border is None: border = cast(int, get_option("display.html.border")) self.border = border - self.table_id = self.fmt.table_id - self.render_links = self.fmt.render_links + self.table_id = table_id + self.render_links = render_links self.col_space = { column: f"{value}px" if isinstance(value, int) else value for column, value in self.fmt.col_space.items() } + def to_string(self) -> str: + lines = self.render() + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + return "\n".join(lines) + + def render(self) -> List[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × + self.write( + f"

          <p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
          " + ) + + return self.elements + + @property + def should_show_dimensions(self): + return self.fmt.should_show_dimensions + @property def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names @@ -187,20 +205,6 @@ def write_tr( indent -= indent_delta self.write("", indent) - def render(self) -> List[str]: - self._write_table() - - if self.should_show_dimensions: - by = chr(215) # × - self.write( - f"

          <p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
          " - ) - - return self.elements - - def write_result(self, buf: IO[str]) -> None: - buffer_put_lines(buf, self.render()) - def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") @@ -370,7 +374,7 @@ def _write_header(self, indent: int) -> None: def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", None): - fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent: int) -> None: @@ -565,7 +569,7 @@ class NotebookFormatter(HTMLFormatter): """ def _get_formatted_values(self) -> Dict[int, List[str]]: - return {i: self.fmt._format_col(i) for i in range(self.ncols)} + return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 970bb8c535534..891b3ea7af0e2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,6 +1,6 @@ -from abc import ABCMeta, abstractmethod +from abc import ABC, abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union from pandas._config import get_option @@ -12,6 +12,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from pandas.core.frame import DataFrame from pandas.core.series import Series @@ -72,92 +73,148 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" -class BaseInfo(metaclass=ABCMeta): +def _initialize_memory_usage( + memory_usage: Optional[Union[bool, str]] = None, +) -> Union[bool, str]: + """Get memory usage based on inputs and display options.""" + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + return memory_usage + + +class BaseInfo(ABC): + """Base class for DataFrameInfo and SeriesInfo. + + Parameters + ---------- + data : FrameOrSeries + Either dataframe or series. + memory_usage : bool or str, optional + If "deep", introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + """ + def __init__( self, data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, ): - if buf is None: # pragma: no cover - buf = sys.stdout - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - self.data = data - self.verbose = verbose - self.buf = buf - self.max_cols = max_cols - self.memory_usage = memory_usage - self.null_counts = null_counts + self.memory_usage = _initialize_memory_usage(memory_usage) + + @property + @abstractmethod + def ids(self) -> Index: + """Column names or index names.""" + @property @abstractmethod - def _get_mem_usage(self, deep: bool) -> int: + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + + @property + @abstractmethod + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + + @property + @abstractmethod + def dtypes(self) -> "Series": + """Dtypes. 
+ + Returns + ------- + dtypes : Series + Dtype of each of the DataFrame's columns. """ - Get memory usage in bytes. + return self.data.dtypes - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. + @property + def memory_usage_bytes(self) -> int: + """Memory usage in bytes. Returns ------- - mem_usage : int + memory_usage_bytes : int Object's total memory usage in bytes. """ + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() - @abstractmethod - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - """ - Get column names and dtypes. + @property + def memory_usage_string(self) -> str: + """Memory usage in a form of human readable string.""" + return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n" + + @property + def size_qualifier(self) -> str: + size_qualifier = "" + if self.memory_usage: + if self.memory_usage != "deep": + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + if ( + "object" in self.dtype_counts + or self.data.index._is_memory_usage_qualified() + ): + size_qualifier = "+" + return size_qualifier + + +class DataFrameInfo(BaseInfo): + """Class storing dataframe-specific info.""" + + @property + def ids(self) -> Index: + """Column names. Returns ------- ids : Index DataFrame's column names. - dtypes : Series - Dtype of each of the DataFrame's columns. """ + return self.data.columns - @abstractmethod - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - """ - Append name, non-null count (optional), and dtype for each column to `lines`. + @property + def dtypes(self) -> "Series": + """Dtypes. - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. + Returns + ------- dtypes : Series - The DataFrame's columns' dtypes. - show_counts : bool - If True, count of non-NA cells for each column will be appended to `lines`. + Dtype of each of the DataFrame's columns. """ + return self.data.dtypes - @abstractmethod - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: - """ - Append short summary of columns' names to `lines`. + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + # groupby dtype.name to collect e.g. Categorical columns + return self.dtypes.value_counts().groupby(lambda x: x.name).sum() - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - """ + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns.""" + return self.data.count() + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) - def info(self) -> None: + def to_buffer( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: """ Print a concise summary of a %(klass)s. 
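A minimal usage sketch of the machinery above, inferred only from the signatures in this diff (DataFrameInfo holds the data and memory-usage choice; to_buffer hands off to InfoPrinter, which selects a table builder):

    from pandas import DataFrame
    from pandas.io.formats.info import DataFrameInfo

    df = DataFrame({"a": [1, 2, 3], "b": [1.0, None, 3.0]})
    info = DataFrameInfo(data=df, memory_usage="deep")
    # buf=None falls back to sys.stdout; verbose=True selects the verbose
    # table builder regardless of display.max_info_columns.
    info.to_buffer(buf=None, max_cols=None, verbose=True, show_counts=True)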
@@ -209,151 +266,359 @@ def info(self) -> None: -------- %(examples_sub)s """ - lines = [] + printer = InfoPrinter( + info=self, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) - lines.append(str(type(self.data))) - lines.append(self.data.index._summary()) - ids, dtypes = self._get_ids_and_dtypes() - col_count = len(ids) +class InfoPrinter: + """Class for printing dataframe or series info. - if col_count == 0: - lines.append(f"Empty {type(self.data).__name__}") - fmt.buffer_put_lines(self.buf, lines) - return + Parameters + ---------- + info : DataFrameInfo + Instance of DataFrameInfo. + max_cols : int, optional + When to switch from the verbose to the truncated output. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. + """ - # hack - max_cols = self.max_cols + def __init__( + self, + info: DataFrameInfo, + max_cols: Optional[int] = None, + verbose: Optional[bool] = None, + show_counts: Optional[bool] = None, + ): + self.info = info + self.data = info.data + self.verbose = verbose + self.max_cols = self._initialize_max_cols(max_cols) + self.show_counts = self._initialize_show_counts(show_counts) + + @property + def max_rows(self) -> int: + """Maximum info rows to be displayed.""" + return get_option("display.max_info_rows", len(self.data) + 1) + + @property + def exceeds_info_cols(self) -> bool: + """Check if number of columns to be summarized does not exceed maximum.""" + return bool(self.col_count > self.max_cols) + + @property + def exceeds_info_rows(self) -> bool: + """Check if number of rows to be summarized does not exceed maximum.""" + return bool(len(self.data) > self.max_rows) + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return self.info.col_count + + def _initialize_max_cols(self, max_cols: Optional[int]) -> int: if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(self.data) + 1) + return get_option("display.max_info_columns", self.col_count + 1) + return max_cols - if self.null_counts is None: - show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + if show_counts is None: + return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) else: - show_counts = self.null_counts - exceeds_info_cols = col_count > max_cols + return show_counts + + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + def _create_table_builder(self) -> "DataFrameTableBuilder": + """ + Create instance of table builder based on verbosity and display settings. 
+ """ if self.verbose: - self._verbose_repr(lines, ids, dtypes, show_counts) + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) elif self.verbose is False: # specifically set to False, not necessarily None - self._non_verbose_repr(lines, ids) + return DataFrameTableBuilderNonVerbose(info=self.info) else: - if exceeds_info_cols: - self._non_verbose_repr(lines, ids) + if self.exceeds_info_cols: + return DataFrameTableBuilderNonVerbose(info=self.info) else: - self._verbose_repr(lines, ids, dtypes, show_counts) + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - if self.memory_usage: - # append memory usage of df to display - size_qualifier = "" - if self.memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = self._get_mem_usage(deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(self.buf, lines) +class TableBuilderAbstract(ABC): + """Abstract builder for info table. + Parameters + ---------- + info : BaseInfo + Instance of DataFrameInfo or SeriesInfo. + """ -class DataFrameInfo(BaseInfo): - def _get_mem_usage(self, deep: bool) -> int: - return self.data.memory_usage(index=True, deep=deep).sum() + _lines: List[str] - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - return self.data.columns, self.data.dtypes + def __init__(self, *, info): + self.info = info - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - col_count = len(ids) - lines.append(f"Data columns (total {col_count} columns):") - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in ids) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - - # GH #36765 - # add one space in max_id because there is a one-space padding - # in front of the number - # this allows maintain two spaces gap between columns - max_id = len(pprint_thing(col_count)) + 1 - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space - - if show_counts: - counts = self.data.count() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null + @abstractmethod + def get_lines(self) -> List[str]: + """Product in a form of list of lines (strings).""" + + +class DataFrameTableBuilder(TableBuilderAbstract): + """Abstract builder for dataframe info table.""" + + def get_lines(self) -> List[str]: + self._lines = [] + if self.col_count == 0: + self._fill_empty_info() else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" + self._fill_non_empty_info() + 
return self._lines + + def _fill_empty_info(self) -> None: + """Add lines to the info table, pertaining to empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self._lines.append(f"Empty {type(self.data).__name__}") + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def data(self) -> "DataFrame": + """DataFrame.""" + return self.info.data + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return self.info.memory_usage + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + + @property + def ids(self) -> Index: + """Dataframe columns.""" + return self.info.ids + + @property + def dtypes(self) -> "Series": + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + + @property + def col_count(self) -> int: + """Number of dataframe columns to be summarized.""" + return self.info.col_count + + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + @abstractmethod + def add_columns_summary_line(self) -> None: + """Add line with columns summary to the table.""" + + @abstractmethod + def add_header_line(self) -> None: + """Add header line to the table.""" + + @abstractmethod + def add_separator_line(self) -> None: + """Add separator line between header and body of the table.""" + + @abstractmethod + def add_body_lines(self) -> None: + """Add content of the table body.""" + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + +class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): + """Info table builder for non-verbose output.""" - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) + def add_columns_summary_line(self) -> None: + self._lines.append(self.ids._summary(name="Columns")) - header = "".join( + def add_header_line(self) -> None: + """No header in non-verbose output.""" + + def add_separator_line(self) -> None: + """No separator in non-verbose output.""" + + def add_body_lines(self) -> None: + """No body in non-verbose output.""" + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder): + """Info table builder for verbose output.""" + + SPACING = " " * 2 + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + 
): + super().__init__(info=info) + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def _gen_rows(self) -> Iterator[Sequence[str]]: + """Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + + @property + def header_column_widths(self) -> Sequence[int]: + """Widths of header columns (only titles).""" + return [len(col) for col in self.headers] + + def _get_gross_column_widths(self) -> Sequence[int]: + """Get widths of columns containing both headers and actual content.""" + body_column_widths = self._get_body_column_widths() + return [ + max(*widths) + for widths in zip(self.header_column_widths, body_column_widths) + ] + + def _get_body_column_widths(self) -> Sequence[int]: + """Get widths of table content columns.""" + strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) + return [max(len(x) for x in col) for col in strcols] + + def add_header_line(self) -> None: + header_line = self.SPACING.join( [ - _put_str(id_head, space_num), - _put_str(column_head, space), - _put_str(count_header, space_count), - _put_str(dtype_header, space_dtype), + _put_str(header, col_width) + for header, col_width in zip(self.headers, self.gross_column_widths) ] ) - lines.append(header) + self._lines.append(header_line) - top_separator = "".join( + def add_separator_line(self) -> None: + separator_line = self.SPACING.join( [ - _put_str("-" * len_id, space_num), - _put_str("-" * len_column, space), - _put_str("-" * len_count, space_count), - _put_str("-" * len_dtype, space_dtype), + _put_str("-" * header_colwidth, gross_colwidth) + for header_colwidth, gross_colwidth in zip( + self.header_column_widths, self.gross_column_widths + ) ] ) - lines.append(top_separator) - - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) + self._lines.append(separator_line) + + def add_body_lines(self) -> None: + for row in self.strrows: + body_line = self.SPACING.join( + [ + _put_str(col, gross_colwidth) + for col, gross_colwidth in zip(row, self.gross_column_widths) + ] ) + self._lines.append(body_line) + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_dtypes(), + ) + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_non_null_counts(), + self._gen_dtypes(), + ) - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: - lines.append(ids._summary(name="Columns")) + def _gen_line_numbers(self) -> 
Iterator[str]: + """Iterator with string representation of column numbers.""" + for i, _ in enumerate(self.ids): + yield f" {i}" + + def _gen_columns(self) -> Iterator[str]: + """Iterator with string representation of column names.""" + for col in self.ids: + yield pprint_thing(col) + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 170df193bef00..f3c49e1cd3801 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -2,13 +2,46 @@ Module for formatting output data in Latex. """ from abc import ABC, abstractmethod -from typing import IO, Iterator, List, Optional, Type +from typing import Iterator, List, Optional, Tuple, Type, Union import numpy as np from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.io.formats.format import DataFrameFormatter, TableFormatter +from pandas.io.formats.format import DataFrameFormatter + + +def _split_into_full_short_caption( + caption: Optional[Union[str, Tuple[str, str]]] +) -> Tuple[str, str]: + """Extract full and short captions from caption string/tuple. + + Parameters + ---------- + caption : str or tuple, optional + Either table caption string or tuple (full_caption, short_caption). + If string is provided, then it is treated as table full caption, + while short_caption is considered an empty string. + + Returns + ------- + full_caption, short_caption : tuple + Tuple of full_caption, short_caption strings. + """ + if caption: + if isinstance(caption, str): + full_caption = caption + short_caption = "" + else: + try: + full_caption, short_caption = caption + except ValueError as err: + msg = "caption must be either a string or a tuple of two strings" + raise ValueError(msg) from err + else: + full_caption = "" + short_caption = "" + return full_caption, short_caption class RowStringConverter(ABC): @@ -100,17 +133,12 @@ def header_levels(self) -> int: def _get_strcols(self) -> List[List[str]]: """String representation of the columns.""" - if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {self.frame.columns}\n" - f"Index: {self.frame.index}" - ) - strcols = [[info_line]] + if self.fmt.frame.empty: + strcols = [[self._empty_info_line]] else: - strcols = self.fmt._to_str_columns() + strcols = self.fmt.get_strcols() - # reestablish the MultiIndex that has been joined by _to_str_column + # reestablish the MultiIndex that has been joined by get_strcols() if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( adjoin=False, @@ -143,6 +171,14 @@ def pad_empties(x): strcols = out + strcols[1:] return strcols + @property + def _empty_info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {self.frame.columns}\n" + f"Index: {self.frame.index}" + ) + def _preprocess_row(self, row: List[str]) -> List[str]: """Preprocess elements of the row.""" if self.fmt.escape: @@ -275,6 +311,8 @@ class TableBuilderAbstract(ABC): Use multirow to enhance MultiIndex rows. caption: str, optional Table caption. + short_caption: str, optional + Table short caption. label: str, optional LaTeX label. 
position: str, optional @@ -289,6 +327,7 @@ def __init__( multicolumn_format: Optional[str] = None, multirow: bool = False, caption: Optional[str] = None, + short_caption: Optional[str] = None, label: Optional[str] = None, position: Optional[str] = None, ): @@ -298,6 +337,7 @@ def __init__( self.multicolumn_format = multicolumn_format self.multirow = multirow self.caption = caption + self.short_caption = short_caption self.label = label self.position = position @@ -384,8 +424,23 @@ def _position_macro(self) -> str: @property def _caption_macro(self) -> str: - r"""Caption macro, extracted from self.caption, like \caption{cap}.""" - return f"\\caption{{{self.caption}}}" if self.caption else "" + r"""Caption macro, extracted from self.caption. + + With short caption: + \caption[short_caption]{caption_string}. + + Without short caption: + \caption{caption_string}. + """ + if self.caption: + return "".join( + [ + r"\caption", + f"[{self.short_caption}]" if self.short_caption else "", + f"{{{self.caption}}}", + ] + ) + return "" @property def _label_macro(self) -> str: @@ -595,16 +650,33 @@ def env_end(self) -> str: return "\\end{tabular}" -class LatexFormatter(TableFormatter): - """ +class LatexFormatter: + r""" Used to render a DataFrame to a LaTeX tabular/longtable environment output. Parameters ---------- formatter : `DataFrameFormatter` + longtable : bool, default False + Use longtable environment. column_format : str, default None The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns + multicolumn : bool, default False + Use \multicolumn to enhance MultiIndex columns. + multicolumn_format : str, default 'l' + The alignment for multicolumns, similar to `column_format` + multirow : bool, default False + Use \multirow to enhance MultiIndex rows. + caption : str or tuple, optional + Tuple (full_caption, short_caption), + which results in \caption[short_caption]{full_caption}; + if a single string is passed, no short caption will be set. + label : str, optional + The LaTeX label to be placed inside ``\label{}`` in the output. + position : str, optional + The LaTeX positional argument for tables, to be placed after + ``\begin{}`` in the output. See Also -------- @@ -619,28 +691,27 @@ def __init__( multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, - caption: Optional[str] = None, + caption: Optional[Union[str, Tuple[str, str]]] = None, label: Optional[str] = None, position: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame self.longtable = longtable - self.column_format = column_format # type: ignore[assignment] + self.column_format = column_format self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.caption = caption + self.caption, self.short_caption = _split_into_full_short_caption(caption) self.label = label self.position = position - def write_result(self, buf: IO[str]) -> None: + def to_string(self) -> str: """ Render a DataFrame to a LaTeX tabular, longtable, or table/tabular environment output. 
""" - table_string = self.builder.get_result() - buf.write(table_string) + return self.builder.get_result() @property def builder(self) -> TableBuilderAbstract: @@ -658,6 +729,7 @@ def builder(self) -> TableBuilderAbstract: multicolumn_format=self.multicolumn_format, multirow=self.multirow, caption=self.caption, + short_caption=self.short_caption, label=self.label, position=self.position, ) @@ -671,7 +743,7 @@ def _select_builder(self) -> Type[TableBuilderAbstract]: return TabularBuilder @property - def column_format(self) -> str: + def column_format(self) -> Optional[str]: """Column format.""" return self._column_format diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py new file mode 100644 index 0000000000000..4ebb78f29c739 --- /dev/null +++ b/pandas/io/formats/string.py @@ -0,0 +1,201 @@ +""" +Module for formatting output data in console (to string). +""" +from shutil import get_terminal_size +from typing import Iterable, List, Optional + +import numpy as np + +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.printing import pprint_thing + + +class StringFormatter: + """Formatter for string representation of a dataframe.""" + + def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + self.line_width = line_width + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = "".join([text, self.fmt.dimensions_info]) + return text + + def _get_strcols(self) -> List[List[str]]: + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) + return strcols + + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self._empty_info_line + + strcols = self._get_strcols() + + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self._need_to_wrap_around: + return self._join_multiline(strcols) + + return self._fit_strcols_to_terminal_width(strcols) + + @property + def _empty_info_line(self) -> str: + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + @property + def _need_to_wrap_around(self) -> bool: + return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) + + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) + index_length = len(str_index) + + if self.fmt.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.fmt.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + def _insert_dot_separator_horizontal( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self.fmt.tr_col_num + 1 + else: + is_dot_col = False + + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." 
+ + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) + return strcols + + def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(strcols_input) + + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.fmt.is_truncated_vertically: + assert self.fmt.max_rows_fitted is not None + nrows = self.fmt.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + + def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index column + max_cols_fitted = n_cols - self.fmt.index + # GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.fmt.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self.fmt.truncate() + strcols = self._get_strcols() + return self.adj.adjoin(1, *strcols) + + +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0089d7a32f723..2f3416cbf2d87 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -829,7 +829,8 @@ def applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": See Also -------- - Styler.where + Styler.where: Updates the HTML representation with a style which is + selected in accordance with the return value of a function. """ self._todo.append( (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) @@ -870,7 +871,7 @@ def where( See Also -------- - Styler.applymap + Styler.applymap: Updates the HTML representation with the result. 
""" if other is None: other = "" @@ -930,7 +931,7 @@ def export(self) -> List[Tuple[Callable, Tuple, Dict]]: See Also -------- - Styler.use + Styler.use: Set the styles on the current Styler. """ return self._todo @@ -951,7 +952,7 @@ def use(self, styles: List[Tuple[Callable, Tuple, Dict]]) -> "Styler": See Also -------- - Styler.export + Styler.export : Export the styles to applied to the current Styler. """ self._todo.extend(styles) return self diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3cf3c2e7fd91d..288bc0adc5162 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,15 +1,21 @@ +from abc import ABC, abstractmethod from collections import abc import functools from io import BytesIO, StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Optional, Tuple, Type +from typing import IO, Any, Callable, List, Mapping, Optional, Tuple, Type, Union import numpy as np import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions +from pandas._typing import ( + CompressionOptions, + IndexLabel, + JSONSerializable, + StorageOptions, +) from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -17,6 +23,7 @@ from pandas import DataFrame, MultiIndex, Series, isna, to_datetime from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.generic import NDFrame from pandas.core.reshape.concat import concat from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle @@ -33,7 +40,7 @@ # interface to/from def to_json( path_or_buf, - obj, + obj: NDFrame, orient: Optional[str] = None, date_format: str = "epoch", double_precision: int = 10, @@ -110,7 +117,7 @@ def to_json( path_or_buf.close() -class Writer: +class Writer(ABC): _default_orient: str def __init__( @@ -146,75 +153,52 @@ def _format_axes(self): raise AbstractMethodError(self) def write(self): - return self._write( - self.obj, - self.orient, - self.double_precision, - self.ensure_ascii, - self.date_unit, - self.date_format == "iso", - self.default_handler, - self.indent, - ) - - def _write( - self, - obj, - orient: Optional[str], - double_precision: int, - ensure_ascii: bool, - date_unit: str, - iso_dates: bool, - default_handler: Optional[Callable[[Any], JSONSerializable]], - indent: int, - ): + iso_dates = self.date_format == "iso" return dumps( - obj, - orient=orient, - double_precision=double_precision, - ensure_ascii=ensure_ascii, - date_unit=date_unit, + self.obj_to_write, + orient=self.orient, + double_precision=self.double_precision, + ensure_ascii=self.ensure_ascii, + date_unit=self.date_unit, iso_dates=iso_dates, - default_handler=default_handler, - indent=indent, + default_handler=self.default_handler, + indent=self.indent, ) + @property + @abstractmethod + def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + """Object to write in JSON format.""" + pass + class SeriesWriter(Writer): _default_orient = "index" + @property + def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + if not self.index and self.orient == "split": + return {"name": self.obj.name, "data": self.obj.values} + else: + return self.obj + def _format_axes(self): if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") - def _write( - self, - obj, - orient: 
Optional[str], - double_precision: int, - ensure_ascii: bool, - date_unit: str, - iso_dates: bool, - default_handler: Optional[Callable[[Any], JSONSerializable]], - indent: int, - ): - if not self.index and orient == "split": - obj = {"name": obj.name, "data": obj.values} - return super()._write( - obj, - orient, - double_precision, - ensure_ascii, - date_unit, - iso_dates, - default_handler, - indent, - ) - class FrameWriter(Writer): _default_orient = "columns" + @property + def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + if not self.index and self.orient == "split": + obj_to_write = self.obj.to_dict(orient="split") + del obj_to_write["index"] + else: + obj_to_write = self.obj + return obj_to_write + def _format_axes(self): """ Try to format axes if they are datelike. @@ -232,31 +216,6 @@ def _format_axes(self): f"DataFrame columns must be unique for orient='{self.orient}'." ) - def _write( - self, - obj, - orient: Optional[str], - double_precision: int, - ensure_ascii: bool, - date_unit: str, - iso_dates: bool, - default_handler: Optional[Callable[[Any], JSONSerializable]], - indent: int, - ): - if not self.index and orient == "split": - obj = obj.to_dict(orient="split") - del obj["index"] - return super()._write( - obj, - orient, - double_precision, - ensure_ascii, - date_unit, - iso_dates, - default_handler, - indent, - ) - class JSONTableWriter(FrameWriter): _default_orient = "records" @@ -331,30 +290,9 @@ def __init__( self.orient = "records" self.index = index - def _write( - self, - obj, - orient, - double_precision, - ensure_ascii, - date_unit, - iso_dates, - default_handler, - indent, - ): - table_obj = {"schema": self.schema, "data": obj} - serialized = super()._write( - table_obj, - orient, - double_precision, - ensure_ascii, - date_unit, - iso_dates, - default_handler, - indent, - ) - - return serialized + @property + def obj_to_write(self) -> Union[NDFrame, Mapping[IndexLabel, Any]]: + return {"schema": self.schema, "data": self.obj} @deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 97ec0ed1f7fdc..88f57e18593f2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,5 +1,6 @@ """ parquet compat """ +import io from typing import Any, AnyStr, Dict, List, Optional from warnings import catch_warnings @@ -238,28 +239,29 @@ def read( def to_parquet( df: DataFrame, - path: FilePathOrBuffer[AnyStr], + path: Optional[FilePathOrBuffer] = None, engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, storage_options: StorageOptions = None, partition_cols: Optional[List[str]] = None, **kwargs, -): +) -> Optional[bytes]: """ Write a DataFrame to the parquet format. Parameters ---------- df : DataFrame - path : str or file-like object + path : str or file-like object, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. - .. versionchanged:: 0.24.0 + .. versionchanged:: 1.2.0 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. 
If 'auto', then the option @@ -298,13 +300,20 @@ def to_parquet( kwargs Additional keyword arguments passed to the engine + + Returns + ------- + bytes if no path argument is provided else None """ if isinstance(partition_cols, str): partition_cols = [partition_cols] impl = get_engine(engine) - return impl.write( + + path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path + + impl.write( df, - path, + path_or_buf, compression=compression, index=index, partition_cols=partition_cols, @@ -312,6 +321,12 @@ def to_parquet( **kwargs, ) + if path is None: + assert isinstance(path_or_buf, io.BytesIO) + return path_or_buf.getvalue() + else: + return None + def read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 63c3f9899d915..2110a2d400be8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -606,7 +606,7 @@ def read_csv( del kwds["filepath_or_buffer"] del kwds["sep"] - kwds_defaults = _check_defaults_read( + kwds_defaults = _refine_defaults_read( dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} ) kwds.update(kwds_defaults) @@ -684,7 +684,7 @@ def read_table( del kwds["filepath_or_buffer"] del kwds["sep"] - kwds_defaults = _check_defaults_read( + kwds_defaults = _refine_defaults_read( dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} ) kwds.update(kwds_defaults) @@ -789,63 +789,14 @@ def __init__(self, f, engine=None, **kwds): else: engine = "python" engine_specified = False - + self.engine = engine self._engine_specified = kwds.get("engine_specified", engine_specified) - if kwds.get("dialect") is not None: - dialect = kwds["dialect"] - if dialect in csv.list_dialects(): - dialect = csv.get_dialect(dialect) - - # Any valid dialect should have these attributes. - # If any are missing, we will raise automatically. - for param in ( - "delimiter", - "doublequote", - "escapechar", - "skipinitialspace", - "quotechar", - "quoting", - ): - try: - dialect_val = getattr(dialect, param) - except AttributeError as err: - raise ValueError( - f"Invalid dialect {kwds['dialect']} provided" - ) from err - parser_default = _parser_defaults[param] - provided = kwds.get(param, parser_default) - - # Messages for conflicting values between the dialect - # instance and the actual parameters provided. - conflict_msgs = [] - - # Don't warn if the default parameter was passed in, - # even if it conflicts with the dialect (gh-23761). - if provided != parser_default and provided != dialect_val: - msg = ( - f"Conflicting values for '{param}': '{provided}' was " - f"provided, but the dialect specifies '{dialect_val}'. " - "Using the dialect-specified value." - ) - - # Annoying corner case for not warning about - # conflicts between dialect and delimiter parameter. - # Refer to the outer "_read_" function for more info. 
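
A quick sketch of the conflict handling that this block implements, and that the new _merge_with_dialect_properties helper below preserves (not part of the patch; the dialect class here is an illustrative assumption, any complete csv.Dialect works):

    import csv
    import io
    import pandas as pd

    class SemiColonDialect(csv.Dialect):
        delimiter = ";"
        quotechar = '"'
        doublequote = True
        skipinitialspace = False
        lineterminator = "\n"
        quoting = csv.QUOTE_MINIMAL

    # The explicit delimiter conflicts with the dialect, so pandas emits a
    # ParserWarning and the dialect-specified value (";") wins.
    df = pd.read_csv(io.StringIO("a;b\n1;2"), dialect=SemiColonDialect(), delimiter=",")
    assert list(df.columns) == ["a", "b"]
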
-                    if not (param == "delimiter" and kwds.pop("sep_override", False)):
-                        conflict_msgs.append(msg)
+        _validate_skipfooter(kwds)

-                if conflict_msgs:
-                    warnings.warn(
-                        "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
-                    )
-                kwds[param] = dialect_val
-
-        if kwds.get("skipfooter"):
-            if kwds.get("iterator") or kwds.get("chunksize"):
-                raise ValueError("'skipfooter' not supported for 'iteration'")
-            if kwds.get("nrows"):
-                raise ValueError("'skipfooter' not supported with 'nrows'")
+        dialect = _extract_dialect(kwds)
+        if dialect is not None:
+            kwds = _merge_with_dialect_properties(dialect, kwds)

         if kwds.get("header", "infer") == "infer":
             kwds["header"] = 0 if kwds.get("names") is None else None
@@ -853,7 +804,6 @@ def __init__(self, f, engine=None, **kwds):
         self.orig_options = kwds

         # miscellanea
-        self.engine = engine
         self._currow = 0

         options = self._get_options_with_defaults(engine)
@@ -928,7 +878,6 @@ def _check_file_or_buffer(self, f, engine):

     def _clean_options(self, options, engine):
         result = options.copy()
-        engine_specified = self._engine_specified
         fallback_reason = None

         # C engine not supported yet
@@ -992,7 +941,7 @@ def _clean_options(self, options, engine):
             )
             engine = "python"

-        if fallback_reason and engine_specified:
+        if fallback_reason and self._engine_specified:
             raise ValueError(fallback_reason)

         if engine == "c":
@@ -1028,25 +977,18 @@ def _clean_options(self, options, engine):

         validate_header_arg(options["header"])

-        depr_warning = ""
-
         for arg in _deprecated_args:
             parser_default = _c_parser_defaults[arg]
             depr_default = _deprecated_defaults[arg]
-
-            msg = (
-                f"The {repr(arg)} argument has been deprecated and will be "
-                "removed in a future version."
-            )
-
             if result.get(arg, depr_default) != depr_default:
-                depr_warning += msg + "\n\n"
+                msg = (
+                    f"The {arg} argument has been deprecated and will be "
+                    "removed in a future version.\n\n"
+                )
+                warnings.warn(msg, FutureWarning, stacklevel=2)
             else:
                 result[arg] = parser_default

-        if depr_warning != "":
-            warnings.warn(depr_warning, FutureWarning, stacklevel=2)
-
         if index_col is True:
             raise ValueError("The value of index_col couldn't be 'True'")
         if _is_index_col(index_col):
@@ -1661,7 +1603,7 @@ def _get_name(icol):

         return index

-    def _agg_index(self, index, try_parse_dates=True):
+    def _agg_index(self, index, try_parse_dates=True) -> Index:
         arrays = []

         for i, arr in enumerate(index):
@@ -2235,7 +2177,7 @@ def TextParser(*args, **kwds):
     return TextFileReader(*args, **kwds)


-def count_empty_vals(vals):
+def count_empty_vals(vals) -> int:
     return sum(1 for v in vals if v == "" or v is None)


@@ -3706,7 +3648,7 @@ def _make_reader(self, f):
         )


-def _check_defaults_read(
+def _refine_defaults_read(
     dialect: Union[str, csv.Dialect],
     delimiter: Union[str, object],
     delim_whitespace: bool,
@@ -3714,7 +3656,7 @@ def _check_defaults_read(
     sep: Union[str, object],
     defaults: Dict[str, Any],
 ):
-    """Check default values of input parameters of read_csv, read_table.
+    """Validate/refine default values of input parameters of read_csv, read_table.

     Parameters
     ----------
@@ -3766,7 +3708,7 @@ def _check_defaults_read(
     # the comparison to dialect values by checking if default values
     # for BOTH "delimiter" and "sep" were provided.
if dialect is not None: - kwds["sep_override"] = (delimiter is None) and ( + kwds["sep_override"] = delimiter is None and ( sep is lib.no_default or sep == delim_default ) @@ -3793,3 +3735,120 @@ def _check_defaults_read( kwds["engine_specified"] = False return kwds + + +def _extract_dialect(kwds: Dict[str, Any]) -> Optional[csv.Dialect]: + """ + Extract concrete csv dialect instance. + + Returns + ------- + csv.Dialect or None + """ + if kwds.get("dialect") is None: + return None + + dialect = kwds["dialect"] + if dialect in csv.list_dialects(): + dialect = csv.get_dialect(dialect) + + _validate_dialect(dialect) + + return dialect + + +MANDATORY_DIALECT_ATTRS = ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", +) + + +def _validate_dialect(dialect: csv.Dialect) -> None: + """ + Validate csv dialect instance. + + Raises + ------ + ValueError + If incorrect dialect is provided. + """ + for param in MANDATORY_DIALECT_ATTRS: + if not hasattr(dialect, param): + raise ValueError(f"Invalid dialect {dialect} provided") + + +def _merge_with_dialect_properties( + dialect: csv.Dialect, + defaults: Dict[str, Any], +) -> Dict[str, Any]: + """ + Merge default kwargs in TextFileReader with dialect parameters. + + Parameters + ---------- + dialect : csv.Dialect + Concrete csv dialect. See csv.Dialect documentation for more details. + defaults : dict + Keyword arguments passed to TextFileReader. + + Returns + ------- + kwds : dict + Updated keyword arguments, merged with dialect parameters. + """ + kwds = defaults.copy() + + for param in MANDATORY_DIALECT_ATTRS: + dialect_val = getattr(dialect, param) + + parser_default = _parser_defaults[param] + provided = kwds.get(param, parser_default) + + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. + conflict_msgs = [] + + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided != parser_default and provided != dialect_val: + msg = ( + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." + ) + + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. + if not (param == "delimiter" and kwds.pop("sep_override", False)): + conflict_msgs.append(msg) + + if conflict_msgs: + warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2) + kwds[param] = dialect_val + return kwds + + +def _validate_skipfooter(kwds: Dict[str, Any]) -> None: + """ + Check whether skipfooter is compatible with other kwargs in TextFileReader. + + Parameters + ---------- + kwds : dict + Keyword arguments passed to TextFileReader. + + Raises + ------ + ValueError + If skipfooter is not compatible with other parameters. 
+ """ + if kwds.get("skipfooter"): + if kwds.get("iterator") or kwds.get("chunksize"): + raise ValueError("'skipfooter' not supported for iteration") + if kwds.get("nrows"): + raise ValueError("'skipfooter' not supported with 'nrows'") diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 80baa6f78ddd7..426a40a65b522 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -98,7 +98,7 @@ def to_pickle( if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: - f.write(pickle.dumps(obj, protocol=protocol)) + pickle.dump(obj, f, protocol=protocol) finally: if f != filepath_or_buffer: # do not close user-provided file objects GH 35679 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2903ede1d5c0b..ffc3a4501470f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -565,6 +565,7 @@ def __fspath__(self): def root(self): """ return the root node """ self._check_if_open() + assert self._handle is not None # for mypy return self._handle.root @property @@ -1393,6 +1394,8 @@ def groups(self): """ _tables() self._check_if_open() + assert self._handle is not None # for mypy + assert _table_mod is not None # for mypy return [ g for g in self._handle.walk_groups() @@ -1437,6 +1440,9 @@ def walk(self, where="/"): """ _tables() self._check_if_open() + assert self._handle is not None # for mypy + assert _table_mod is not None # for mypy + for g in self._handle.walk_groups(where): if getattr(g._v_attrs, "pandas_type", None) is not None: continue @@ -1862,6 +1868,8 @@ def __init__( def __iter__(self): # iterate current = self.start + if self.coordinates is None: + raise ValueError("Cannot iterate until get_result is called.") while current < self.stop: stop = min(current + self.chunksize, self.stop) value = self.func(None, None, self.coordinates[current:stop]) @@ -3019,8 +3027,6 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None) vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) - elif empty_array: - self.write_array_empty(key, value) elif is_datetime64_dtype(value.dtype): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "datetime64" @@ -3035,6 +3041,8 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None) elif is_timedelta64_dtype(value.dtype): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif empty_array: + self.write_array_empty(key, value) else: self._handle.create_array(self.group, key, value) @@ -3196,7 +3204,7 @@ class Table(Fixed): pandas_kind = "wide_table" format_type: str = "table" # GH#30962 needed by dask table_type: str - levels = 1 + levels: Union[int, List[Label]] = 1 is_table = True index_axes: List[IndexCol] @@ -3292,7 +3300,9 @@ def is_multi_index(self) -> bool: """the levels attribute is 1 or a list in the case of a multi-index""" return isinstance(self.levels, list) - def validate_multiindex(self, obj): + def validate_multiindex( + self, obj: FrameOrSeriesUnion + ) -> Tuple[DataFrame, List[Label]]: """ validate that we can store the multi-index; reset and return the new object @@ -3301,11 +3311,13 @@ def validate_multiindex(self, obj): l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) ] try: - return obj.reset_index(), levels + reset_obj = obj.reset_index() except ValueError as err: raise ValueError( "duplicate names/columns in the multi-index when storing as a table" ) from err + 
assert isinstance(reset_obj, DataFrame) # for mypy + return reset_obj, levels @property def nrows_expected(self) -> int: @@ -3433,7 +3445,7 @@ def get_attrs(self): self.nan_rep = getattr(self.attrs, "nan_rep", None) self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) - self.levels = getattr(self.attrs, "levels", None) or [] + self.levels: List[Label] = getattr(self.attrs, "levels", None) or [] self.index_axes = [a for a in self.indexables if a.is_an_indexable] self.values_axes = [a for a in self.indexables if not a.is_an_indexable] @@ -4562,11 +4574,12 @@ class AppendableMultiSeriesTable(AppendableSeriesTable): def write(self, obj, **kwargs): """ we are going to write this as a frame table """ name = obj.name or "values" - obj, self.levels = self.validate_multiindex(obj) + newobj, self.levels = self.validate_multiindex(obj) + assert isinstance(self.levels, list) # for mypy cols = list(self.levels) cols.append(name) - obj.columns = cols - return super().write(obj=obj, **kwargs) + newobj.columns = Index(cols) + return super().write(obj=newobj, **kwargs) class GenericTable(AppendableFrameTable): @@ -4576,6 +4589,7 @@ class GenericTable(AppendableFrameTable): table_type = "generic_table" ndim = 2 obj_type = DataFrame + levels: List[Label] @property def pandas_type(self) -> str: @@ -4609,7 +4623,7 @@ def indexables(self): name="index", axis=0, table=self.table, meta=meta, metadata=md ) - _indexables = [index_col] + _indexables: List[Union[GenericIndexCol, GenericDataIndexableCol]] = [index_col] for i, n in enumerate(d._v_names): assert isinstance(n, str) @@ -4652,6 +4666,7 @@ def write(self, obj, data_columns=None, **kwargs): elif data_columns is True: data_columns = obj.columns.tolist() obj, self.levels = self.validate_multiindex(obj) + assert isinstance(self.levels, list) # for mypy for n in self.levels: if n not in data_columns: data_columns.insert(0, n) @@ -5173,7 +5188,7 @@ def select_coords(self): start = 0 elif start < 0: start += nrows - if self.stop is None: + if stop is None: stop = nrows elif stop < 0: stop += nrows diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55dde374048b6..cec73ceb17f09 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -378,8 +378,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d["delta"] = time_delta._values.astype(np.int64) // 1000 # microseconds if days or year: date_index = DatetimeIndex(dates) - d["year"] = date_index.year - d["month"] = date_index.month + d["year"] = date_index._data.year + d["month"] = date_index._data.month if days: days_in_ns = dates.astype(np.int64) - to_datetime( d["year"], format="%Y" @@ -469,7 +469,7 @@ class PossiblePrecisionLoss(Warning): precision_loss_doc = """ -Column converted from %s to %s, and some data are outside of the lossless +Column converted from {0} to {1}, and some data are outside of the lossless conversion range. This may result in a loss of precision in the saved data. """ @@ -543,7 +543,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: object in a DataFrame. 
""" ws = "" - # original, if small, if large + # original, if small, if large conversion_data = ( (np.bool_, np.int8, np.int8), (np.uint8, np.int8, np.int16), @@ -563,7 +563,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: dtype = c_data[1] else: dtype = c_data[2] - if c_data[2] == np.float64: # Warn if necessary + if c_data[2] == np.int64: # Warn if necessary if data[col].max() >= 2 ** 53: ws = precision_loss_doc.format("uint64", "float64") @@ -627,12 +627,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) self.text_len = 0 - self.off: List[int] = [] - self.val: List[int] = [] self.txt: List[bytes] = [] self.n = 0 # Compute lengths and setup lists of offsets and labels + offsets: List[int] = [] + values: List[int] = [] for vl in self.value_labels: category = vl[1] if not isinstance(category, str): @@ -642,9 +642,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ValueLabelTypeMismatch, ) category = category.encode(encoding) - self.off.append(self.text_len) + offsets.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding - self.val.append(vl[0]) + values.append(vl[0]) self.txt.append(category) self.n += 1 @@ -655,8 +655,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"): ) # Ensure int32 - self.off = np.array(self.off, dtype=np.int32) - self.val = np.array(self.val, dtype=np.int32) + self.off = np.array(offsets, dtype=np.int32) + self.val = np.array(values, dtype=np.int32) # Total length self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len @@ -868,26 +868,28 @@ def __init__(self): # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)])) + list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [ - (251, np.int8), - (252, np.int16), - (253, np.int32), - (254, np.float32), - (255, np.float64), + (251, np.dtype(np.int8)), + (252, np.dtype(np.int16)), + (253, np.dtype(np.int32)), + (254, np.dtype(np.float32)), + (255, np.dtype(np.float64)), ] ) self.DTYPE_MAP_XML = dict( [ - (32768, np.uint8), # Keys to GSO - (65526, np.float64), - (65527, np.float32), - (65528, np.int32), - (65529, np.int16), - (65530, np.int8), + (32768, np.dtype(np.uint8)), # Keys to GSO + (65526, np.dtype(np.float64)), + (65527, np.dtype(np.float32)), + (65528, np.dtype(np.int32)), + (65529, np.dtype(np.int16)), + (65530, np.dtype(np.int8)), ] ) - self.TYPE_MAP = list(range(251)) + list("bhlfd") + # error: Argument 1 to "list" has incompatible type "str"; + # expected "Iterable[int]" [arg-type] + self.TYPE_MAP = list(range(251)) + list("bhlfd") # type: ignore[arg-type] self.TYPE_MAP_XML = dict( [ # Not really a Q, unclear how to handle byteswap @@ -1043,9 +1045,10 @@ def __init__( self._order_categoricals = order_categoricals self._encoding = "" self._chunksize = chunksize - if self._chunksize is not None and ( - not isinstance(chunksize, int) or chunksize <= 0 - ): + self._using_iterator = False + if self._chunksize is None: + self._chunksize = 1 + elif not isinstance(chunksize, int) or chunksize <= 0: raise ValueError("chunksize must be a positive integer when set.") # State variables for the file @@ -1055,7 +1058,7 @@ def __init__( self._column_selector_set = False self._value_labels_read = False self._data_read = False - self._dtype = None + self._dtype: 
Optional[np.dtype] = None self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) @@ -1191,7 +1194,7 @@ def _read_new_header(self) -> None: # Get data type information, works for versions 117-119. def _get_dtypes( self, seek_vartypes: int - ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]: + ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]: self.path_or_buf.seek(seek_vartypes) raw_typlist = [ @@ -1516,11 +1519,8 @@ def _read_strls(self) -> None: self.GSO[str(v_o)] = decoded_va def __next__(self) -> DataFrame: - if self._chunksize is None: - raise ValueError( - "chunksize must be set to a positive integer to use as an iterator." - ) - return self.read(nrows=self._chunksize or 1) + self._using_iterator = True + return self.read(nrows=self._chunksize) def get_chunk(self, size: Optional[int] = None) -> DataFrame: """ @@ -1688,11 +1688,15 @@ def any_startswith(x: str) -> bool: convert = False for col in data: dtype = data[col].dtype - if dtype in (np.float16, np.float32): - dtype = np.float64 + if dtype in (np.dtype(np.float16), np.dtype(np.float32)): + dtype = np.dtype(np.float64) convert = True - elif dtype in (np.int8, np.int16, np.int32): - dtype = np.int64 + elif dtype in ( + np.dtype(np.int8), + np.dtype(np.int16), + np.dtype(np.int32), + ): + dtype = np.dtype(np.int64) convert = True retyped_data.append((col, data[col].astype(dtype))) if convert: @@ -1804,14 +1808,14 @@ def _do_convert_categoricals( keys = np.array(list(vl.keys())) column = data[col] key_matches = column.isin(keys) - if self._chunksize is not None and key_matches.all(): - initial_categories = keys + if self._using_iterator and key_matches.all(): + initial_categories: Optional[np.ndarray] = keys # If all categories are in the keys and we are iterating, # use the same keys for all chunks. If some are missing # value labels, then we will fall back to the categories # varying across chunks. else: - if self._chunksize is not None: + if self._using_iterator: # warn is using an iterator warnings.warn( categorical_conversion_warning, CategoricalConversionWarning @@ -2022,7 +2026,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: "ty", "%ty", ]: - return np.float64 # Stata expects doubles for SIFs + return np.dtype(np.float64) # Stata expects doubles for SIFs else: raise NotImplementedError(f"Format {fmt} not implemented") diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d02f12a8e1029..e0e35e31d22ac 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -678,15 +678,25 @@ class PlotAccessor(PandasObject): ylim : 2-tuple/list Set the y limits of the current axes. xlabel : label, optional - Name to use for the xlabel on x-axis. Default uses index name as xlabel. + Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the + x-column name for planar plots. .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 + + Now applicable to planar plots (`scatter`, `hexbin`). + ylabel : label, optional - Name to use for the ylabel on y-axis. Default will show no ylabel. + Name to use for the ylabel on y-axis. Default will show no ylabel, or the + y-column name for planar plots. .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 + + Now applicable to planar plots (`scatter`, `hexbin`). + rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots). 
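
A short sketch of the documented label behavior (not part of the patch; assumes matplotlib is installed as the plotting backend):

    import pandas as pd

    df = pd.DataFrame({"length": [1.5, 0.5, 1.2], "width": [0.7, 0.2, 0.6]})

    # Planar plots now default the axis labels to the x/y column names,
    # while explicitly passed labels still take precedence.
    ax = df.plot.scatter(x="length", y="width", xlabel="Length (cm)")
    assert ax.get_xlabel() == "Length (cm)"
    assert ax.get_ylabel() == "width"  # falls back to the y-column name
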
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index f806325d60eca..6c9924e0ada79 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -82,6 +82,8 @@ def _kind(self): _default_rot = 0 orientation: Optional[str] = None + axes: np.ndarray # of Axes objects + def __init__( self, data, @@ -177,7 +179,7 @@ def __init__( self.ax = ax self.fig = fig - self.axes = None + self.axes = np.array([], dtype=object) # "real" version get set in `generate` # parse errorbar input if given xerr = kwds.pop("xerr", None) @@ -697,7 +699,7 @@ def _get_ax_layer(cls, ax, primary=True): else: return getattr(ax, "right_ax", ax) - def _get_ax(self, i): + def _get_ax(self, i: int): # get the twinx ax if appropriate if self.subplots: ax = self.axes[i] @@ -922,8 +924,10 @@ def nseries(self) -> int: def _post_plot_logic(self, ax: "Axes", data): x, y = self.x, self.y - ax.set_ylabel(pprint_thing(y)) - ax.set_xlabel(pprint_thing(x)) + xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x) + ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) def _plot_colorbar(self, ax: "Axes", **kwds): # Addresses issues #10611 and #10678: diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 832957dd73ec7..bec1f48f5e64a 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -401,11 +401,11 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: +def flatten_axes(axes: Union["Axes", Sequence["Axes"]]) -> np.ndarray: if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, ABCIndexClass)): - return axes.ravel() + return np.asarray(axes).ravel() return np.array(axes) diff --git a/pandas/tests/arithmetic/test_categorical.py b/pandas/tests/arithmetic/test_categorical.py new file mode 100644 index 0000000000000..a978f763fbaaa --- /dev/null +++ b/pandas/tests/arithmetic/test_categorical.py @@ -0,0 +1,12 @@ +import numpy as np + +from pandas import Categorical, Series +import pandas._testing as tm + + +class TestCategoricalComparisons: + def test_categorical_nan_equality(self): + cat = Series(Categorical(["a", "b", "c", np.nan])) + expected = Series([True, True, True, False]) + result = cat == cat + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index c0ae36017f47a..f9dd4a7445a99 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -140,11 +140,11 @@ def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): xbox = box if box not in [pd.Index, pd.array] else np.ndarray ts = pd.Timestamp.now(tz) - ser = pd.Series([ts, pd.NaT]) + ser = Series([ts, pd.NaT]) obj = tm.box_expected(ser, box) - expected = pd.Series([True, False], dtype=np.bool_) + expected = Series([True, False], dtype=np.bool_) expected = tm.box_expected(expected, xbox) result = obj == ts @@ -278,7 +278,7 @@ def test_series_comparison_scalars(self, val): def test_timestamp_compare_series(self, left, right): # see gh-4982 # Make sure we can compare Timestamps on the right AND left hand side. 
- ser = pd.Series(pd.date_range("20010101", periods=10), name="dates") + ser = Series(pd.date_range("20010101", periods=10), name="dates") s_nat = ser.copy(deep=True) ser[0] = pd.Timestamp("nat") @@ -313,7 +313,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array): box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray ) - ser = pd.Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) + ser = Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) ser = tm.box_expected(ser, box_with_array) result = ser != ser @@ -973,7 +973,7 @@ def test_dt64arr_sub_timestamp(self, box_with_array): ser = tm.box_expected(ser, box_with_array) - delta_series = pd.Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")]) + delta_series = Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")]) expected = tm.box_expected(delta_series, box_with_array) tm.assert_equal(ser - ts, expected) @@ -985,7 +985,7 @@ def test_dt64arr_sub_NaT(self, box_with_array): ser = tm.box_expected(dti, box_with_array) result = ser - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") + expected = Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -993,7 +993,7 @@ def test_dt64arr_sub_NaT(self, box_with_array): ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") + expected = Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1606,7 +1606,7 @@ def test_dt64_series_arith_overflow(self): dt = pd.Timestamp("1700-01-31") td = pd.Timedelta("20000 Days") dti = pd.date_range("1949-09-30", freq="100Y", periods=4) - ser = pd.Series(dti) + ser = Series(dti) msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): ser - dt @@ -1618,7 +1618,7 @@ def test_dt64_series_arith_overflow(self): td + ser ser.iloc[-1] = pd.NaT - expected = pd.Series( + expected = Series( ["2004-10-03", "2104-10-04", "2204-10-04", "NaT"], dtype="datetime64[ns]" ) res = ser + td @@ -1627,9 +1627,7 @@ def test_dt64_series_arith_overflow(self): tm.assert_series_equal(res, expected) ser.iloc[1:] = pd.NaT - expected = pd.Series( - ["91279 Days", "NaT", "NaT", "NaT"], dtype="timedelta64[ns]" - ) + expected = Series(["91279 Days", "NaT", "NaT", "NaT"], dtype="timedelta64[ns]") res = ser - dt tm.assert_series_equal(res, expected) res = dt - ser @@ -1830,8 +1828,8 @@ def test_dt64tz_series_sub_dtitz(self): # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series # (with same tz) raises, fixed by #19024 dti = pd.date_range("1999-09-30", periods=10, tz="US/Pacific") - ser = pd.Series(dti) - expected = pd.Series(pd.TimedeltaIndex(["0days"] * 10)) + ser = Series(dti) + expected = Series(pd.TimedeltaIndex(["0days"] * 10)) res = dti - ser tm.assert_series_equal(res, expected) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 03cc4fe2bdcb5..30a23d8563ef8 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -290,6 +290,6 @@ def test_index_series_compat(self, op, constructor, expected_type, assert_func): def test_comparison_operations(self, scalars): # GH #28981 expected = Series([False, False]) - s = pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval") + s = Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval") result = s == 
scalars
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index 04ba41307d0ef..1418aec015b92 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -42,7 +42,7 @@ def adjust_negative_zero(zero, expected):
 # TODO: remove this kludge once mypy stops giving false positives here
 # List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex]
 # See GH#29725
-ser_or_index: List[Any] = [pd.Series, pd.Index]
+ser_or_index: List[Any] = [Series, Index]
 lefts: List[Any] = [pd.RangeIndex(10, 40, 10)]
 lefts.extend(
     [
@@ -59,14 +59,14 @@ def adjust_negative_zero(zero, expected):
 class TestNumericComparisons:
     def test_operator_series_comparison_zerorank(self):
         # GH#13006
-        result = np.float64(0) > pd.Series([1, 2, 3])
-        expected = 0.0 > pd.Series([1, 2, 3])
+        result = np.float64(0) > Series([1, 2, 3])
+        expected = 0.0 > Series([1, 2, 3])
         tm.assert_series_equal(result, expected)
-        result = pd.Series([1, 2, 3]) < np.float64(0)
-        expected = pd.Series([1, 2, 3]) < 0.0
+        result = Series([1, 2, 3]) < np.float64(0)
+        expected = Series([1, 2, 3]) < 0.0
         tm.assert_series_equal(result, expected)
-        result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2])
-        expected = 0.0 > pd.Series([1, 2, 3])
+        result = np.array([0, 1, 2])[0] > Series([0, 1, 2])
+        expected = 0.0 > Series([1, 2, 3])
         tm.assert_series_equal(result, expected)

     def test_df_numeric_cmp_dt64_raises(self):
@@ -92,22 +92,22 @@ def test_df_numeric_cmp_dt64_raises(self):
     def test_compare_invalid(self):
         # GH#8058
         # ops testing
-        a = pd.Series(np.random.randn(5), name=0)
-        b = pd.Series(np.random.randn(5))
+        a = Series(np.random.randn(5), name=0)
+        b = Series(np.random.randn(5))
         b.name = pd.Timestamp("2000-01-01")
         tm.assert_series_equal(a / b, 1 / (b / a))

     def test_numeric_cmp_string_numexpr_path(self, box_with_array):
         # GH#36377, GH#35700
         box = box_with_array
-        xbox = box if box is not pd.Index else np.ndarray
+        xbox = box if box is not Index else np.ndarray

-        obj = pd.Series(np.random.randn(10 ** 5))
+        obj = Series(np.random.randn(10 ** 5))
         obj = tm.box_expected(obj, box, transpose=False)

         result = obj == "a"

-        expected = pd.Series(np.zeros(10 ** 5, dtype=bool))
+        expected = Series(np.zeros(10 ** 5, dtype=bool))
         expected = tm.box_expected(expected, xbox, transpose=False)
         tm.assert_equal(result, expected)

@@ -126,7 +126,7 @@ def test_numeric_cmp_string_numexpr_path(self, box_with_array):
 class TestNumericArraylikeArithmeticWithDatetimeLike:

     # TODO: also check name retention
-    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
+    @pytest.mark.parametrize("box_cls", [np.array, Index, Series])
     @pytest.mark.parametrize(
         "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
     )
@@ -136,8 +136,8 @@ def test_mul_td64arr(self, left, box_cls):
         right = box_cls(right)

         expected = pd.TimedeltaIndex(["10s", "40s", "90s"])
-        if isinstance(left, pd.Series) or box_cls is pd.Series:
-            expected = pd.Series(expected)
+        if isinstance(left, Series) or box_cls is Series:
+            expected = Series(expected)

         result = left * right
         tm.assert_equal(result, expected)
@@ -146,7 +146,7 @@ def test_mul_td64arr(self, left, box_cls):
         tm.assert_equal(result, expected)

     # TODO: also check name retention
-    @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
+    @pytest.mark.parametrize("box_cls", [np.array, Index, Series])
     @pytest.mark.parametrize(
         "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
     )
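
The multiplication cases above reduce to the following sketch (not part of the patch):

    import numpy as np
    import pandas as pd

    left = pd.RangeIndex(10, 40, 10)  # values [10, 20, 30]
    right = np.array([1, 2, 3], dtype="m8[s]")

    # Numeric values times a timedelta64 array broadcast elementwise into
    # timedelta64[ns] results, for every box combination under test.
    result = left * right
    expected = pd.TimedeltaIndex(["10s", "40s", "90s"])
    pd.testing.assert_index_equal(result, expected)
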
@@ -156,8 +156,8 @@ def test_div_td64arr(self, left, box_cls): right = box_cls(right) expected = pd.TimedeltaIndex(["1s", "2s", "3s"]) - if isinstance(left, pd.Series) or box_cls is pd.Series: - expected = pd.Series(expected) + if isinstance(left, Series) or box_cls is Series: + expected = Series(expected) result = right / left tm.assert_equal(result, expected) @@ -172,16 +172,7 @@ def test_div_td64arr(self, left, box_cls): with pytest.raises(TypeError, match=msg): left // right - # TODO: de-duplicate with test_numeric_arr_mul_tdscalar - def test_ops_series(self): - # regression test for G#H8813 - td = Timedelta("1 day") - other = pd.Series([1, 2]) - expected = pd.Series(pd.to_timedelta(["1 day", "2 days"])) - tm.assert_series_equal(expected, td * other) - tm.assert_series_equal(expected, other * td) - - # TODO: also test non-nanosecond timedelta64 and Tick objects; + # TODO: also test Tick objects; # see test_numeric_arr_rdiv_tdscalar for note on these failing @pytest.mark.parametrize( "scalar_td", @@ -189,6 +180,8 @@ def test_ops_series(self): Timedelta(days=1), Timedelta(days=1).to_timedelta64(), Timedelta(days=1).to_pytimedelta(), + Timedelta(days=1).to_timedelta64().astype("timedelta64[s]"), + Timedelta(days=1).to_timedelta64().astype("timedelta64[ms]"), ], ids=lambda x: type(x).__name__, ) @@ -196,7 +189,7 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array): # GH#19333 box = box_with_array index = numeric_idx - expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(5)]) + expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(len(index))]) index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) @@ -325,7 +318,7 @@ class TestDivisionByZero: def test_div_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) # We only adjust for Index, because Series does not yet apply # the adjustment correctly. expected2 = adjust_negative_zero(zero, expected) @@ -338,7 +331,7 @@ def test_div_zero(self, zero, numeric_idx): def test_floordiv_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) # We only adjust for Index, because Series does not yet apply # the adjustment correctly. 
expected2 = adjust_negative_zero(zero, expected) @@ -351,7 +344,7 @@ def test_floordiv_zero(self, zero, numeric_idx): def test_mod_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) result = idx % zero tm.assert_index_equal(result, expected) ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8") @@ -360,8 +353,8 @@ def test_mod_zero(self, zero, numeric_idx): def test_divmod_zero(self, zero, numeric_idx): idx = numeric_idx - exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) - exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) exleft = adjust_negative_zero(zero, exleft) result = divmod(idx, zero) @@ -375,9 +368,7 @@ def test_div_negative_zero(self, zero, numeric_idx, op): return idx = numeric_idx - 3 - expected = pd.Index( - [-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64 - ) + expected = Index([-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64) expected = adjust_negative_zero(zero, expected) result = op(idx, zero) @@ -409,8 +400,8 @@ def test_ser_div_ser(self, dtype1, any_real_dtype): def test_ser_divmod_zero(self, dtype1, any_real_dtype): # GH#26987 dtype2 = any_real_dtype - left = pd.Series([1, 1]).astype(dtype1) - right = pd.Series([0, 2]).astype(dtype2) + left = Series([1, 1]).astype(dtype1) + right = Series([0, 2]).astype(dtype2) # GH#27321 pandas convention is to set 1 // 0 to np.inf, as opposed # to numpy which sets to np.nan; patch `expected[0]` below @@ -429,8 +420,8 @@ def test_ser_divmod_zero(self, dtype1, any_real_dtype): tm.assert_series_equal(result[1], expected[1]) def test_ser_divmod_inf(self): - left = pd.Series([np.inf, 1.0]) - right = pd.Series([np.inf, 2.0]) + left = Series([np.inf, 1.0]) + right = Series([np.inf, 2.0]) expected = left // right, left % right result = divmod(left, right) @@ -487,8 +478,8 @@ def test_df_div_zero_df(self): df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df / df - first = pd.Series([1.0, 1.0, 1.0, 1.0]) - second = pd.Series([np.nan, np.nan, np.nan, 1]) + first = Series([1.0, 1.0, 1.0, 1.0]) + second = Series([np.nan, np.nan, np.nan, 1]) expected = pd.DataFrame({"first": first, "second": second}) tm.assert_frame_equal(result, expected) @@ -496,8 +487,8 @@ def test_df_div_zero_array(self): # integer div, but deal with the 0's (GH#9144) df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) - first = pd.Series([1.0, 1.0, 1.0, 1.0]) - second = pd.Series([np.nan, np.nan, np.nan, 1]) + first = Series([1.0, 1.0, 1.0, 1.0]) + second = Series([np.nan, np.nan, np.nan, 1]) expected = pd.DataFrame({"first": first, "second": second}) with np.errstate(all="ignore"): @@ -537,8 +528,8 @@ def test_df_mod_zero_df(self): # this is technically wrong, as the integer portion is coerced to float # ### - first = pd.Series([0, 0, 0, 0], dtype="float64") - second = pd.Series([np.nan, np.nan, np.nan, 0]) + first = Series([0, 0, 0, 0], dtype="float64") + second = Series([np.nan, np.nan, np.nan, 0]) expected = pd.DataFrame({"first": first, "second": second}) result = df % df tm.assert_frame_equal(result, expected) @@ -549,8 +540,8 @@ def test_df_mod_zero_array(self): # this is technically wrong, as the integer portion is coerced 
to float # ### - first = pd.Series([0, 0, 0, 0], dtype="float64") - second = pd.Series([np.nan, np.nan, np.nan, 0]) + first = Series([0, 0, 0, 0], dtype="float64") + second = Series([np.nan, np.nan, np.nan, 0]) expected = pd.DataFrame({"first": first, "second": second}) # numpy has a slightly different (wrong) treatment @@ -815,36 +806,50 @@ class TestAdditionSubtraction: # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__ # for non-timestamp/timedelta/period dtypes - # TODO: This came from series.test.test_operators, needs cleanup - def test_arith_ops_df_compat(self): + @pytest.mark.parametrize( + "first, second, expected", + [ + ( + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2], index=list("ABD"), name="x"), + Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x"), + ), + ( + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2, 2], index=list("ABCD"), name="x"), + Series([3, 4, 5, np.nan], index=list("ABCD"), name="x"), + ), + ], + ) + def test_add_series(self, first, second, expected): # GH#1134 - s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - - exp = pd.Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x") - tm.assert_series_equal(s1 + s2, exp) - tm.assert_series_equal(s2 + s1, exp) + tm.assert_series_equal(first + second, expected) + tm.assert_series_equal(second + first, expected) - exp = pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")) - tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) - tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) - - # different length - s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - - exp = pd.Series([3, 4, 5, np.nan], index=list("ABCD"), name="x") - tm.assert_series_equal(s3 + s4, exp) - tm.assert_series_equal(s4 + s3, exp) - - exp = pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")) - tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) - tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) + @pytest.mark.parametrize( + "first, second, expected", + [ + ( + pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")), + pd.DataFrame({"x": [2, 2, 2]}, index=list("ABD")), + pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")), + ), + ( + pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")), + pd.DataFrame({"x": [2, 2, 2, 2]}, index=list("ABCD")), + pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")), + ), + ], + ) + def test_add_frames(self, first, second, expected): + # GH#1134 + tm.assert_frame_equal(first + second, expected) + tm.assert_frame_equal(second + first, expected) # TODO: This came from series.test.test_operators, needs cleanup def test_series_frame_radd_bug(self): # GH#353 - vals = pd.Series(tm.rands_array(5, 10)) + vals = Series(tm.rands_array(5, 10)) result = "foo_" + vals expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) @@ -869,14 +874,14 @@ def test_series_frame_radd_bug(self): # TODO: This came from series.test.test_operators, needs cleanup def test_datetime64_with_index(self): # arithmetic integer ops with an index - ser = pd.Series(np.random.randn(5)) + ser = Series(np.random.randn(5)) expected = ser - ser.index.to_series() result = ser - ser.index tm.assert_series_equal(result, expected) # GH#4629 # arithmetic datetime64 ops with an index - ser = pd.Series( + ser = Series( pd.date_range("20130101", periods=5), index=pd.date_range("20130101", 
periods=5), ) @@ -903,7 +908,7 @@ def test_frame_operators(self, float_frame): frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"]) garbage = np.random.random(4) - colSeries = pd.Series(garbage, index=np.array(frame.columns)) + colSeries = Series(garbage, index=np.array(frame.columns)) idSum = frame + frame seriesSum = frame + colSeries @@ -1026,8 +1031,8 @@ def test_series_divmod_zero(self): other = tser * 0 result = divmod(tser, other) - exp1 = pd.Series([np.inf] * len(tser), index=tser.index, name="ts") - exp2 = pd.Series([np.nan] * len(tser), index=tser.index, name="ts") + exp1 = Series([np.inf] * len(tser), index=tser.index, name="ts") + exp2 = Series([np.nan] * len(tser), index=tser.index, name="ts") tm.assert_series_equal(result[0], exp1) tm.assert_series_equal(result[1], exp2) @@ -1035,10 +1040,10 @@ def test_series_divmod_zero(self): class TestUFuncCompat: @pytest.mark.parametrize( "holder", - [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.RangeIndex, pd.Series], + [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.RangeIndex, Series], ) def test_ufunc_compat(self, holder): - box = pd.Series if holder is pd.Series else pd.Index + box = Series if holder is Series else Index if holder is pd.RangeIndex: idx = pd.RangeIndex(0, 5) @@ -1049,11 +1054,11 @@ def test_ufunc_compat(self, holder): tm.assert_equal(result, expected) @pytest.mark.parametrize( - "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, Series] ) def test_ufunc_coercions(self, holder): idx = holder([1, 2, 3, 4, 5], name="x") - box = pd.Series if holder is pd.Series else pd.Index + box = Series if holder is Series else Index result = np.sqrt(idx) assert result.dtype == "f8" and isinstance(result, box) @@ -1093,11 +1098,11 @@ def test_ufunc_coercions(self, holder): tm.assert_equal(result, exp) @pytest.mark.parametrize( - "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, Series] ) def test_ufunc_multiple_return_values(self, holder): obj = holder([1, 2, 3], name="x") - box = pd.Series if holder is pd.Series else pd.Index + box = Series if holder is Series else Index result = np.modf(obj) assert isinstance(result, tuple) @@ -1107,9 +1112,9 @@ def test_ufunc_multiple_return_values(self, holder): tm.assert_equal(result[1], tm.box_expected(exp2, box)) def test_ufunc_at(self): - s = pd.Series([0, 1, 2], index=[1, 2, 3], name="x") + s = Series([0, 1, 2], index=[1, 2, 3], name="x") np.add.at(s, [0, 2], 10) - expected = pd.Series([10, 1, 12], index=[1, 2, 3], name="x") + expected = Series([10, 1, 12], index=[1, 2, 3], name="x") tm.assert_series_equal(s, expected) @@ -1119,8 +1124,8 @@ class TestObjectDtypeEquivalence: @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): box = box_with_array - ser = pd.Series([1, 2, 3], dtype=dtype) - expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) + ser = Series([1, 2, 3], dtype=dtype) + expected = Series([np.nan, np.nan, np.nan], dtype=dtype) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1134,8 +1139,8 @@ def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_int(self, dtype, box_with_array): box = box_with_array - ser = pd.Series([1, 2, 3], dtype=dtype) - expected = pd.Series([2, 3, 4], dtype=dtype) + ser = Series([1, 2, 3], 
dtype=dtype) + expected = Series([2, 3, 4], dtype=dtype) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1153,7 +1158,7 @@ def test_numarr_with_dtype_add_int(self, dtype, box_with_array): ) def test_operators_reverse_object(self, op): # GH#56 - arr = pd.Series(np.random.randn(10), index=np.arange(10), dtype=object) + arr = Series(np.random.randn(10), index=np.arange(10), dtype=object) result = op(1.0, arr) expected = op(1.0, arr.astype(float)) @@ -1217,9 +1222,9 @@ def test_arithmetic_with_frame_or_series(self, op): # check that we return NotImplemented when operating with Series # or DataFrame index = pd.RangeIndex(5) - other = pd.Series(np.random.randn(5)) + other = Series(np.random.randn(5)) - expected = op(pd.Series(index), other) + expected = op(Series(index), other) result = op(index, other) tm.assert_series_equal(result, expected) @@ -1292,14 +1297,14 @@ def test_numeric_compat2(self): def test_addsub_arithmetic(self, dtype, delta): # GH#8142 delta = dtype(delta) - index = pd.Index([10, 11, 12], dtype=dtype) + index = Index([10, 11, 12], dtype=dtype) result = index + delta - expected = pd.Index(index.values + delta, dtype=dtype) + expected = Index(index.values + delta, dtype=dtype) tm.assert_index_equal(result, expected) # this subtraction used to fail result = index - delta - expected = pd.Index(index.values - delta, dtype=dtype) + expected = Index(index.values - delta, dtype=dtype) tm.assert_index_equal(result, expected) tm.assert_index_equal(index + index, 2 * index) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 02cb4f4d7a606..ddd14af0918de 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -95,8 +95,8 @@ def test_add_extension_scalar(self, other, box_with_array, op): # Check that scalars satisfying is_extension_array_dtype(obj) # do not incorrectly try to dispatch to an ExtensionArray operation - arr = pd.Series(["a", "b", "c"]) - expected = pd.Series([op(x, other) for x in arr]) + arr = Series(["a", "b", "c"]) + expected = Series([op(x, other) for x in arr]) arr = tm.box_expected(arr, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -105,8 +105,8 @@ def test_add_extension_scalar(self, other, box_with_array, op): tm.assert_equal(result, expected) def test_objarr_add_str(self, box_with_array): - ser = pd.Series(["x", np.nan, "x"]) - expected = pd.Series(["xa", np.nan, "xa"]) + ser = Series(["x", np.nan, "x"]) + expected = Series(["xa", np.nan, "xa"]) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -115,8 +115,8 @@ def test_objarr_add_str(self, box_with_array): tm.assert_equal(result, expected) def test_objarr_radd_str(self, box_with_array): - ser = pd.Series(["x", np.nan, "x"]) - expected = pd.Series(["ax", np.nan, "ax"]) + ser = Series(["x", np.nan, "x"]) + expected = Series(["ax", np.nan, "ax"]) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -166,22 +166,22 @@ def test_objarr_add_invalid(self, op, box_with_array): def test_operators_na_handling(self): ser = Series(["foo", "bar", "baz", np.nan]) result = "prefix_" + ser - expected = pd.Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) + expected = Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) tm.assert_series_equal(result, expected) result = ser + "_suffix" - expected = pd.Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) + expected = 
Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) tm.assert_series_equal(result, expected) # TODO: parametrize over box @pytest.mark.parametrize("dtype", [None, object]) def test_series_with_dtype_radd_timedelta(self, dtype): # note this test is _not_ aimed at timedelta64-dtyped Series - ser = pd.Series( + ser = Series( [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], dtype=dtype, ) - expected = pd.Series( + expected = Series( [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] ) @@ -194,7 +194,7 @@ def test_series_with_dtype_radd_timedelta(self, dtype): # TODO: cleanup & parametrize over box def test_mixed_timezone_series_ops_object(self): # GH#13043 - ser = pd.Series( + ser = Series( [ pd.Timestamp("2015-01-01", tz="US/Eastern"), pd.Timestamp("2015-01-01", tz="Asia/Tokyo"), @@ -203,7 +203,7 @@ def test_mixed_timezone_series_ops_object(self): ) assert ser.dtype == object - exp = pd.Series( + exp = Series( [ pd.Timestamp("2015-01-02", tz="US/Eastern"), pd.Timestamp("2015-01-02", tz="Asia/Tokyo"), @@ -214,7 +214,7 @@ def test_mixed_timezone_series_ops_object(self): tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) # object series & object series - ser2 = pd.Series( + ser2 = Series( [ pd.Timestamp("2015-01-03", tz="US/Eastern"), pd.Timestamp("2015-01-05", tz="Asia/Tokyo"), @@ -222,27 +222,25 @@ def test_mixed_timezone_series_ops_object(self): name="xxx", ) assert ser2.dtype == object - exp = pd.Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") + exp = Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) - ser = pd.Series( + ser = Series( [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")], name="xxx", dtype=object, ) assert ser.dtype == object - exp = pd.Series( - [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx" - ) + exp = Series([pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx") tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) # TODO: cleanup & parametrize over box def test_iadd_preserves_name(self): # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name - ser = pd.Series([1, 2, 3]) + ser = Series([1, 2, 3]) ser.index.name = "foo" ser.index += 1 @@ -343,8 +341,9 @@ def _simple_new(cls, values, name=None, dtype=None): result._index_data = values result._name = name result._calls = 0 + result._reset_identity() - return result._reset_identity() + return result def __add__(self, other): self._calls += 1 diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index e78e696d00398..f02259a1c7e62 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -449,10 +449,10 @@ def _check(self, values, func, expected): assert isinstance(expected, (pd.Index, np.ndarray)) tm.assert_equal(result, expected) - s = pd.Series(values) + s = Series(values) result = func(s) - exp = pd.Series(expected, name=values.name) + exp = Series(expected, name=values.name) tm.assert_series_equal(result, exp) def test_pi_comp_period(self): @@ -1247,13 +1247,13 @@ def test_parr_add_sub_object_array(self): class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): # GH#13043 - ser = pd.Series( + ser = Series( [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], name="xxx", ) assert ser.dtype == "Period[D]" - expected = 
pd.Series( + expected = Series( [pd.Period("2015-01-02", freq="D"), pd.Period("2015-01-03", freq="D")], name="xxx", ) @@ -1272,7 +1272,7 @@ def test_ops_series_timedelta(self): def test_ops_series_period(self): # GH#13043 - ser = pd.Series( + ser = Series( [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], name="xxx", ) @@ -1281,17 +1281,17 @@ def test_ops_series_period(self): per = pd.Period("2015-01-10", freq="D") off = per.freq # dtype will be object because of original dtype - expected = pd.Series([9 * off, 8 * off], name="xxx", dtype=object) + expected = Series([9 * off, 8 * off], name="xxx", dtype=object) tm.assert_series_equal(per - ser, expected) tm.assert_series_equal(ser - per, -1 * expected) - s2 = pd.Series( + s2 = Series( [pd.Period("2015-01-05", freq="D"), pd.Period("2015-01-04", freq="D")], name="xxx", ) assert s2.dtype == "Period[D]" - expected = pd.Series([4 * off, 2 * off], name="xxx", dtype=object) + expected = Series([4 * off, 2 * off], name="xxx", dtype=object) tm.assert_series_equal(s2 - ser, expected) tm.assert_series_equal(ser - s2, -1 * expected) @@ -1304,10 +1304,10 @@ def _check(self, values, func, expected): result = func(idx) tm.assert_equal(result, expected) - ser = pd.Series(values) + ser = Series(values) result = func(ser) - exp = pd.Series(expected, name=values.name) + exp = Series(expected, name=values.name) tm.assert_series_equal(result, exp) def test_pi_ops(self): @@ -1455,7 +1455,7 @@ def test_pi_offset_errors(self): freq="D", name="idx", ) - ser = pd.Series(idx) + ser = Series(idx) # Series op is applied per Period instance, thus error is raised # from Period diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 3e979aed0551f..5f556718ea0d3 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -77,10 +77,10 @@ def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): box = box_with_array xbox = box if box not in [pd.Index, pd.array] else np.ndarray - ser = pd.Series([timedelta(days=1), timedelta(days=2)]) + ser = Series([timedelta(days=1), timedelta(days=2)]) ser = tm.box_expected(ser, box) actual = ser > td_scalar - expected = pd.Series([False, True]) + expected = Series([False, True]) expected = tm.box_expected(expected, xbox) tm.assert_equal(actual, expected) @@ -722,14 +722,14 @@ def test_timedelta_ops_with_missing_values(self): sn = pd.to_timedelta(Series([pd.NaT], dtype="m8[ns]")) - df1 = pd.DataFrame(["00:00:01"]).apply(pd.to_timedelta) - df2 = pd.DataFrame(["00:00:02"]).apply(pd.to_timedelta) + df1 = DataFrame(["00:00:01"]).apply(pd.to_timedelta) + df2 = DataFrame(["00:00:02"]).apply(pd.to_timedelta) with pytest.raises(TypeError, match=msg): # Passing datetime64-dtype data to TimedeltaIndex is no longer # supported GH#29794 - pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + DataFrame([pd.NaT]).apply(pd.to_timedelta) - dfn = pd.DataFrame([pd.NaT.value]).apply(pd.to_timedelta) + dfn = DataFrame([pd.NaT.value]).apply(pd.to_timedelta) scalar1 = pd.to_timedelta("00:00:01") scalar2 = pd.to_timedelta("00:00:02") @@ -1104,7 +1104,7 @@ def test_td64arr_sub_periodlike(self, box_with_array, tdi_freq, pi_freq): ) def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other): # vector-like others are tested in test_td64arr_add_sub_numeric_arr_invalid - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdarr = 
tm.box_expected(tdser, box_with_array) assert_invalid_addsub_type(tdarr, other) @@ -1122,7 +1122,7 @@ def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other): def test_td64arr_addsub_numeric_arr_invalid( self, box_with_array, vec, any_real_dtype ): - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdarr = tm.box_expected(tdser, box_with_array) vector = vec.astype(any_real_dtype) @@ -1556,7 +1556,7 @@ def test_tdi_mul_int_series(self, box_with_array): idx = tm.box_expected(idx, box) expected = tm.box_expected(expected, xbox) - result = idx * pd.Series(np.arange(5, dtype="int64")) + result = idx * Series(np.arange(5, dtype="int64")) tm.assert_equal(result, expected) def test_tdi_mul_float_series(self, box_with_array): @@ -1765,8 +1765,8 @@ def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box - left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]") - right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]") + left = Series([1000, 222330, 30], dtype="timedelta64[ns]") + right = Series([1000, 222330, None], dtype="timedelta64[ns]") left = tm.box_expected(left, box) right = tm.box_expected(right, box) @@ -1900,10 +1900,13 @@ def test_td64arr_mod_tdscalar(self, box_with_array, three_days): result = tdarr % three_days tm.assert_equal(result, expected) - if box_with_array is pd.DataFrame: - pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") + warn = None + if box_with_array is pd.DataFrame and isinstance(three_days, pd.DateOffset): + warn = PerformanceWarning + + with tm.assert_produces_warning(warn): + result = divmod(tdarr, three_days) - result = divmod(tdarr, three_days) tm.assert_equal(result[1], expected) tm.assert_equal(result[0], tdarr // three_days) @@ -1921,9 +1924,6 @@ def test_td64arr_mod_int(self, box_with_array): with pytest.raises(TypeError, match=msg): 2 % tdarr - if box_with_array is pd.DataFrame: - pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") - result = divmod(tdarr, 2) tm.assert_equal(result[1], expected) tm.assert_equal(result[0], tdarr // 2) @@ -1939,9 +1939,6 @@ def test_td64arr_rmod_tdscalar(self, box_with_array, three_days): result = three_days % tdarr tm.assert_equal(result, expected) - if box_with_array is pd.DataFrame: - pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") - result = divmod(three_days, tdarr) tm.assert_equal(result[1], expected) tm.assert_equal(result[0], three_days // tdarr) @@ -1991,7 +1988,7 @@ def test_td64arr_mul_td64arr_raises(self, box_with_array): def test_td64arr_mul_numeric_scalar(self, box_with_array, one): # GH#4521 # divide/multiply by integers - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) @@ -2014,7 +2011,7 @@ def test_td64arr_mul_numeric_scalar(self, box_with_array, one): def test_td64arr_div_numeric_scalar(self, box_with_array, two): # GH#4521 # divide/multiply by integers - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) @@ -2036,7 +2033,7 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, 
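For the `divmod` hunks above, which drop the DataFrame `xfail`, a minimal sketch of the invariant the updated tests assert: `divmod` on a timedelta Series agrees with floordiv and mod computed separately (plain pandas, no test harness):

    import pandas as pd

    tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
    three_days = pd.Timedelta(days=3)

    # quotient matches //, remainder matches %
    quotient, remainder = divmod(tdser, three_days)
    pd.testing.assert_series_equal(quotient, tdser // three_days)
    pd.testing.assert_series_equal(remainder, tdser % three_days)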
vector, any_real_dtype # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_dtype) expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") @@ -2060,7 +2057,7 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype) # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_dtype) expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 1eef86980f974..471e11cff9fd7 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -638,7 +638,7 @@ def test_constructor_imaginary(self): def test_constructor_string_and_tuples(self): # GH 21416 c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) - expected_index = pd.Index([("a", "b"), ("b", "a"), "c"]) + expected_index = Index([("a", "b"), ("b", "a"), "c"]) assert c.categories.equals(expected_index) def test_interval(self): diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 2c4dd8fe64057..c589b72fa2895 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -91,6 +91,22 @@ def test_setitem_tuple(self): cat[1] = cat[0] assert cat[1] == (0, 1) + def test_setitem_listlike(self): + + # GH#9469 + # properly coerce the input indexers + np.random.seed(1) + c = Categorical( + np.random.randint(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) + indexer = np.array([100000]).astype(np.int64) + c[indexer] = -1000 + + # we are asserting the code result here + # which maps to the -1000 category + result = c.codes[np.array([100000]).astype(np.int64)] + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) + class TestCategoricalIndexing: def test_getitem_slice(self): @@ -200,40 +216,38 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(exp_miss, res_miss) def test_where_unobserved_nan(self): - ser = pd.Series(pd.Categorical(["a", "b"])) + ser = Series(pd.Categorical(["a", "b"])) result = ser.where([True, False]) - expected = pd.Series(pd.Categorical(["a", None], categories=["a", "b"])) + expected = Series(pd.Categorical(["a", None], categories=["a", "b"])) tm.assert_series_equal(result, expected) # all NA - ser = pd.Series(pd.Categorical(["a", "b"])) + ser = Series(pd.Categorical(["a", "b"])) result = ser.where([False, False]) - expected = pd.Series(pd.Categorical([None, None], categories=["a", "b"])) + expected = Series(pd.Categorical([None, None], categories=["a", "b"])) tm.assert_series_equal(result, expected) def test_where_unobserved_categories(self): - ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) result = ser.where([True, True, False], other="b") - expected = pd.Series( - Categorical(["a", "b", "b"], categories=ser.cat.categories) - ) + expected = Series(Categorical(["a", "b", "b"], 
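A scaled-down sketch of what the new `test_setitem_listlike` covers (GH#9469): a list-like integer indexer into a Categorical is coerced properly, and the assigned value maps onto its category code. The small data here is illustrative; the test itself uses 150000 random int8 codes:

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(["a", "b", "a", "c"]).add_categories(["z"])
    indexer = np.array([3], dtype=np.int64)
    cat[indexer] = "z"

    # the code at the assigned position points at the new category
    assert cat[3] == "z"
    assert cat.codes[3] == cat.categories.get_loc("z")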
categories=ser.cat.categories)) tm.assert_series_equal(result, expected) def test_where_other_categorical(self): - ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"]) result = ser.where([True, False, True], other) - expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) + expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) tm.assert_series_equal(result, expected) def test_where_new_category_raises(self): - ser = pd.Series(Categorical(["a", "b", "c"])) + ser = Series(Categorical(["a", "b", "c"])) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ser.where([True, False, True], "d") def test_where_ordered_differs_rasies(self): - ser = pd.Series( + ser = Series( Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True) ) other = Categorical( diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index 2767d93741d4c..baf60a363ad29 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import IS64 + import pandas as pd import pandas._testing as tm @@ -71,6 +73,7 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) +@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system") @pytest.mark.parametrize( "pandasmethname, kwargs", [ diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index f22d958dc88e3..463196eaa36bf 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,7 +4,7 @@ import pytest import pytz -from pandas._libs import OutOfBoundsDatetime +from pandas._libs import OutOfBoundsDatetime, Timestamp from pandas.compat.numpy import np_version_under1p18 import pandas as pd @@ -324,6 +324,15 @@ def test_getitem_2d(self, arr1d): expected = arr1d[-1] assert result == expected + def test_iter_2d(self, arr1d): + data2d = arr1d._data[:3, np.newaxis] + arr2d = type(arr1d)._simple_new(data2d, dtype=arr1d.dtype) + result = list(arr2d) + for x in result: + assert isinstance(x, type(arr1d)) + assert x.ndim == 1 + assert x.dtype == arr1d.dtype + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") @@ -390,7 +399,7 @@ def test_setitem_raises(self): with pytest.raises(IndexError, match="index 12 is out of bounds"): arr[12] = val - with pytest.raises(TypeError, match="'value' should be a.* 'object'"): + with pytest.raises(TypeError, match="value should be a.* 'object'"): arr[0] = object() msg = "cannot set using a list-like indexer with a different length" @@ -407,7 +416,10 @@ def test_setitem_raises(self): def test_setitem_numeric_raises(self, arr1d, box): # We dont case e.g. int64 to our own dtype for setitem - msg = "requires compatible dtype" + msg = ( + f"value should be a '{arr1d._scalar_type.__name__}', " + "'NaT', or array of those. 
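The `where` hunks above pin down two behaviors worth seeing side by side; a sketch assuming the tested behavior, with an illustrative out-of-category value "z":

    import pandas as pd
    import pytest

    ser = pd.Series(pd.Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))

    # replacing with an existing category keeps the categorical dtype
    result = ser.where([True, True, False], other="b")
    assert result.dtype == ser.dtype

    # a value outside the categories raises instead of upcasting to object
    msg = "Cannot setitem on a Categorical with a new category"
    with pytest.raises(ValueError, match=msg):
        ser.where([True, False, True], "z")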
Got" + ) with pytest.raises(TypeError, match=msg): arr1d[:2] = box([0, 1]) @@ -683,6 +695,15 @@ def test_take_fill_valid(self, arr1d): # require appropriate-dtype if we have a NA value arr.take([-1, 1], allow_fill=True, fill_value=value) + if arr.tz is not None: + # GH#37356 + # Assuming here that arr1d fixture does not include Australia/Melbourne + value = Timestamp.now().tz_localize("Australia/Melbourne") + msg = "Timezones don't match. .* != 'Australia/Melbourne'" + with pytest.raises(ValueError, match=msg): + # require tz match, not just tzawareness match + arr.take([-1, 1], allow_fill=True, fill_value=value) + def test_concat_same_type_invalid(self, arr1d): # different timezones arr = arr1d @@ -1020,7 +1041,7 @@ def test_casting_nat_setitem_array(array, casting_nats): ) def test_invalid_nat_setitem_array(array, non_casting_nats): msg = ( - "'value' should be a '(Timestamp|Timedelta|Period)', 'NaT', or array of those. " + "value should be a '(Timestamp|Timedelta|Period)', 'NaT', or array of those. " "Got '(timedelta64|datetime64|int)' instead." ) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9f136b4979bb7..9245eda2a71fe 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -9,6 +9,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +from pandas import NaT import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import sequence_to_dt64ns @@ -396,7 +397,7 @@ def test_searchsorted_invalid_types(self, other, index): msg = "|".join( [ "searchsorted requires compatible dtype or scalar", - "Unexpected type for 'value'", + "value should be a 'Timestamp', 'NaT', or array of those. Got", ] ) with pytest.raises(TypeError, match=msg): @@ -566,3 +567,54 @@ def test_median_2d(self, arr1d): result = arr.median(axis=1, skipna=False) expected = type(arr)._from_sequence([pd.NaT], dtype=arr.dtype) tm.assert_equal(result, expected) + + def test_mean(self, arr1d): + arr = arr1d + + # manually verified result + expected = arr[0] + 0.4 * pd.Timedelta(days=1) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) + assert result is pd.NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific") + dta = dti._data.reshape(3, 2) + + result = dta.mean(axis=0) + expected = dta[1] + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=1) + expected = dta[:, 0] + pd.Timedelta(hours=12) + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=None) + expected = dti.mean() + assert result == expected + + @pytest.mark.parametrize("skipna", [True, False]) + def test_mean_empty(self, arr1d, skipna): + arr = arr1d[:0] + + assert arr.mean(skipna=skipna) is NaT + + arr2d = arr.reshape(0, 3) + result = arr2d.mean(axis=0, skipna=skipna) + expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype) + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=1, skipna=skipna) + expected = arr # i.e. 
1D, empty + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=None, skipna=skipna) + assert result is NaT diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index b3b8f4d55e4de..95265a958c35d 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -2,7 +2,9 @@ import pytest import pandas as pd +from pandas import Timedelta import pandas._testing as tm +from pandas.core import nanops from pandas.core.arrays import TimedeltaArray @@ -136,7 +138,7 @@ def test_searchsorted_invalid_types(self, other, index): msg = "|".join( [ "searchsorted requires compatible dtype or scalar", - "Unexpected type for 'value'", + "value should be a 'Timedelta', 'NaT', or array of those. Got", ] ) with pytest.raises(TypeError, match=msg): @@ -175,7 +177,7 @@ def test_neg_freq(self): class TestReductions: - @pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"]) + @pytest.mark.parametrize("name", ["std", "min", "max", "median", "mean"]) @pytest.mark.parametrize("skipna", [True, False]) def test_reductions_empty(self, name, skipna): tdi = pd.TimedeltaIndex([]) @@ -187,6 +189,19 @@ def test_reductions_empty(self, name, skipna): result = getattr(arr, name)(skipna=skipna) assert result is pd.NaT + @pytest.mark.parametrize("skipna", [True, False]) + def test_sum_empty(self, skipna): + tdi = pd.TimedeltaIndex([]) + arr = tdi.array + + result = tdi.sum(skipna=skipna) + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + + result = arr.sum(skipna=skipna) + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + def test_min_max(self): arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) @@ -251,6 +266,30 @@ def test_npsum(self): assert isinstance(result, pd.Timedelta) assert result == expected + def test_sum_2d_skipna_false(self): + arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2) + arr[-1, -1] = "Nat" + + tda = TimedeltaArray(arr) + + result = tda.sum(skipna=False) + assert result is pd.NaT + + result = tda.sum(axis=0, skipna=False) + expected = pd.TimedeltaIndex([pd.Timedelta(seconds=12), pd.NaT])._values + tm.assert_timedelta_array_equal(result, expected) + + result = tda.sum(axis=1, skipna=False) + expected = pd.TimedeltaIndex( + [ + pd.Timedelta(seconds=1), + pd.Timedelta(seconds=5), + pd.Timedelta(seconds=9), + pd.NaT, + ] + )._values + tm.assert_timedelta_array_equal(result, expected) + def test_std(self): tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) arr = tdi.array @@ -264,12 +303,19 @@ def test_std(self): assert isinstance(result, pd.Timedelta) assert result == expected + result = nanops.nanstd(np.asarray(arr), skipna=True) + assert isinstance(result, pd.Timedelta) + assert result == expected + result = arr.std(skipna=False) assert result is pd.NaT result = tdi.std(skipna=False) assert result is pd.NaT + result = nanops.nanstd(np.asarray(arr), skipna=False) + assert result is pd.NaT + def test_median(self): tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) arr = tdi.array @@ -283,8 +329,42 @@ def test_median(self): assert isinstance(result, pd.Timedelta) assert result == expected - result = arr.std(skipna=False) + result = arr.median(skipna=False) assert result is pd.NaT - result = tdi.std(skipna=False) + result = tdi.median(skipna=False) + assert result is pd.NaT + + def test_mean(self): + tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + arr = 
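Putting the new timedelta reduction tests in one place: the empty `sum` is the additive identity rather than NaT for either `skipna` setting, while a present NaT propagates only when `skipna=False`. A sketch assuming the post-patch behavior:

    import pandas as pd

    tdi = pd.TimedeltaIndex(["1 days", "2 days", pd.NaT])

    assert pd.TimedeltaIndex([]).sum() == pd.Timedelta(0)
    assert tdi.sum() == pd.Timedelta(days=3)
    assert tdi.sum(skipna=False) is pd.NaT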
tdi._data + + # manually verified result + expected = pd.Timedelta(arr.dropna()._ndarray.mean()) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) assert result is pd.NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + tdi = pd.timedelta_range("14 days", periods=6) + tda = tdi._data.reshape(3, 2) + + result = tda.mean(axis=0) + expected = tda[1] + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=1) + expected = tda[:, 0] + pd.Timedelta(hours=12) + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=None) + expected = tdi.mean() + assert result == expected diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 26ad6fc1c6572..538cf2c78b50e 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -5,7 +5,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd -from pandas import CategoricalIndex, Series, Timedelta, Timestamp +from pandas import CategoricalIndex, Series, Timedelta, Timestamp, date_range import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, @@ -208,7 +208,7 @@ def test_iter_box(self): ], ) def test_values_consistent(array, expected_type, dtype): - l_values = pd.Series(array)._values + l_values = Series(array)._values r_values = pd.Index(array)._values assert type(l_values) is expected_type assert type(l_values) is type(r_values) @@ -218,14 +218,14 @@ def test_values_consistent(array, expected_type, dtype): @pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): - ser = pd.Series(arr) + ser = Series(arr) result = ser.array expected = PandasArray(arr) tm.assert_extension_array_equal(result, expected) def test_numpy_array_all_dtypes(any_numpy_dtype): - ser = pd.Series(dtype=any_numpy_dtype) + ser = Series(dtype=any_numpy_dtype) result = ser.array if is_datetime64_dtype(any_numpy_dtype): assert isinstance(result, DatetimeArray) @@ -336,7 +336,7 @@ def test_to_numpy(array, expected, index_or_series): def test_to_numpy_copy(arr, as_series): obj = pd.Index(arr, copy=False) if as_series: - obj = pd.Series(obj.values, copy=False) + obj = Series(obj.values, copy=False) # no copy by default result = obj.to_numpy() @@ -355,7 +355,7 @@ def test_to_numpy_dtype(as_series): tz = "US/Eastern" obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: - obj = pd.Series(obj) + obj = Series(obj) # preserve tz by default result = obj.to_numpy() @@ -395,13 +395,13 @@ def test_to_numpy_na_value_numpy_dtype( def test_to_numpy_kwargs_raises(): # numpy - s = pd.Series([1, 2, 3]) + s = Series([1, 2, 3]) msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): s.to_numpy(foo=True) # extension - s = pd.Series([1, 2, 3], dtype="Int64") + s = Series([1, 2, 3], dtype="Int64") with pytest.raises(TypeError, match=msg): s.to_numpy(foo=True) @@ -449,3 +449,39 @@ def test_to_numpy_dataframe_single_block_no_mutate(): expected = pd.DataFrame(np.array([1.0, 2.0, np.nan])) result.to_numpy(na_value=0.0) tm.assert_frame_equal(result, expected) + + +class TestAsArray: + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_asarray_object_dt64(self, tz): + ser = Series(date_range("2000", periods=2, tz=tz)) + + with tm.assert_produces_warning(None): + # Future behavior (for tzaware case) with no warning + result = 
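The 2D `mean` test above has a nice closed-form check: with evenly spaced values the column means are exactly the middle row, and the full reduction matches the 1D `TimedeltaIndex.mean()`. A sketch assuming the post-patch axis-aware `mean`, using the same private `_data`/`reshape` route as the test:

    import pandas as pd

    tdi = pd.timedelta_range("14 days", periods=6)
    tda = tdi._data.reshape(3, 2)

    assert (tda.mean(axis=0) == tda[1]).all()
    assert tda.mean(axis=None) == tdi.mean()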
np.asarray(ser, dtype=object) + + expected = np.array( + [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)] + ) + tm.assert_numpy_array_equal(result, expected) + + def test_asarray_tz_naive(self): + # This shouldn't produce a warning. + ser = Series(date_range("2000", periods=2)) + expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") + result = np.asarray(ser) + + tm.assert_numpy_array_equal(result, expected) + + def test_asarray_tz_aware(self): + tz = "US/Central" + ser = Series(date_range("2000", periods=2, tz=tz)) + expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") + result = np.asarray(ser, dtype="datetime64[ns]") + + tm.assert_numpy_array_equal(result, expected) + + # Old behavior with no warning + result = np.asarray(ser, dtype="M8[ns]") + + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/base/test_drop_duplicates.py b/pandas/tests/base/test_drop_duplicates.py index 4032890b4db18..8cde7aae5df05 100644 --- a/pandas/tests/base/test_drop_duplicates.py +++ b/pandas/tests/base/test_drop_duplicates.py @@ -1,12 +1,14 @@ from datetime import datetime import numpy as np +import pytest import pandas as pd import pandas._testing as tm -def test_drop_duplicates_series_vs_dataframe(): +@pytest.mark.parametrize("keep", ["first", "last", False]) +def test_drop_duplicates_series_vs_dataframe(keep): # GH 14192 df = pd.DataFrame( { @@ -24,7 +26,6 @@ def test_drop_duplicates_series_vs_dataframe(): } ) for column in df.columns: - for keep in ["first", "last", False]: - dropped_frame = df[[column]].drop_duplicates(keep=keep) - dropped_series = df[column].drop_duplicates(keep=keep) - tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py deleted file mode 100644 index f8cbadb987d29..0000000000000 --- a/pandas/tests/base/test_factorize.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas._testing as tm - - -@pytest.mark.parametrize("sort", [True, False]) -def test_factorize(index_or_series_obj, sort): - obj = index_or_series_obj - result_codes, result_uniques = obj.factorize(sort=sort) - - constructor = pd.Index - if isinstance(obj, pd.MultiIndex): - constructor = pd.MultiIndex.from_tuples - expected_uniques = constructor(obj.unique()) - - if sort: - expected_uniques = expected_uniques.sort_values() - - # construct an integer ndarray so that - # `expected_uniques.take(expected_codes)` is equal to `obj` - expected_uniques_list = list(expected_uniques) - expected_codes = [expected_uniques_list.index(val) for val in obj] - expected_codes = np.asarray(expected_codes, dtype=np.intp) - - tm.assert_numpy_array_equal(result_codes, expected_codes) - tm.assert_index_equal(result_uniques, expected_uniques) - - -def test_series_factorize_na_sentinel_none(): - # GH35667 - values = np.array([1, 2, 1, np.nan]) - ser = pd.Series(values) - codes, uniques = ser.factorize(na_sentinel=None) - - expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) - expected_uniques = pd.Index([1.0, 2.0, np.nan]) - - tm.assert_numpy_array_equal(codes, expected_codes) - tm.assert_index_equal(uniques, expected_uniques) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 2dc2fe6d2ad07..298f8ca4f0f73 100644 --- 
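The `drop_duplicates` hunk shows the refactor pattern used throughout this patch: hoist an inner for-loop into `@pytest.mark.parametrize` so each case reports separately. A minimal sketch with illustrative data:

    import pandas as pd
    import pytest

    @pytest.mark.parametrize("keep", ["first", "last", False])
    def test_drop_duplicates_series_vs_dataframe(keep):
        df = pd.DataFrame({"a": [1, 1, 2]})
        dropped_frame = df[["a"]].drop_duplicates(keep=keep)
        dropped_series = df["a"].drop_duplicates(keep=keep)
        pd.testing.assert_frame_equal(dropped_frame, dropped_series.to_frame())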
a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.compat import PYPY +from pandas.compat import IS64, PYPY from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -14,7 +14,6 @@ import pandas as pd from pandas import DataFrame, Index, IntervalIndex, Series -import pandas._testing as tm @pytest.mark.parametrize( @@ -128,7 +127,10 @@ def test_memory_usage(index_or_series_obj): ) if len(obj) == 0: - expected = 0 if isinstance(obj, Index) else 80 + if isinstance(obj, Index): + expected = 0 + else: + expected = 80 if IS64 else 48 assert res_deep == res == expected elif is_object or is_categorical: # only deep will pick them up @@ -182,7 +184,7 @@ def test_access_by_position(index): elif isinstance(index, pd.MultiIndex): pytest.skip("Can't instantiate Series from MultiIndex") - series = pd.Series(index) + series = Series(index) assert index[0] == series.iloc[0] assert index[5] == series.iloc[5] assert index[-1] == series.iloc[-1] @@ -196,10 +198,3 @@ def test_access_by_position(index): msg = "single positional indexer is out-of-bounds" with pytest.raises(IndexError, match=msg): series.iloc[size] - - -def test_get_indexer_non_unique_dtype_mismatch(): - # GH 25459 - indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 73a41e7010c5f..def7e41e22fb1 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -30,10 +30,10 @@ def test_value_counts(index_or_series_obj): result = obj.value_counts() counter = collections.Counter(obj) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) expected.index = expected.index.astype(obj.dtype) if isinstance(obj, pd.MultiIndex): - expected.index = pd.Index(expected.index) + expected.index = Index(expected.index) # TODO: Order of entries with the same count is inconsistent on CI (gh-32449) if obj.duplicated().any(): @@ -67,7 +67,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, whereas None wouldn't counter = collections.Counter(obj.dropna()) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64) + expected = Series(dict(counter.most_common()), dtype=np.int64) expected.index = expected.index.astype(obj.dtype) result = obj.value_counts() @@ -80,7 +80,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # can't use expected[null_obj] = 3 as # IntervalIndex doesn't allow assignment - new_entry = pd.Series({np.nan: 3}, dtype=np.int64) + new_entry = Series({np.nan: 3}, dtype=np.int64) expected = expected.append(new_entry) result = obj.value_counts(dropna=False) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 2c5846872c341..728ffc6f85ba4 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2,13 +2,15 @@ from functools import reduce from itertools import product import operator -from typing import Dict, Type +from typing import Dict, List, Type import warnings import numpy as np from numpy.random import rand, randint, randn import pytest +from pandas.compat 
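For the `value_counts` hunks, a self-contained sketch of how the expectation is built: straight from `collections.Counter`, ordered by `most_common`, with the index cast back to the object's dtype (naming assumes the pandas of this patch, where `value_counts` leaves the result name unset):

    import collections

    import numpy as np
    import pandas as pd

    obj = pd.Series(["a", "b", "a", "a"])

    counter = collections.Counter(obj)
    expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
    expected.index = expected.index.astype(obj.dtype)

    pd.testing.assert_series_equal(obj.value_counts(), expected)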
import is_platform_windows +from pandas.compat.numpy import np_version_under1p17 from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -65,15 +67,18 @@ def ne_lt_2_6_9(): return "numexpr" -@pytest.fixture -def unary_fns_for_ne(): +def _get_unary_fns_for_ne(): if NUMEXPR_INSTALLED: if NUMEXPR_VERSION >= LooseVersion("2.6.9"): - return _unary_math_ops + return list(_unary_math_ops) else: - return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) - else: - pytest.skip("numexpr is not present") + return [x for x in _unary_math_ops if x not in ["floor", "ceil"]] + return [] + + +@pytest.fixture(params=_get_unary_fns_for_ne()) +def unary_fns_for_ne(request): + return request.param def engine_has_neg_frac(engine): @@ -114,84 +119,72 @@ def _is_py3_complex_incompat(result, expected): return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result) -_good_arith_ops = set(ARITH_OPS_SYMS).difference(SPECIAL_CASE_ARITH_OPS_SYMS) +_good_arith_ops = sorted(set(ARITH_OPS_SYMS).difference(SPECIAL_CASE_ARITH_OPS_SYMS)) + + +# TODO: using range(5) here is a kludge +@pytest.fixture(params=list(range(5))) +def lhs(request): + + nan_df1 = DataFrame(rand(10, 5)) + nan_df1[nan_df1 > 0.5] = np.nan + + opts = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), + nan_df1, + randn(), + ) + return opts[request.param] + + +rhs = lhs +midhs = lhs @td.skip_if_no_ne class TestEvalNumexprPandas: + exclude_cmp: List[str] = [] + exclude_bool: List[str] = [] + + engine = "numexpr" + parser = "pandas" + @classmethod def setup_class(cls): import numexpr as ne cls.ne = ne - cls.engine = "numexpr" - cls.parser = "pandas" - @classmethod - def teardown_class(cls): - del cls.engine, cls.parser - if hasattr(cls, "ne"): - del cls.ne - - def setup_data(self): - nan_df1 = DataFrame(rand(10, 5)) - nan_df1[nan_df1 > 0.5] = np.nan - nan_df2 = DataFrame(rand(10, 5)) - nan_df2[nan_df2 > 0.5] = np.nan - - self.pandas_lhses = ( - DataFrame(randn(10, 5)), - Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), - nan_df1, - ) - self.pandas_rhses = ( - DataFrame(randn(10, 5)), - Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), - nan_df2, - ) - self.scalar_lhses = (randn(),) - self.scalar_rhses = (randn(),) - - self.lhses = self.pandas_lhses + self.scalar_lhses - self.rhses = self.pandas_rhses + self.scalar_rhses - - def setup_ops(self): - self.cmp_ops = expr.CMP_OPS_SYMS - self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = expr.BOOL_OPS_SYMS - self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS - self.arith_ops = _good_arith_ops - self.unary_ops = "-", "~", "not " - - def setup_method(self, method): - self.setup_ops() - self.setup_data() - self.current_engines = (engine for engine in ENGINES if engine != self.engine) - - def teardown_method(self, method): - del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses - del self.pandas_rhses, self.pandas_lhses, self.current_engines - - @pytest.mark.slow + @property + def current_engines(self): + return (engine for engine in ENGINES if engine != self.engine) + @pytest.mark.parametrize( "cmp1", ["!=", "==", "<=", ">=", "<", ">"], ids=["ne", "eq", "le", "ge", "lt", "gt"], ) @pytest.mark.parametrize("cmp2", [">", "<"], ids=["gt", "lt"]) - def test_complex_cmp_ops(self, cmp1, cmp2): - for lhs, rhs, binop in product(self.lhses, self.rhses, self.bin_ops): - lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) - rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) - expected = 
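The `lhs`/`rhs` fixtures above replace the old `setup_data` tuples; a parametrized fixture makes every test that requests it run once per option. A generic sketch of the pattern (fixture name and options here are illustrative):

    import numpy as np
    import pytest

    @pytest.fixture(params=list(range(3)))
    def operand(request):
        opts = (np.random.randn(10, 5), np.random.randn(5), np.random.randn())
        return opts[request.param]

    def test_operand_ndim(operand):
        assert np.ndim(operand) in (0, 1, 2)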
_eval_single_bin(lhs_new, binop, rhs_new, self.engine) + @pytest.mark.parametrize("binop", expr.BOOL_OPS_SYMS) + def test_complex_cmp_ops(self, cmp1, cmp2, binop, lhs, rhs): + if binop in self.exclude_bool: + pytest.skip() - ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" - result = pd.eval(ex, engine=self.engine, parser=self.parser) - self.check_equal(result, expected) + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + + ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + self.check_equal(result, expected) + + @pytest.mark.parametrize("cmp_op", expr.CMP_OPS_SYMS) + def test_simple_cmp_ops(self, cmp_op): + if cmp_op in self.exclude_cmp: + pytest.skip() - def test_simple_cmp_ops(self): bool_lhses = ( DataFrame(tm.randbool(size=(10, 5))), Series(tm.randbool((5,))), @@ -202,46 +195,58 @@ def test_simple_cmp_ops(self): Series(tm.randbool((5,))), tm.randbool(), ) - for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): + for lhs, rhs in product(bool_lhses, bool_rhses): self.check_simple_cmp_op(lhs, cmp_op, rhs) - @pytest.mark.slow - def test_binary_arith_ops(self): - for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): - self.check_binary_arith_op(lhs, op, rhs) + @pytest.mark.parametrize("op", _good_arith_ops) + def test_binary_arith_ops(self, op, lhs, rhs, request): + + if ( + op == "/" + and isinstance(lhs, DataFrame) + and isinstance(rhs, DataFrame) + and not lhs.isna().any().any() + and rhs.shape == (10, 5) + and np_version_under1p17 + and is_platform_windows() + and compat.PY38 + ): + mark = pytest.mark.xfail( + reason="GH#37328 floating point precision on Windows builds" + ) + request.node.add_marker(mark) + + self.check_binary_arith_op(lhs, op, rhs) - def test_modulus(self): - for lhs, rhs in product(self.lhses, self.rhses): - self.check_modulus(lhs, "%", rhs) + def test_modulus(self, lhs, rhs): + self.check_modulus(lhs, "%", rhs) - def test_floor_division(self): - for lhs, rhs in product(self.lhses, self.rhses): - self.check_floor_division(lhs, "//", rhs) + def test_floor_division(self, lhs, rhs): + self.check_floor_division(lhs, "//", rhs) @td.skip_if_windows - def test_pow(self): + def test_pow(self, lhs, rhs): # odd failure on win32 platform, so skip - for lhs, rhs in product(self.lhses, self.rhses): - self.check_pow(lhs, "**", rhs) - - @pytest.mark.slow - def test_single_invert_op(self): - for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): - self.check_single_invert_op(lhs, op, rhs) - - @pytest.mark.slow - def test_compound_invert_op(self): - for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): - self.check_compound_invert_op(lhs, op, rhs) - - @pytest.mark.slow - def test_chained_cmp_op(self): - mids = self.lhses - cmp_ops = "<", ">" - for lhs, cmp1, mid, cmp2, rhs in product( - self.lhses, cmp_ops, mids, cmp_ops, self.rhses - ): - self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + self.check_pow(lhs, "**", rhs) + + @pytest.mark.parametrize("op", expr.CMP_OPS_SYMS) + def test_single_invert_op(self, op, lhs): + if op in self.exclude_cmp: + pytest.skip() + + self.check_single_invert_op(lhs, op) + + @pytest.mark.parametrize("op", expr.CMP_OPS_SYMS) + def test_compound_invert_op(self, op, lhs, rhs): + if op in self.exclude_cmp: + pytest.skip() + + self.check_compound_invert_op(lhs, op, rhs) + + 
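The GH#37328 handling in `test_binary_arith_ops` uses runtime `xfail` attachment rather than skipping the whole test, so only the one parameter combination that hits the precision issue is marked. A minimal sketch of the mechanism (the test body is illustrative):

    import pytest

    @pytest.mark.parametrize("n", [1, 2])
    def test_example(n, request):
        if n == 2:
            mark = pytest.mark.xfail(reason="illustrative known failure")
            request.node.add_marker(mark)
        assert n == 1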
@pytest.mark.parametrize("cmp1", ["<", ">"]) + @pytest.mark.parametrize("cmp2", ["<", ">"]) + def test_chained_cmp_op(self, cmp1, cmp2, lhs, midhs, rhs): + self.check_chained_cmp_op(lhs, cmp1, midhs, cmp2, rhs) def check_equal(self, result, expected): if isinstance(result, DataFrame): @@ -311,10 +316,13 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: - # direct numpy comparison expected = self.ne.evaluate(f"nlhs {op} ghs") - tm.assert_numpy_array_equal(result.values, expected) + # Update assert statement due to unreliable numerical + # precision component (GH37328) + # TODO: update testing code so that assert_almost_equal statement + # can be replaced again by the assert_numpy_array_equal statement + tm.assert_almost_equal(result.values, expected) # modulus, pow, and floor division require special casing @@ -388,21 +396,20 @@ def check_pow(self, lhs, arith1, rhs): ) tm.assert_almost_equal(result, expected) - def check_single_invert_op(self, lhs, cmp1, rhs): + def check_single_invert_op(self, elem, cmp1): # simple - for el in (lhs, rhs): - try: - elb = el.astype(bool) - except AttributeError: - elb = np.array([bool(el)]) - expected = ~elb - result = pd.eval("~elb", engine=self.engine, parser=self.parser) - tm.assert_almost_equal(expected, result) - - for engine in self.current_engines: - tm.assert_almost_equal( - result, pd.eval("~elb", engine=engine, parser=self.parser) - ) + try: + elb = elem.astype(bool) + except AttributeError: + elb = np.array([bool(elem)]) + expected = ~elb + result = pd.eval("~elb", engine=self.engine, parser=self.parser) + tm.assert_almost_equal(expected, result) + + for engine in self.current_engines: + tm.assert_almost_equal( + result, pd.eval("~elb", engine=engine, parser=self.parser) + ) def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = ["in", "not in"] @@ -667,7 +674,7 @@ def test_unary_in_array(self): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_float_comparison_bin_op(self, dtype): # GH 16363 - df = pd.DataFrame({"x": np.array([0], dtype=dtype)}) + df = DataFrame({"x": np.array([0], dtype=dtype)}) res = df.eval("x < -0.1") assert res.values == np.array([False]) @@ -734,7 +741,7 @@ def test_float_truncation(self): expected = np.float64(exp) assert result == expected - df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) cutoff = 1000000000.0006 result = df.query(f"A < {cutoff:.4f}") assert result.empty @@ -751,12 +758,12 @@ def test_float_truncation(self): def test_disallow_python_keywords(self): # GH 18221 - df = pd.DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"]) + df = DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"]) msg = "Python keyword not valid identifier in numexpr query" with pytest.raises(SyntaxError, match=msg): df.query("class == 0") - df = pd.DataFrame() + df = DataFrame() df.index.name = "lambda" with pytest.raises(SyntaxError, match=msg): df.query("lambda == 0") @@ -764,22 +771,18 @@ def test_disallow_python_keywords(self): @td.skip_if_no_ne class TestEvalNumexprPython(TestEvalNumexprPandas): + exclude_cmp = ["in", "not in"] + exclude_bool = ["and", "or"] + + engine = "numexpr" + parser = "python" + @classmethod def setup_class(cls): super().setup_class() import numexpr as ne cls.ne = ne - cls.engine = "numexpr" - cls.parser = "python" - - def setup_ops(self): - self.cmp_ops = [op for op in expr.CMP_OPS_SYMS if op 
not in ("in", "not in")] - self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [op for op in expr.BOOL_OPS_SYMS if op not in ("and", "or")] - self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS - self.arith_ops = _good_arith_ops - self.unary_ops = "+", "-", "~" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ex1 = f"lhs {cmp1} mid {cmp2} rhs" @@ -789,11 +792,8 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): class TestEvalPythonPython(TestEvalNumexprPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "python" - cls.parser = "python" + engine = "python" + parser = "python" def check_modulus(self, lhs, arith1, rhs): ex = f"lhs {arith1} rhs" @@ -818,11 +818,8 @@ def check_alignment(self, result, nlhs, ghs, op): class TestEvalPythonPandas(TestEvalPythonPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "python" - cls.parser = "pandas" + engine = "python" + parser = "pandas" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs) @@ -871,8 +868,8 @@ def should_warn(*args): class TestAlignment: - index_types = "i", "u", "dt" - lhs_index_types = index_types + ("s",) # 'p' + index_types = ["i", "u", "dt"] + lhs_index_types = index_types + ["s"] # 'p' def test_align_nested_unary_op(self, engine, parser): s = "df * ~2" @@ -880,65 +877,73 @@ def test_align_nested_unary_op(self, engine, parser): res = pd.eval(s, engine=engine, parser=parser) tm.assert_frame_equal(res, df * ~2) - def test_basic_frame_alignment(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, self.index_types) + @pytest.mark.parametrize("lr_idx_type", lhs_index_types) + @pytest.mark.parametrize("rr_idx_type", index_types) + @pytest.mark.parametrize("c_idx_type", index_types) + def test_basic_frame_alignment( + self, engine, parser, lr_idx_type, rr_idx_type, c_idx_type + ): with warnings.catch_warnings(record=True): warnings.simplefilter("always", RuntimeWarning) - for lr_idx_type, rr_idx_type, c_idx_type in args: - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type - ) - df2 = tm.makeCustomDataframe( - 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type - ) - # only warns if not monotonic and not sortable - if should_warn(df.index, df2.index): - with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval("df + df2", engine=engine, parser=parser) - else: - res = pd.eval("df + df2", engine=engine, parser=parser) - tm.assert_frame_equal(res, df + df2) - def test_frame_comparison(self, engine, parser): - args = product(self.lhs_index_types, repeat=2) - for r_idx_type, c_idx_type in args: df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type ) - res = pd.eval("df < 2", engine=engine, parser=parser) - tm.assert_frame_equal(res, df < 2) + df2 = tm.makeCustomDataframe( + 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type + ) + # only warns if not monotonic and not sortable + if should_warn(df.index, df2.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("df + df2", engine=engine, parser=parser) + else: + res = pd.eval("df + df2", engine=engine, parser=parser) + tm.assert_frame_equal(res, df + df2) + + @pytest.mark.parametrize("r_idx_type", lhs_index_types) + @pytest.mark.parametrize("c_idx_type", lhs_index_types) + def 
test_frame_comparison(self, engine, parser, r_idx_type, c_idx_type): + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + res = pd.eval("df < 2", engine=engine, parser=parser) + tm.assert_frame_equal(res, df < 2) - df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) - res = pd.eval("df < df3", engine=engine, parser=parser) - tm.assert_frame_equal(res, df < df3) + df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) + res = pd.eval("df < df3", engine=engine, parser=parser) + tm.assert_frame_equal(res, df < df3) - @pytest.mark.slow - def test_medium_complex_frame_alignment(self, engine, parser): - args = product( - self.lhs_index_types, self.index_types, self.index_types, self.index_types - ) + @pytest.mark.parametrize("r1", lhs_index_types) + @pytest.mark.parametrize("c1", index_types) + @pytest.mark.parametrize("r2", index_types) + @pytest.mark.parametrize("c2", index_types) + def test_medium_complex_frame_alignment(self, engine, parser, r1, c1, r2, c2): with warnings.catch_warnings(record=True): warnings.simplefilter("always", RuntimeWarning) - for r1, c1, r2, c2 in args: - df = tm.makeCustomDataframe( - 3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 - ) - df2 = tm.makeCustomDataframe( - 4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 - ) - df3 = tm.makeCustomDataframe( - 5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 - ) - if should_warn(df.index, df2.index, df3.index): - with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval("df + df2 + df3", engine=engine, parser=parser) - else: + df = tm.makeCustomDataframe( + 3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 + ) + df2 = tm.makeCustomDataframe( + 4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + df3 = tm.makeCustomDataframe( + 5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + if should_warn(df.index, df2.index, df3.index): + with tm.assert_produces_warning(RuntimeWarning): res = pd.eval("df + df2 + df3", engine=engine, parser=parser) - tm.assert_frame_equal(res, df + df2 + df3) - - def test_basic_frame_series_alignment(self, engine, parser): + else: + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) + tm.assert_frame_equal(res, df + df2 + df3) + + @pytest.mark.parametrize("index_name", ["index", "columns"]) + @pytest.mark.parametrize("c_idx_type", index_types) + @pytest.mark.parametrize("r_idx_type", lhs_index_types) + def test_basic_frame_series_alignment( + self, engine, parser, index_name, r_idx_type, c_idx_type + ): def testit(r_idx_type, c_idx_type, index_name): df = tm.makeCustomDataframe( 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type @@ -958,13 +963,13 @@ def testit(r_idx_type, c_idx_type, index_name): expected = df + s tm.assert_frame_equal(res, expected) - args = product(self.lhs_index_types, self.index_types, ("index", "columns")) with warnings.catch_warnings(record=True): warnings.simplefilter("always", RuntimeWarning) - for r_idx_type, c_idx_type, index_name in args: - testit(r_idx_type, c_idx_type, index_name) - def test_basic_series_frame_alignment(self, engine, parser): + testit(r_idx_type, c_idx_type, index_name) + + @pytest.mark.parametrize("index_name", ["index", "columns"]) + def test_basic_series_frame_alignment(self, engine, parser, index_name): def testit(r_idx_type, c_idx_type, index_name): df = tm.makeCustomDataframe( 10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type @@ -984,101 +989,104 @@ def testit(r_idx_type, c_idx_type, index_name): 
tm.assert_frame_equal(res, expected) # only test dt with dt, otherwise weird joins result - args = product(["i", "u", "s"], ["i", "u", "s"], ("index", "columns")) + args = product(["i", "u", "s"], ["i", "u", "s"]) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) - for r_idx_type, c_idx_type, index_name in args: + for r_idx_type, c_idx_type in args: testit(r_idx_type, c_idx_type, index_name) # dt with dt - args = product(["dt"], ["dt"], ("index", "columns")) + args = product(["dt"], ["dt"]) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) - for r_idx_type, c_idx_type, index_name in args: + for r_idx_type, c_idx_type in args: testit(r_idx_type, c_idx_type, index_name) - def test_series_frame_commutativity(self, engine, parser): - args = product( - self.lhs_index_types, self.index_types, ("+", "*"), ("index", "columns") - ) + @pytest.mark.parametrize("c_idx_type", index_types) + @pytest.mark.parametrize("r_idx_type", lhs_index_types) + @pytest.mark.parametrize("index_name", ["index", "columns"]) + @pytest.mark.parametrize("op", ["+", "*"]) + def test_series_frame_commutativity( + self, engine, parser, index_name, op, r_idx_type, c_idx_type + ): with warnings.catch_warnings(record=True): warnings.simplefilter("always", RuntimeWarning) - for r_idx_type, c_idx_type, op, index_name in args: - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type - ) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - - lhs = f"s {op} df" - rhs = f"df {op} s" - if should_warn(df.index, s.index): - with tm.assert_produces_warning(RuntimeWarning): - a = pd.eval(lhs, engine=engine, parser=parser) - with tm.assert_produces_warning(RuntimeWarning): - b = pd.eval(rhs, engine=engine, parser=parser) - else: - a = pd.eval(lhs, engine=engine, parser=parser) - b = pd.eval(rhs, engine=engine, parser=parser) - if r_idx_type != "dt" and c_idx_type != "dt": - if engine == "numexpr": - tm.assert_frame_equal(a, b) + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) - @pytest.mark.slow - def test_complex_series_frame_alignment(self, engine, parser): + lhs = f"s {op} df" + rhs = f"df {op} s" + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + a = pd.eval(lhs, engine=engine, parser=parser) + with tm.assert_produces_warning(RuntimeWarning): + b = pd.eval(rhs, engine=engine, parser=parser) + else: + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) + + if r_idx_type != "dt" and c_idx_type != "dt": + if engine == "numexpr": + tm.assert_frame_equal(a, b) + + @pytest.mark.parametrize("r1", lhs_index_types) + @pytest.mark.parametrize("c1", index_types) + @pytest.mark.parametrize("r2", index_types) + @pytest.mark.parametrize("c2", index_types) + def test_complex_series_frame_alignment(self, engine, parser, r1, c1, r2, c2): import random - args = product( - self.lhs_index_types, self.index_types, self.index_types, self.index_types - ) n = 3 m1 = 5 m2 = 2 * m1 with warnings.catch_warnings(record=True): warnings.simplefilter("always", RuntimeWarning) - for r1, r2, c1, c2 in args: - index_name = random.choice(["index", "columns"]) - obj_name = random.choice(["df", "df2"]) - df = 
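Behind the alignment tests above sits one invariant: `pd.eval` aligns its operands the same way native pandas arithmetic does. A standalone sketch with a well-aligned frame and series:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 10))
    s = pd.Series(np.random.randn(10), index=df.index)

    res = pd.eval("df + s")
    pd.testing.assert_frame_equal(res, df + s)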
tm.makeCustomDataframe( - m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 - ) - df2 = tm.makeCustomDataframe( - m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 - ) - index = getattr(locals().get(obj_name), index_name) - ser = Series(np.random.randn(n), index[:n]) - - if r2 == "dt" or c2 == "dt": - if engine == "numexpr": - expected2 = df2.add(ser) - else: - expected2 = df2 + ser + index_name = random.choice(["index", "columns"]) + obj_name = random.choice(["df", "df2"]) + + df = tm.makeCustomDataframe( + m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 + ) + df2 = tm.makeCustomDataframe( + m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + index = getattr(locals().get(obj_name), index_name) + ser = Series(np.random.randn(n), index[:n]) + + if r2 == "dt" or c2 == "dt": + if engine == "numexpr": + expected2 = df2.add(ser) else: expected2 = df2 + ser + else: + expected2 = df2 + ser - if r1 == "dt" or c1 == "dt": - if engine == "numexpr": - expected = expected2.add(df) - else: - expected = expected2 + df + if r1 == "dt" or c1 == "dt": + if engine == "numexpr": + expected = expected2.add(df) else: expected = expected2 + df + else: + expected = expected2 + df - if should_warn(df2.index, ser.index, df.index): - with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval("df2 + ser + df", engine=engine, parser=parser) - else: + if should_warn(df2.index, ser.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): res = pd.eval("df2 + ser + df", engine=engine, parser=parser) - assert res.shape == expected.shape - tm.assert_frame_equal(res, expected) + else: + res = pd.eval("df2 + ser + df", engine=engine, parser=parser) + assert res.shape == expected.shape + tm.assert_frame_equal(res, expected) def test_performance_warning_for_poor_alignment(self, engine, parser): df = DataFrame(randn(1000, 10)) @@ -1131,15 +1139,18 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): @td.skip_if_no_ne class TestOperationsNumExprPandas: - @classmethod - def setup_class(cls): - cls.engine = "numexpr" - cls.parser = "pandas" - cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS + exclude_arith: List[str] = [] + + engine = "numexpr" + parser = "pandas" @classmethod - def teardown_class(cls): - del cls.engine, cls.parser + def setup_class(cls): + cls.arith_ops = [ + op + for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS + if op not in cls.exclude_arith + ] def eval(self, *args, **kwargs): kwargs["engine"] = self.engine @@ -1176,21 +1187,23 @@ def test_simple_arith_ops(self): ) assert y == expec - def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)): - ex = f"{lhs} {op} {rhs}" - res = self.eval(ex) - exp = eval(ex) - assert res == exp - - def test_bool_ops_with_constants(self): - for op, lhs, rhs in product( - expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False") - ): - ex = f"{lhs} {op} {rhs}" - res = self.eval(ex) - exp = eval(ex) - assert res == exp + @pytest.mark.parametrize("rhs", [True, False]) + @pytest.mark.parametrize("lhs", [True, False]) + @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS) + def test_simple_bool_ops(self, rhs, lhs, op): + ex = f"{lhs} {op} {rhs}" + res = self.eval(ex) + exp = eval(ex) + assert res == exp + + @pytest.mark.parametrize("rhs", [True, False]) + @pytest.mark.parametrize("lhs", [True, False]) + @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS) + def test_bool_ops_with_constants(self, rhs, lhs, op): + ex = f"{lhs} {op} {rhs}" + res = self.eval(ex) + exp = eval(ex) + 
assert res == exp def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) @@ -1366,7 +1379,7 @@ def assignment_not_inplace(self): def test_multi_line_expression(self): # GH 11149 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() expected["c"] = expected["a"] + expected["b"] @@ -1403,7 +1416,7 @@ def test_multi_line_expression(self): def test_multi_line_expression_not_inplace(self): # GH 11149 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() expected["c"] = expected["a"] + expected["b"] @@ -1428,7 +1441,7 @@ def test_multi_line_expression_not_inplace(self): def test_multi_line_expression_local_variable(self): # GH 15342 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() local_var = 7 @@ -1446,7 +1459,7 @@ def test_multi_line_expression_local_variable(self): def test_multi_line_expression_callable_local_variable(self): # 26426 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b @@ -1466,7 +1479,7 @@ def local_func(a, b): def test_multi_line_expression_callable_local_variable_with_kwargs(self): # 26426 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b @@ -1486,7 +1499,7 @@ def local_func(a, b): def test_assignment_in_query(self): # GH 8664 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() msg = "cannot assign without a target object" with pytest.raises(ValueError, match=msg): @@ -1495,7 +1508,7 @@ def test_assignment_in_query(self): def test_query_inplace(self): # see gh-11149 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() expected = expected[expected["a"] == 2] df.query("a == 2", inplace=True) @@ -1630,16 +1643,10 @@ def test_simple_in_ops(self): @td.skip_if_no_ne class TestOperationsNumExprPython(TestOperationsNumExprPandas): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "numexpr" - cls.parser = "python" - cls.arith_ops = [ - op - for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS - if op not in ("in", "not in") - ] + exclude_arith: List[str] = ["in", "not in"] + + engine = "numexpr" + parser = "python" def test_check_many_exprs(self): a = 1 # noqa @@ -1695,66 +1702,51 @@ def test_fails_pipe(self): with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, parser=self.parser, engine=self.engine) - def test_bool_ops_with_constants(self): - for op, lhs, rhs in product( - expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False") - ): - ex = f"{lhs} {op} {rhs}" - if op in ("and", "or"): - msg = "'BoolOp' nodes are not implemented" - with pytest.raises(NotImplementedError, match=msg): - self.eval(ex) - else: - res = self.eval(ex) - exp = eval(ex) - assert res == exp - - def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)): - ex = f"lhs {op} rhs" - if op in ("and", "or"): - msg = "'BoolOp' nodes are not implemented" - with pytest.raises(NotImplementedError, match=msg): - pd.eval(ex, engine=self.engine, parser=self.parser) - else: - res = pd.eval(ex, engine=self.engine, parser=self.parser) - exp = eval(ex) - assert res == exp + @pytest.mark.parametrize("rhs", 
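The parametrized bool-op tests reduce to a truth-table check: under the pandas parser, `and`/`or` are rewritten to bitwise ops, so plain boolean expressions agree with Python's own `eval`. A sketch:

    import itertools

    import pandas as pd

    for lhs, rhs in itertools.product([True, False], repeat=2):
        ex = f"{lhs} and {rhs}"
        assert pd.eval(ex, engine="python", parser="pandas") == eval(ex)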
[True, False]) + @pytest.mark.parametrize("lhs", [True, False]) + @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS) + def test_bool_ops_with_constants(self, lhs, rhs, op): + ex = f"{lhs} {op} {rhs}" + if op in ("and", "or"): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + self.eval(ex) + else: + res = self.eval(ex) + exp = eval(ex) + assert res == exp + + @pytest.mark.parametrize("rhs", [True, False]) + @pytest.mark.parametrize("lhs", [True, False]) + @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS) + def test_simple_bool_ops(self, lhs, rhs, op): + ex = f"lhs {op} rhs" + if op in ("and", "or"): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + res = pd.eval(ex, engine=self.engine, parser=self.parser) + exp = eval(ex) + assert res == exp class TestOperationsPythonPython(TestOperationsNumExprPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = cls.parser = "python" - cls.arith_ops = [ - op - for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS - if op not in ("in", "not in") - ] + engine = "python" + parser = "python" class TestOperationsPythonPandas(TestOperationsNumExprPandas): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "python" - cls.parser = "pandas" - cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS + exclude_arith: List[str] = [] + + engine = "python" + parser = "pandas" @td.skip_if_no_ne class TestMathPythonPython: - @classmethod - def setup_class(cls): - cls.engine = "python" - cls.parser = "pandas" - cls.unary_fns = _unary_math_ops - cls.binary_fns = _binary_math_ops - - @classmethod - def teardown_class(cls): - del cls.engine, cls.parser + engine = "python" + parser = "pandas" def eval(self, *args, **kwargs): kwargs["engine"] = self.engine @@ -1766,30 +1758,32 @@ def test_unary_functions(self, unary_fns_for_ne): df = DataFrame({"a": np.random.randn(10)}) a = df.a - for fn in unary_fns_for_ne: - expr = f"{fn}(a)" - got = self.eval(expr) - with np.errstate(all="ignore"): - expect = getattr(np, fn)(a) - tm.assert_series_equal(got, expect, check_names=False) + fn = unary_fns_for_ne - def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, unary_fns_for_ne): - for fn in ("floor", "ceil"): - msg = f'"{fn}" is not a supported function' - with pytest.raises(ValueError, match=msg): - expr = f"{fn}(100)" - self.eval(expr) + expr = f"{fn}(a)" + got = self.eval(expr) + with np.errstate(all="ignore"): + expect = getattr(np, fn)(a) + tm.assert_series_equal(got, expect, check_names=False) - def test_binary_functions(self): + @pytest.mark.parametrize("fn", ["floor", "ceil"]) + def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, fn): + msg = f'"{fn}" is not a supported function' + with pytest.raises(ValueError, match=msg): + expr = f"{fn}(100)" + self.eval(expr) + + @pytest.mark.parametrize("fn", _binary_math_ops) + def test_binary_functions(self, fn): df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) a = df.a b = df.b - for fn in self.binary_fns: - expr = f"{fn}(a, b)" - got = self.eval(expr) - with np.errstate(all="ignore"): - expect = getattr(np, fn)(a, b) - tm.assert_almost_equal(got, expect, check_names=False) + + expr = f"{fn}(a, b)" + got = self.eval(expr) + with np.errstate(all="ignore"): + expect = getattr(np, fn)(a, b) + tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): df = 
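The python-parser variants above take the other branch: `and`/`or` stay `BoolOp` nodes, which `pd.eval` declines to implement, while comparisons still evaluate. A sketch of both sides:

    import pandas as pd
    import pytest

    with pytest.raises(NotImplementedError, match="'BoolOp' nodes"):
        pd.eval("True and False", engine="python", parser="python")

    assert pd.eval("1 < 2", engine="python", parser="python")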
DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) @@ -1851,27 +1845,18 @@ def test_keyword_arg(self): class TestMathPythonPandas(TestMathPythonPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "python" - cls.parser = "pandas" + engine = "python" + parser = "pandas" class TestMathNumExprPandas(TestMathPythonPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "numexpr" - cls.parser = "pandas" + engine = "numexpr" + parser = "pandas" class TestMathNumExprPython(TestMathPythonPython): - @classmethod - def setup_class(cls): - super().setup_class() - cls.engine = "numexpr" - cls.parser = "python" + engine = "numexpr" + parser = "python" _var_s = randn(10) @@ -1925,10 +1910,10 @@ def test_invalid_parser(): @pytest.mark.parametrize("parser", _parsers) def test_disallowed_nodes(engine, parser): VisitorClass = _parsers[parser] - uns_ops = VisitorClass.unsupported_nodes inst = VisitorClass("x + 1", engine, parser) - for ops in uns_ops: + for ops in VisitorClass.unsupported_nodes: + msg = "nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): getattr(inst, ops)() @@ -1947,17 +1932,16 @@ def test_name_error_exprs(engine, parser): pd.eval(e, engine=engine, parser=parser) -def test_invalid_local_variable_reference(engine, parser): +@pytest.mark.parametrize("express", ["a + @b", "@a + b", "@a + @b"]) +def test_invalid_local_variable_reference(engine, parser, express): a, b = 1, 2 # noqa - exprs = "a + @b", "@a + b", "@a + @b" - for _expr in exprs: - if parser != "pandas": - with pytest.raises(SyntaxError, match="The '@' prefix is only"): - pd.eval(_expr, engine=engine, parser=parser) - else: - with pytest.raises(SyntaxError, match="The '@' prefix is not"): - pd.eval(_expr, engine=engine, parser=parser) + if parser != "pandas": + with pytest.raises(SyntaxError, match="The '@' prefix is only"): + pd.eval(express, engine=engine, parser=parser) + else: + with pytest.raises(SyntaxError, match="The '@' prefix is not"): + pd.eval(express, engine=engine, parser=parser) def test_numexpr_builtin_raises(engine, parser): @@ -2052,7 +2036,7 @@ def test_truediv_deprecated(engine, parser): def test_negate_lt_eq_le(engine, parser): - df = pd.DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) + df = DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) expected = df[~(df.cat > 0)] result = df.query("~(cat > 0)", engine=engine, parser=parser) @@ -2068,10 +2052,9 @@ def test_negate_lt_eq_le(engine, parser): class TestValidate: - def test_validate_bool_args(self): - invalid_values = [1, "True", [1, 2, 3], 5.0] + @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) + def test_validate_bool_args(self, value): - for value in invalid_values: - msg = 'For argument "inplace" expected type bool, received type' - with pytest.raises(ValueError, match=msg): - pd.eval("2+2", inplace=value) + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): + pd.eval("2+2", inplace=value) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a58dc5e5ec74a..a419cb0dded79 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -953,9 +953,9 @@ def test_registry_find(dtype, expected): (bool, True), (np.bool_, True), (np.array(["a", "b"]), False), - (pd.Series([1, 2]), False), + (Series([1, 2]), False), (np.array([True, False]), True), - (pd.Series([True, False]), True), + (Series([True, False]), 
True), (SparseArray([True, False]), True), (SparseDtype(bool), True), ], @@ -966,7 +966,7 @@ def test_is_bool_dtype(dtype, expected): def test_is_bool_dtype_sparse(): - result = is_bool_dtype(pd.Series(SparseArray([True, False]))) + result = is_bool_dtype(Series(SparseArray([True, False]))) assert result is True @@ -991,3 +991,10 @@ def test_is_dtype_no_warning(check): with tm.assert_produces_warning(None): check(data["A"]) + + +def test_period_dtype_compare_to_string(): + # https://github.com/pandas-dev/pandas/issues/37265 + dtype = PeriodDtype(freq="M") + assert (dtype == "period[M]") is True + assert (dtype != "period[M]") is False diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index c6c54ccb357d5..afae6ab7b9c07 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -584,7 +584,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp): def test_maybe_convert_objects_bool_nan(self): # GH32146 - ind = pd.Index([True, False, np.nan], dtype=object) + ind = Index([True, False, np.nan], dtype=object) exp = np.array([True, False, np.nan], dtype=object) out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) @@ -709,6 +709,9 @@ def test_decimals(self): result = lib.infer_dtype(arr, skipna=True) assert result == "mixed" + result = lib.infer_dtype(arr[::-1], skipna=True) + assert result == "mixed" + arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) assert result == "decimal" @@ -729,6 +732,9 @@ def test_complex(self, skipna): result = lib.infer_dtype(arr, skipna=skipna) assert result == "mixed" + result = lib.infer_dtype(arr[::-1], skipna=skipna) + assert result == "mixed" + # gets cast to complex on array construction arr = np.array([1, np.nan, 1 + 1j]) result = lib.infer_dtype(arr, skipna=skipna) @@ -1217,7 +1223,7 @@ def test_interval(self): inferred = lib.infer_dtype(idx._data, skipna=False) assert inferred == "interval" - inferred = lib.infer_dtype(pd.Series(idx), skipna=False) + inferred = lib.infer_dtype(Series(idx), skipna=False) assert inferred == "interval" @pytest.mark.parametrize("klass", [pd.array, pd.Series]) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 046b82ef3131a..e7b5d2598d8e7 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -90,8 +90,8 @@ def test_isna_isnull(self, isna_f): assert not isna_f(-np.inf) # type - assert not isna_f(type(pd.Series(dtype=object))) - assert not isna_f(type(pd.Series(dtype=np.float64))) + assert not isna_f(type(Series(dtype=object))) + assert not isna_f(type(Series(dtype=np.float64))) assert not isna_f(type(pd.DataFrame())) # series @@ -247,11 +247,11 @@ def test_datetime_other_units(self): tm.assert_numpy_array_equal(isna(values), exp) tm.assert_numpy_array_equal(notna(values), ~exp) - exp = pd.Series([False, True, False]) - s = pd.Series(values) + exp = Series([False, True, False]) + s = Series(values) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) - s = pd.Series(values, dtype=object) + s = Series(values, dtype=object) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) @@ -278,11 +278,11 @@ def test_timedelta_other_units(self): tm.assert_numpy_array_equal(isna(values), exp) tm.assert_numpy_array_equal(notna(values), ~exp) - exp = pd.Series([False, True, False]) - s = pd.Series(values) + exp = Series([False, True, False]) 
+ s = Series(values) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) - s = pd.Series(values, dtype=object) + s = Series(values, dtype=object) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) @@ -292,11 +292,11 @@ def test_period(self): tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) - exp = pd.Series([False, True, False]) - s = pd.Series(idx) + exp = Series([False, True, False]) + s = Series(idx) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) - s = pd.Series(idx, dtype=object) + s = Series(idx, dtype=object) tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 598da9c52731e..5744a893cd9d6 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -358,8 +358,8 @@ def test_apply_reduce_Series(self, float_frame): def test_apply_reduce_rows_to_dict(self): # GH 25196 - data = pd.DataFrame([[1, 2], [3, 4]]) - expected = pd.Series([{0: 1, 1: 3}, {0: 2, 1: 4}]) + data = DataFrame([[1, 2], [3, 4]]) + expected = Series([{0: 1, 1: 3}, {0: 2, 1: 4}]) result = data.apply(dict) tm.assert_series_equal(result, expected) @@ -445,7 +445,7 @@ def transform2(row): def test_apply_bug(self): # GH 6125 - positions = pd.DataFrame( + positions = DataFrame( [ [1, "ABC0", 50], [1, "YUM0", 20], @@ -619,10 +619,10 @@ def test_applymap(self, float_frame): # GH 8222 empty_frames = [ - pd.DataFrame(), - pd.DataFrame(columns=list("ABC")), - pd.DataFrame(index=list("ABC")), - pd.DataFrame({"A": [], "B": [], "C": []}), + DataFrame(), + DataFrame(columns=list("ABC")), + DataFrame(index=list("ABC")), + DataFrame({"A": [], "B": [], "C": []}), ] for frame in empty_frames: for func in [round, lambda x: x]: @@ -647,17 +647,17 @@ def test_applymap_na_ignore(self, float_frame): def test_applymap_box_timestamps(self): # GH 2689, GH 2627 - ser = pd.Series(date_range("1/1/2000", periods=10)) + ser = Series(date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) # it works! - pd.DataFrame(ser).applymap(func) + DataFrame(ser).applymap(func) def test_applymap_box(self): # ufunc will not be boxed. 
Same test cases as the test_map_box - df = pd.DataFrame( + df = DataFrame( { "a": [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")], "b": [ @@ -673,7 +673,7 @@ def test_applymap_box(self): ) result = df.applymap(lambda x: type(x).__name__) - expected = pd.DataFrame( + expected = DataFrame( { "a": ["Timestamp", "Timestamp"], "b": ["Timestamp", "Timestamp"], @@ -713,8 +713,8 @@ def test_apply_non_numpy_dtype(self): def test_apply_dup_names_multi_agg(self): # GH 21063 - df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) - expected = pd.DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -724,7 +724,7 @@ def test_apply_nested_result_axis_1(self): def apply_list(row): return [2 * row["A"], 2 * row["C"], 2 * row["B"]] - df = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) result = df.apply(apply_list, axis=1) expected = Series( [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] @@ -733,7 +733,7 @@ def apply_list(row): def test_apply_noreduction_tzaware_object(self): # https://github.com/pandas-dev/pandas/issues/31505 - df = pd.DataFrame( + df = DataFrame( {"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" ) result = df.apply(lambda x: x) @@ -744,7 +744,7 @@ def test_apply_noreduction_tzaware_object(self): def test_apply_function_runs_once(self): # https://github.com/pandas-dev/pandas/issues/30815 - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) names = [] # Save row names function is applied to def reducing_function(row): @@ -763,7 +763,7 @@ def non_reducing_function(row): def test_apply_raw_function_runs_once(self): # https://github.com/pandas-dev/pandas/issues/34506 - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to def reducing_function(row): @@ -781,7 +781,7 @@ def non_reducing_function(row): def test_applymap_function_runs_once(self): - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) values = [] # Save values function is applied to def reducing_function(val): @@ -799,8 +799,8 @@ def non_reducing_function(val): def test_apply_with_byte_string(self): # GH 34529 - df = pd.DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) - expected = pd.DataFrame( + df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) + expected = DataFrame( np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object ) # After we make the aply we exect a dataframe just @@ -812,10 +812,10 @@ def test_apply_with_byte_string(self): def test_apply_category_equalness(self, val): # Check if categorical comparisons on apply, GH 21239 df_values = ["asd", None, 12, "asd", "cde", np.NaN] - df = pd.DataFrame({"a": df_values}, dtype="category") + df = DataFrame({"a": df_values}, dtype="category") result = df.a.apply(lambda x: x == val) - expected = pd.Series( + expected = Series( [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" ) tm.assert_series_equal(result, expected) @@ -829,7 +829,7 @@ class TestInferOutputShape: def test_infer_row_shape(self): # GH 17437 # if row shape is changing, infer it - df = pd.DataFrame(np.random.rand(10, 2)) + df = DataFrame(np.random.rand(10, 2)) result = df.apply(np.fft.fft, axis=0) assert result.shape == (10, 2) @@ -954,7 
+954,7 @@ def test_infer_output_shape_listlike_columns(self): tm.assert_series_equal(result, expected) # GH 17892 - df = pd.DataFrame( + df = DataFrame( { "a": [ pd.Timestamp("2010-02-01"), @@ -1122,7 +1122,7 @@ def test_transform_and_agg_err(self, axis, float_frame): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - df = pd.DataFrame({"A": range(5), "B": 5}) + df = DataFrame({"A": range(5), "B": 5}) def f(): with np.errstate(all="ignore"): @@ -1130,7 +1130,7 @@ def f(): def test_demo(self): # demonstration tests - df = pd.DataFrame({"A": range(5), "B": 5}) + df = DataFrame({"A": range(5), "B": 5}) result = df.agg(["min", "max"]) expected = DataFrame( @@ -1149,21 +1149,21 @@ def test_demo(self): def test_agg_with_name_as_column_name(self): # GH 36212 - Column name is "name" data = {"name": ["foo", "bar"]} - df = pd.DataFrame(data) + df = DataFrame(data) # result's name should be None result = df.agg({"name": "count"}) - expected = pd.Series({"name": 2}) + expected = Series({"name": 2}) tm.assert_series_equal(result, expected) # Check if name is still preserved when aggregating series instead result = df["name"].agg({"name": "count"}) - expected = pd.Series({"name": 2}, name="name") + expected = Series({"name": 2}, name="name") tm.assert_series_equal(result, expected) def test_agg_multiple_mixed_no_warning(self): # GH 20909 - mdf = pd.DataFrame( + mdf = DataFrame( { "A": [1, 2, 3], "B": [1.0, 2.0, 3.0], @@ -1171,7 +1171,7 @@ def test_agg_multiple_mixed_no_warning(self): "D": pd.date_range("20130101", periods=3), } ) - expected = pd.DataFrame( + expected = DataFrame( { "A": [1, 6], "B": [1.0, 6.0], @@ -1197,7 +1197,7 @@ def test_agg_multiple_mixed_no_warning(self): def test_agg_dict_nested_renaming_depr(self): - df = pd.DataFrame({"A": range(5), "B": 5}) + df = DataFrame({"A": range(5), "B": 5}) # nested renaming msg = r"nested renamer is not supported" @@ -1254,7 +1254,7 @@ def test_agg_reduce(self, axis, float_frame): # dict input with lists with multiple func = dict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])]) result = float_frame.agg(func, axis=axis) - expected = DataFrame( + expected = pd.concat( dict( [ ( @@ -1278,7 +1278,8 @@ def test_agg_reduce(self, axis, float_frame): ), ), ] - ) + ), + axis=1, ) expected = expected.T if axis in {1, "columns"} else expected tm.assert_frame_equal(result, expected) @@ -1343,7 +1344,7 @@ def test_non_callable_aggregates(self): result2 = df.agg( {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} ) - expected = pd.DataFrame( + expected = DataFrame( { "A": {"count": 2, "size": 3}, "B": {"count": 2, "size": 3}, @@ -1376,7 +1377,7 @@ def func(group_col): return list(group_col.dropna().unique()) result = df.agg(func) - expected = pd.Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) result = df.agg([func]) @@ -1480,12 +1481,12 @@ def test_agg_args_kwargs(self, axis, args, kwargs): def f(x, a, b, c=3): return x.sum() + (a + b) / c - df = pd.DataFrame([[1, 2], [3, 4]]) + df = DataFrame([[1, 2], [3, 4]]) if axis == 0: - expected = pd.Series([5.0, 7.0]) + expected = Series([5.0, 7.0]) else: - expected = pd.Series([4.0, 8.0]) + expected = Series([4.0, 8.0]) result = df.agg(f, axis, *args, **kwargs) @@ -1510,11 +1511,11 @@ def test_apply_datetime_tz_issue(self): ] df = DataFrame(data=[0, 1, 2], index=timestamps) result = df.apply(lambda x: x.name, axis=1) - expected = 
pd.Series(index=timestamps, data=timestamps) + expected = Series(index=timestamps, data=timestamps) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("df", [pd.DataFrame({"A": ["a", None], "B": ["c", "d"]})]) + @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, method): # GH 16832 @@ -1528,7 +1529,7 @@ def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, meth @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) def test_apply_dtype(self, col): # GH 31466 - df = pd.DataFrame([[1.0, col]], columns=["a", "b"]) + df = DataFrame([[1.0, col]], columns=["a", "b"]) result = df.apply(lambda x: x.dtype) expected = df.dtypes @@ -1537,7 +1538,7 @@ def test_apply_dtype(self, col): def test_apply_mutating(): # GH#35462 case where applied func pins a new BlockManager to a row - df = pd.DataFrame({"a": range(100), "b": range(100, 200)}) + df = DataFrame({"a": range(100), "b": range(100, 200)}) def func(row): mgr = row._mgr @@ -1556,18 +1557,18 @@ def func(row): def test_apply_empty_list_reduce(): # GH#35683 get columns correct - df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) result = df.apply(lambda x: [], result_type="reduce") - expected = pd.Series({"a": [], "b": []}, dtype=object) + expected = Series({"a": [], "b": []}, dtype=object) tm.assert_series_equal(result, expected) def test_apply_no_suffix_index(): # GH36189 - pdf = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) + pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = pd.DataFrame( + expected = DataFrame( {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] ) @@ -1576,7 +1577,7 @@ def test_apply_no_suffix_index(): def test_apply_raw_returns_string(): # https://github.com/pandas-dev/pandas/issues/35940 - df = pd.DataFrame({"A": ["aa", "bbb"]}) + df = DataFrame({"A": ["aa", "bbb"]}) result = df.apply(lambda x: x[0], axis=1, raw=True) - expected = pd.Series(["aa", "bbb"]) + expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 1b259ddbd41dc..d141fa8682c10 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -20,7 +20,7 @@ def test_transform_ufunc(axis, float_frame): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("op", transformation_kernels) +@pytest.mark.parametrize("op", sorted(transformation_kernels)) def test_transform_groupby_kernel(axis, float_frame, op): # GH 35964 if op == "cumcount": @@ -161,7 +161,7 @@ def test_transform_reducer_raises(all_reductions): # mypy doesn't allow adding lists of different types # https://github.com/python/mypy/issues/5492 -@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +@pytest.mark.parametrize("op", [*sorted(transformation_kernels), lambda x: x + 1]) def test_transform_bad_dtype(op): # GH 35964 df = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms @@ -182,7 +182,7 @@ def test_transform_bad_dtype(op): df.transform({"A": [op]}) -@pytest.mark.parametrize("op", transformation_kernels) +@pytest.mark.parametrize("op", 
sorted(transformation_kernels)) def test_transform_partial_failure(op): # GH 35964 wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] diff --git a/pandas/tests/reshape/merge/test_pivot_old.py b/pandas/tests/frame/indexing/__init__.py similarity index 100% rename from pandas/tests/reshape/merge/test_pivot_old.py rename to pandas/tests/frame/indexing/__init__.py diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 314de5bdd8146..e37d00c540974 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -352,14 +352,6 @@ def test_assigning_ops(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) - def test_setitem_single_row_categorical(self): # GH 25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) @@ -394,14 +386,3 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) - - def test_categorical_filtering(self): - # GH22609 Verify filtering operations on DataFrames with categorical Series - df = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) - df["b"] = df.b.astype("category") - - result = df.where(df.a > 0) - expected = df.copy() - expected.loc[0, :] = np.nan - - tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py deleted file mode 100644 index 1866ac341def6..0000000000000 --- a/pandas/tests/frame/indexing/test_datetime.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -from pandas import DataFrame, Index, Series, date_range, notna -import pandas._testing as tm - - -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. 
they - # are copies) - b1 = df._mgr.blocks[1] - b2 = df._mgr.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - b1base = b1.values._data.base - b2base = b2.values._data.base - assert b1base is None or (id(b1base) != id(b2base)) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == "datetime64[ns, US/Eastern]" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) diff --git a/pandas/tests/frame/indexing/test_get_value.py b/pandas/tests/frame/indexing/test_get_value.py new file mode 100644 index 0000000000000..9a2ec975f1e31 --- /dev/null +++ b/pandas/tests/frame/indexing/test_get_value.py @@ -0,0 +1,19 @@ +import pytest + +from pandas import DataFrame, MultiIndex + + +class TestGetValue: + def test_get_set_value_no_partial_indexing(self): + # partial w/ MultiIndex raise exception + index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) + df = DataFrame(index=index, columns=range(4)) + with pytest.raises(KeyError, match=r"^0$"): + df._get_value(0, 1) + + def test_get_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: + result = float_frame._get_value(idx, col) + expected = float_frame[col][idx] + assert result == expected diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py new file mode 100644 index 0000000000000..d9cdfa5ea45ec --- /dev/null +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -0,0 +1,31 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, period_range +import pandas._testing as tm + + +class TestGetitem: + def test_getitem_unused_level_raises(self): + # GH#20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] + + def test_getitem_periodindex(self): + rng = period_range("1/1/2000", periods=5) + df = DataFrame(np.random.randn(10, 5), columns=rng) + + ts = df[rng[0]] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + # GH#1211; smoketest unrelated to the rest of this test + repr(df) + + ts = df["1/1/2000"] + tm.assert_series_equal(ts, df.iloc[:, 0]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 507d01f5b900c..d7be4d071ad74 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -6,7 +6,7 @@ from pandas._libs import iNaT -from pandas.core.dtypes.common import is_float_dtype, is_integer +from pandas.core.dtypes.common import is_integer import pandas as pd from pandas import ( @@ -75,7 +75,7 @@ def test_loc_iterable(self, float_frame, key_type): def test_loc_timedelta_0seconds(self): # GH#10583 - df = pd.DataFrame(np.random.normal(size=(10, 4))) + df = DataFrame(np.random.normal(size=(10, 4))) df.index = pd.timedelta_range(start="0s", periods=10, freq="s") expected = df.loc[pd.Timedelta("0s") :, :] result = df.loc["0s":, :] @@ -200,7 +200,7 @@ def test_setitem_list_of_tuples(self, float_frame): ( ["A", "B", "C", "D"], 7, 
- pd.DataFrame( + DataFrame( [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], columns=["A", "B", "C", "D"], ), @@ -208,7 +208,7 @@ def test_setitem_list_of_tuples(self, float_frame): ( ["C", "D"], [7, 8], - pd.DataFrame( + DataFrame( [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], columns=["A", "B", "C", "D"], ), @@ -216,14 +216,12 @@ def test_setitem_list_of_tuples(self, float_frame): ( ["A", "B", "C"], np.array([7, 8, 9], dtype=np.int64), - pd.DataFrame( - [[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"] - ), + DataFrame([[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"]), ), ( ["B", "C", "D"], [[7, 8, 9], [10, 11, 12], [13, 14, 15]], - pd.DataFrame( + DataFrame( [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], columns=["A", "B", "C", "D"], ), @@ -231,15 +229,15 @@ def test_setitem_list_of_tuples(self, float_frame): ( ["C", "A", "D"], np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), - pd.DataFrame( + DataFrame( [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], columns=["A", "B", "C", "D"], ), ), ( ["A", "C"], - pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), - pd.DataFrame( + DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + DataFrame( [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] ), ), @@ -247,7 +245,7 @@ def test_setitem_list_of_tuples(self, float_frame): ) def test_setitem_list_missing_columns(self, columns, box, expected): # GH 29334 - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) df[columns] = box tm.assert_frame_equal(df, expected) @@ -259,7 +257,7 @@ def test_setitem_multi_index(self): cols = MultiIndex.from_product(it) index = pd.date_range("20141006", periods=20) vals = np.random.randint(1, 1000, (len(index), len(cols))) - df = pd.DataFrame(vals, columns=cols, index=index) + df = DataFrame(vals, columns=cols, index=index) i, j = df.index.values.copy(), it[-1][:] @@ -277,10 +275,10 @@ def test_setitem_multi_index(self): def test_setitem_callable(self): # GH 12533 - df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}) df[lambda x: "A"] = [11, 12, 13, 14] - exp = pd.DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) + exp = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) def test_setitem_other_callable(self): @@ -288,10 +286,10 @@ def test_setitem_other_callable(self): def inc(x): return x + 1 - df = pd.DataFrame([[-1, 1], [1, -1]]) + df = DataFrame([[-1, 1], [1, -1]]) df[df > 0] = inc - expected = pd.DataFrame([[-1, inc], [inc, -1]]) + expected = DataFrame([[-1, inc], [inc, -1]]) tm.assert_frame_equal(df, expected) def test_getitem_boolean( @@ -440,7 +438,7 @@ def test_getitem_ix_mixed_integer(self): tm.assert_frame_equal(result, expected) # 11320 - df = pd.DataFrame( + df = DataFrame( { "rna": (1.5, 2.2, 3.2, 4.5), -1000: [11, 21, 36, 40], @@ -782,7 +780,7 @@ def test_setitem_None(self, float_frame): def test_setitem_empty(self): # GH 9596 - df = pd.DataFrame( + df = DataFrame( {"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]} ) @@ -803,10 +801,10 @@ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): def test_setitem_with_empty_listlike(self): # GH #17101 - index = pd.Index([], name="idx") - result = pd.DataFrame(columns=["A"], index=index) + index = Index([], name="idx") + result = DataFrame(columns=["A"], index=index) result["A"] = [] - expected = pd.DataFrame(columns=["A"], index=index) + 
expected = DataFrame(columns=["A"], index=index) tm.assert_index_equal(result.index, expected.index) def test_setitem_scalars_no_index(self): @@ -819,7 +817,7 @@ def test_setitem_scalars_no_index(self): def test_getitem_empty_frame_with_boolean(self): # Test for issue #11859 - df = pd.DataFrame() + df = DataFrame() df2 = df[df > 0] tm.assert_frame_equal(df, df2) @@ -887,11 +885,11 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): def test_setitem_slice_position(self): # GH#31469 - df = pd.DataFrame(np.zeros((100, 1))) + df = DataFrame(np.zeros((100, 1))) df[-4:] = 1 arr = np.zeros((100, 1)) arr[-4:] = 1 - expected = pd.DataFrame(arr) + expected = DataFrame(arr) tm.assert_frame_equal(df, expected) def test_getitem_setitem_non_ix_labels(self): @@ -1190,7 +1188,7 @@ def test_setitem_mixed_datetime(self): ], } ) - df = pd.DataFrame(0, columns=list("ab"), index=range(6)) + df = DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT df.loc[0, "b"] = datetime(2012, 1, 1) df.loc[1, "b"] = 1 @@ -1329,195 +1327,6 @@ def test_getitem_list_duplicates(self): expected = df.iloc[:, 2:] tm.assert_frame_equal(result, expected) - def test_get_value(self, float_frame): - for idx in float_frame.index: - for col in float_frame.columns: - result = float_frame._get_value(idx, col) - expected = float_frame[col][idx] - assert result == expected - - def test_lookup_float(self, float_frame): - df = float_frame - rows = list(df.index) * len(df.columns) - cols = list(df.columns) * len(df.index) - with tm.assert_produces_warning(FutureWarning): - result = df.lookup(rows, cols) - - expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) - tm.assert_numpy_array_equal(result, expected) - - def test_lookup_mixed(self, float_string_frame): - df = float_string_frame - rows = list(df.index) * len(df.columns) - cols = list(df.columns) * len(df.index) - with tm.assert_produces_warning(FutureWarning): - result = df.lookup(rows, cols) - - expected = np.array( - [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ - ) - tm.assert_almost_equal(result, expected) - - def test_lookup_bool(self): - df = DataFrame( - { - "label": ["a", "b", "a", "c"], - "mask_a": [True, True, False, True], - "mask_b": [True, False, False, False], - "mask_c": [False, True, False, True], - } - ) - with tm.assert_produces_warning(FutureWarning): - df["mask"] = df.lookup(df.index, "mask_" + df["label"]) - - exp_mask = np.array( - [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] - ) - - tm.assert_series_equal(df["mask"], pd.Series(exp_mask, name="mask")) - assert df["mask"].dtype == np.bool_ - - def test_lookup_raises(self, float_frame): - with pytest.raises(KeyError, match="'One or more row labels was not found'"): - with tm.assert_produces_warning(FutureWarning): - float_frame.lookup(["xyz"], ["A"]) - - with pytest.raises(KeyError, match="'One or more column labels was not found'"): - with tm.assert_produces_warning(FutureWarning): - float_frame.lookup([float_frame.index[0]], ["xyz"]) - - with pytest.raises(ValueError, match="same size"): - with tm.assert_produces_warning(FutureWarning): - float_frame.lookup(["a", "b", "c"], ["a"]) - - def test_lookup_requires_unique_axes(self): - # GH#33041 raise with a helpful error message - df = pd.DataFrame(np.random.randn(6).reshape(3, 2), columns=["A", "A"]) - - rows = [0, 1] - cols = ["A", "A"] - - # homogeneous-dtype case - with pytest.raises(ValueError, match="requires unique index and columns"): - with 
tm.assert_produces_warning(FutureWarning): - df.lookup(rows, cols) - with pytest.raises(ValueError, match="requires unique index and columns"): - with tm.assert_produces_warning(FutureWarning): - df.T.lookup(cols, rows) - - # heterogeneous dtype - df["B"] = 0 - with pytest.raises(ValueError, match="requires unique index and columns"): - with tm.assert_produces_warning(FutureWarning): - df.lookup(rows, cols) - - def test_set_value(self, float_frame): - for idx in float_frame.index: - for col in float_frame.columns: - float_frame._set_value(idx, col, 1) - assert float_frame[col][idx] == 1 - - def test_set_value_resize(self, float_frame): - - res = float_frame._set_value("foobar", "B", 0) - assert res is None - assert float_frame.index[-1] == "foobar" - assert float_frame._get_value("foobar", "B") == 0 - - float_frame.loc["foobar", "qux"] = 0 - assert float_frame._get_value("foobar", "qux") == 0 - - res = float_frame.copy() - res._set_value("foobar", "baz", "sam") - assert res["baz"].dtype == np.object_ - - res = float_frame.copy() - res._set_value("foobar", "baz", True) - assert res["baz"].dtype == np.object_ - - res = float_frame.copy() - res._set_value("foobar", "baz", 5) - assert is_float_dtype(res["baz"]) - assert isna(res["baz"].drop(["foobar"])).all() - msg = "could not convert string to float: 'sam'" - with pytest.raises(ValueError, match=msg): - res._set_value("foobar", "baz", "sam") - - def test_reindex_with_multi_index(self): - # https://github.com/pandas-dev/pandas/issues/29896 - # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - # - # confirms that we can reindex a multi-indexed DataFrame with a new - # MultiIndex object correctly when using no filling, backfilling, and - # padding - # - # The DataFrame, `df`, used in this test is: - # c - # a b - # -1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 0 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # - # and the other MultiIndex, `new_multi_index`, is: - # 0: 0 0.5 - # 1: 2.0 - # 2: 5.0 - # 3: 5.8 - df = pd.DataFrame( - { - "a": [-1] * 7 + [0] * 7 + [1] * 7, - "b": list(range(7)) * 3, - "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, - } - ).set_index(["a", "b"]) - new_index = [0.5, 2.0, 5.0, 5.8] - new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - - # reindexing w/o a `method` value - reindexed = df.reindex(new_multi_index) - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} - ).set_index(["a", "b"]) - tm.assert_frame_equal(expected, reindexed) - - # reindexing with backfilling - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} - ).set_index(["a", "b"]) - reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - # reindexing with padding - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} - ).set_index(["a", "b"]) - reindexed_with_padding = df.reindex(new_multi_index, method="pad") - tm.assert_frame_equal(expected, reindexed_with_padding) - - reindexed_with_padding = df.reindex(new_multi_index, method="ffill") - tm.assert_frame_equal(expected, reindexed_with_padding) - def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), 
index=range(3), columns=list("ABC")) @@ -1544,13 +1353,6 @@ def test_set_value_with_index_dtype_change(self): assert list(df.index) == list(df_orig.index) + ["C"] assert list(df.columns) == list(df_orig.columns) + ["D"] - def test_get_set_value_no_partial_indexing(self): - # partial w/ MultiIndex raise exception - index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) - df = DataFrame(index=index, columns=range(4)) - with pytest.raises(KeyError, match=r"^0$"): - df._get_value(0, 1) - # TODO: rename? remove? def test_single_element_ix_dont_upcast(self, float_frame): float_frame["E"] = 1 @@ -1560,7 +1362,7 @@ def test_single_element_ix_dont_upcast(self, float_frame): assert is_integer(result) # GH 11617 - df = pd.DataFrame(dict(a=[1.23])) + df = DataFrame(dict(a=[1.23])) df["b"] = 666 result = df.loc[0, "b"] @@ -1660,233 +1462,28 @@ def test_loc_duplicates(self): trange = trange.insert(loc=5, item=pd.Timestamp(year=2017, month=1, day=5)) - df = pd.DataFrame(0, index=trange, columns=["A", "B"]) + df = DataFrame(0, index=trange, columns=["A", "B"]) bool_idx = np.array([False, False, False, False, False, True]) # assignment df.loc[trange[bool_idx], "A"] = 6 - expected = pd.DataFrame( + expected = DataFrame( {"A": [0, 0, 0, 0, 6, 6], "B": [0, 0, 0, 0, 0, 0]}, index=trange ) tm.assert_frame_equal(df, expected) # in-place - df = pd.DataFrame(0, index=trange, columns=["A", "B"]) + df = DataFrame(0, index=trange, columns=["A", "B"]) df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "method,expected_values", - [ - ("nearest", [0, 1, 1, 2]), - ("pad", [np.nan, 0, 1, 1]), - ("backfill", [0, 1, 2, 2]), - ], - ) - def test_reindex_methods(self, method, expected_values): - df = pd.DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = pd.DataFrame({"x": expected_values}, index=target) - actual = df.reindex(target, method=method) - tm.assert_frame_equal(expected, actual) - - actual = df.reindex(target, method=method, tolerance=1) - tm.assert_frame_equal(expected, actual) - actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) - tm.assert_frame_equal(expected, actual) - - e2 = expected[::-1] - actual = df.reindex(target[::-1], method=method) - tm.assert_frame_equal(e2, actual) - - new_order = [3, 0, 2, 1] - e2 = expected.iloc[new_order] - actual = df.reindex(target[new_order], method=method) - tm.assert_frame_equal(e2, actual) - - switched_method = ( - "pad" if method == "backfill" else "backfill" if method == "pad" else method - ) - actual = df[::-1].reindex(target, method=switched_method) - tm.assert_frame_equal(expected, actual) - - def test_reindex_methods_nearest_special(self): - df = pd.DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = pd.DataFrame({"x": [0, 1, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=0.2) - tm.assert_frame_equal(expected, actual) - - expected = pd.DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz(self, tz_aware_fixture): - # GH26683 - tz = tz_aware_fixture - idx = pd.date_range("2019-01-01", periods=5, tz=tz) - df = pd.DataFrame({"x": list(range(5))}, index=idx) - - expected = df.head(3) - actual = df.reindex(idx[:3], method="nearest") - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz_empty_frame(self): - # 
https://github.com/pandas-dev/pandas/issues/31964 - dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) - df = pd.DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) - expected = pd.DataFrame(index=dti) - result = df.reindex(dti, method="nearest") - tm.assert_frame_equal(result, expected) - - def test_reindex_frame_add_nat(self): - rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") - df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) - - result = df.reindex(range(15)) - assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) - - mask = com.isna(result)["B"] - assert mask[-5:].all() - assert not mask[:-5].any() - - def test_reindex_limit(self): - # GH 28631 - data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] - exp_data = [ - ["A", "A", "A"], - ["B", "B", "B"], - ["C", "C", "C"], - ["D", "D", "D"], - ["D", "D", "D"], - [np.nan, np.nan, np.nan], - ] - df = DataFrame(data) - result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) - expected = DataFrame(exp_data) - tm.assert_frame_equal(result, expected) - def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) assert x[0].dtype == np.dtype("M8[ns]") - def test_non_monotonic_reindex_methods(self): - dr = pd.date_range("2013-08-01", periods=6, freq="B") - data = np.random.randn(6, 1) - df = pd.DataFrame(data, index=dr, columns=list("A")) - df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) - # index is not monotonic increasing or decreasing - msg = "index must be monotonic increasing or decreasing" - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="pad") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="ffill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="bfill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="nearest") - - def test_reindex_level(self): - from itertools import permutations - - icol = ["jim", "joe", "jolie"] - - def verify_first_level(df, level, idx, check_index_type=True): - def f(val): - return np.nonzero((df[level] == val).to_numpy())[0] - - i = np.concatenate(list(map(f, idx))) - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[i].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - def verify(df, level, idx, indexer, check_index_type=True): - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[indexer].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - df = pd.DataFrame( - { - "jim": list("B" * 4 + "A" * 2 + "C" * 3), - "joe": list("abcdeabcd")[::-1], - "jolie": [10, 20, 30] * 3, - "joline": np.random.randint(0, 1000, 9), - } - ) - - target = [ - ["C", "B", "A"], - ["F", "C", "A", "D"], - ["A"], - ["A", "B", "C"], - ["C", "A", "B"], - ["C", "B"], - ["C", "A"], - ["A", "B"], - ["B", "A", "C"], - ] - - for idx in target: - verify_first_level(df, "jim", idx) - - # reindex by these causes different MultiIndex levels - for idx in [["D", "F"], ["A", "C", "B"]]: - verify_first_level(df, "jim", idx, check_index_type=False) - - verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) - verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) - verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) - verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 
4, 5, 8, 7, 6]) - verify(df, "joe", list("edwq"), [0, 4, 5]) - verify(df, "joe", list("wq"), [], check_index_type=False) - + def test_iloc_getitem_float_duplicates(self): df = DataFrame( - { - "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, - "joe": ["3rd"] * 2 - + ["1st"] * 3 - + ["2nd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["2nd"] * 2, - # this needs to be jointly unique with jim and joe or - # reindexing will fail ~1.5% of the time, this works - # out to needing unique groups of same size as joe - "jolie": np.concatenate( - [ - np.random.choice(1000, x, replace=False) - for x in [2, 3, 3, 2, 3, 2, 3, 2] - ] - ), - "joline": np.random.randn(20).round(3) * 10, - } - ) - - for idx in permutations(df["jim"].unique()): - for i in range(3): - verify_first_level(df, "jim", idx[: i + 1]) - - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] - verify(df, "joe", ["1st", "2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, "joe", ["3rd", "2nd", "1st"], i) - - i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, "joe", ["2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, "joe", ["3rd", "1st"], i) - - def test_getitem_ix_float_duplicates(self): - df = pd.DataFrame( np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") ) expect = df.iloc[1:] @@ -1902,7 +1499,7 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[1:, 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) - df = pd.DataFrame( + df = DataFrame( np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list("abc") ) expect = df.iloc[1:-1] @@ -1922,27 +1519,25 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. 
# Make sure timezone isn't lost - column = pd.Series( - pd.date_range("2015-01-01", periods=3, tz="utc"), name="dates" - ) - df = pd.DataFrame({"dates": column}) + column = Series(pd.date_range("2015-01-01", periods=3, tz="utc"), name="dates") + df = DataFrame({"dates": column}) df["dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - df = pd.DataFrame({"dates": column}) + df = DataFrame({"dates": column}) df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - def test_setitem_datetime_coercion(self): + def test_loc_setitem_datetime_coercion(self): # gh-1048 - df = pd.DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) + df = DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert pd.Timestamp("2008-08-08") == df.loc[0, "c"] assert pd.Timestamp("2008-08-08") == df.loc[1, "c"] df.loc[2, "c"] = date(2005, 5, 5) assert pd.Timestamp("2005-05-05") == df.loc[2, "c"] - def test_setitem_datetimelike_with_inference(self): + def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1965,7 +1560,7 @@ def test_setitem_datetimelike_with_inference(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("idxer", ["var", ["var"]]) - def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): + def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH 11365 tz = tz_naive_fixture idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) @@ -2020,7 +1615,7 @@ def test_at_time_between_time_datetimeindex(self): result.loc[bkey] = df.iloc[binds] tm.assert_frame_equal(result, df) - def test_index_namedtuple(self): + def test_loc_getitem_index_namedtuple(self): from collections import namedtuple IndexType = namedtuple("IndexType", ["a", "b"]) @@ -2033,17 +1628,17 @@ def test_index_namedtuple(self): assert result == 1 @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) - def test_index_single_double_tuples(self, tpl): + def test_loc_getitem_index_single_double_tuples(self, tpl): # GH 20991 - idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) + idx = Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) df = DataFrame(index=idx) result = df.loc[[tpl]] - idx = pd.Index([tpl], name="A", tupleize_cols=False) + idx = Index([tpl], name="A", tupleize_cols=False) expected = DataFrame(index=idx) tm.assert_frame_equal(result, expected) - def test_boolean_indexing(self): + def test_setitem_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] df1 = DataFrame( @@ -2066,7 +1661,7 @@ def test_boolean_indexing(self): with pytest.raises(ValueError, match="Item wrong length"): df1[df1.index[:-1] > 2] = -1 - def test_boolean_indexing_mixed(self): + def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, @@ -2139,10 +1734,10 @@ def test_type_error_multiindex(self): result = dg["x", 0] tm.assert_series_equal(result, expected) - def test_interval_index(self): + def test_loc_getitem_interval_index(self): # GH 19977 index = pd.interval_range(start=0, periods=3) - df = pd.DataFrame( + df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=index, columns=["A", "B", "C"] ) @@ -2151,15 +1746,29 @@ def test_interval_index(self): tm.assert_almost_equal(result, expected) index = pd.interval_range(start=0, periods=3, closed="both") - df = pd.DataFrame( + df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 
index=index, columns=["A", "B", "C"] ) index_exp = pd.interval_range(start=0, periods=2, freq=1, closed="both") - expected = pd.Series([1, 4], index=index_exp, name="A") + expected = Series([1, 4], index=index_exp, name="A") result = df.loc[1, "A"] tm.assert_series_equal(result, expected) + def test_getitem_interval_index_partial_indexing(self): + # GH#36490 + df = DataFrame( + np.ones((3, 4)), columns=pd.IntervalIndex.from_breaks(np.arange(5)) + ) + + expected = df.iloc[:, 0] + + res = df[0.5] + tm.assert_series_equal(res, expected) + + res = df.loc[:, 0.5] + tm.assert_series_equal(res, expected) + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -2191,22 +1800,10 @@ def test_setitem(self, uint64_frame): ), ) - def test_set_reset(self): - - idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == np.dtype("uint64") - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack - df = pd.DataFrame( + df = DataFrame( { "A": [1, 2], "B": pd.date_range("2000", periods=2), @@ -2241,12 +1838,3 @@ def test_object_casting_indexing_wraps_datetimelike(): assert blk.dtype == "m8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timedelta) - - -def test_lookup_deprecated(): - # GH18262 - df = pd.DataFrame( - {"col": ["A", "A", "B", "B"], "A": [80, 23, np.nan, 22], "B": [80, 55, 76, 67]} - ) - with tm.assert_produces_warning(FutureWarning): - df.lookup(df.index, df["col"]) diff --git a/pandas/tests/frame/indexing/test_lookup.py b/pandas/tests/frame/indexing/test_lookup.py new file mode 100644 index 0000000000000..21d732695fba4 --- /dev/null +++ b/pandas/tests/frame/indexing/test_lookup.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestLookup: + def test_lookup_float(self, float_frame): + df = float_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + with tm.assert_produces_warning(FutureWarning): + result = df.lookup(rows, cols) + + expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) + tm.assert_numpy_array_equal(result, expected) + + def test_lookup_mixed(self, float_string_frame): + df = float_string_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + with tm.assert_produces_warning(FutureWarning): + result = df.lookup(rows, cols) + + expected = np.array( + [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ + ) + tm.assert_almost_equal(result, expected) + + def test_lookup_bool(self): + df = DataFrame( + { + "label": ["a", "b", "a", "c"], + "mask_a": [True, True, False, True], + "mask_b": [True, False, False, False], + "mask_c": [False, True, False, True], + } + ) + with tm.assert_produces_warning(FutureWarning): + df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + + exp_mask = np.array( + [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] + ) + + tm.assert_series_equal(df["mask"], Series(exp_mask, name="mask")) + assert df["mask"].dtype == np.bool_ + + def test_lookup_raises(self, float_frame): + with pytest.raises(KeyError, match="'One or more row labels was not found'"): + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup(["xyz"], ["A"]) + + with 
pytest.raises(KeyError, match="'One or more column labels was not found'"): + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup([float_frame.index[0]], ["xyz"]) + + with pytest.raises(ValueError, match="same size"): + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup(["a", "b", "c"], ["a"]) + + def test_lookup_requires_unique_axes(self): + # GH#33041 raise with a helpful error message + df = DataFrame(np.random.randn(6).reshape(3, 2), columns=["A", "A"]) + + rows = [0, 1] + cols = ["A", "A"] + + # homogeneous-dtype case + with pytest.raises(ValueError, match="requires unique index and columns"): + with tm.assert_produces_warning(FutureWarning): + df.lookup(rows, cols) + with pytest.raises(ValueError, match="requires unique index and columns"): + with tm.assert_produces_warning(FutureWarning): + df.T.lookup(cols, rows) + + # heterogeneous dtype + df["B"] = 0 + with pytest.raises(ValueError, match="requires unique index and columns"): + with tm.assert_produces_warning(FutureWarning): + df.lookup(rows, cols) + + +def test_lookup_deprecated(): + # GH#18262 + df = DataFrame( + {"col": ["A", "A", "B", "B"], "A": [80, 23, np.nan, 22], "B": [80, 55, 76, 67]} + ) + with tm.assert_produces_warning(FutureWarning): + df.lookup(df.index, df["col"]) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py new file mode 100644 index 0000000000000..484e2d544197e --- /dev/null +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_float_dtype + +from pandas import isna + + +class TestSetValue: + def test_set_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: + float_frame._set_value(idx, col, 1) + assert float_frame[col][idx] == 1 + + def test_set_value_resize(self, float_frame): + + res = float_frame._set_value("foobar", "B", 0) + assert res is None + assert float_frame.index[-1] == "foobar" + assert float_frame._get_value("foobar", "B") == 0 + + float_frame.loc["foobar", "qux"] = 0 + assert float_frame._get_value("foobar", "qux") == 0 + + res = float_frame.copy() + res._set_value("foobar", "baz", "sam") + assert res["baz"].dtype == np.object_ + + res = float_frame.copy() + res._set_value("foobar", "baz", True) + assert res["baz"].dtype == np.object_ + + res = float_frame.copy() + res._set_value("foobar", "baz", 5) + assert is_float_dtype(res["baz"]) + assert isna(res["baz"].drop(["foobar"])).all() + msg = "could not convert string to float: 'sam'" + with pytest.raises(ValueError, match=msg): + res._set_value("foobar", "baz", "sam") diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 8313ab0b99bac..55465dffd2027 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -8,10 +8,14 @@ DataFrame, Index, Interval, + NaT, Period, + PeriodIndex, Series, Timestamp, date_range, + notna, + period_range, ) import pandas._testing as tm from pandas.core.arrays import SparseArray @@ -180,3 +184,57 @@ def test_setitem_extension_types(self, obj, dtype): df["obj"] = obj tm.assert_frame_equal(df, expected) + + def test_setitem_dt64tz(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del 
df["D"] + + # assert that A & C are not sharing the same base (e.g. they + # are copies) + b1 = df._mgr.blocks[1] + b2 = df._mgr.blocks[2] + tm.assert_extension_array_equal(b1.values, b2.values) + b1base = b1.values._data.base + b2base = b2.values._data.base + assert b1base is None or (id(b1base) != id(b2base)) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = NaT + df2.iloc[1, 2] = NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) + + def test_setitem_periodindex(self): + rng = period_range("1/1/2000", periods=5, name="index") + df = DataFrame(np.random.randn(5, 3), index=rng) + + df["Index"] = rng + rs = Index(df["Index"]) + tm.assert_index_equal(rs, rng, check_names=False) + assert rs.name == "Index" + assert rng.name == "index" + + rs = df.reset_index().set_index("index") + assert isinstance(rs.index, PeriodIndex) + tm.assert_index_equal(rs.index, rng) + + @pytest.mark.parametrize("klass", [list, np.array]) + def test_iloc_setitem_bool_indexer(self, klass): + # GH: 36741 + df = DataFrame({"flag": ["x", "y", "z"], "value": [1, 3, 4]}) + indexer = klass([True, False, False]) + df.iloc[indexer, 1] = df.iloc[indexer, 1] * 2 + expected = DataFrame({"flag": ["x", "y", "z"], "value": [2, 3, 4]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 04e1c8b94c4d9..c0cd7faafb4db 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -27,7 +27,7 @@ def test_getitem_sparse_column(self): @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) @td.skip_if_no_scipy - def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): + def test_loc_getitem_from_spmatrix(self, spmatrix_t, dtype): import scipy.sparse spmatrix_t = getattr(scipy.sparse, spmatrix_t) @@ -50,21 +50,6 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) - def test_reindex(self): - # https://github.com/pandas-dev/pandas/issues/35286 - df = pd.DataFrame( - {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} - ) - result = df.reindex([0, 2]) - expected = pd.DataFrame( - { - "A": [0.0, np.nan], - "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), - }, - index=[0, 2], - ) - tm.assert_frame_equal(result, expected) - def test_all_sparse(self): df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))}) result = df.loc[[0, 1]] diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index d114a3178b686..3495247585236 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -399,7 +399,7 @@ def test_where_none(self): def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): # see gh-21947 - df = pd.DataFrame(columns=["a"]) + df = DataFrame(columns=["a"]) cond = df assert (cond.dtypes == object).all() @@ -642,3 +642,14 @@ def test_df_where_with_category(self, kwargs): expected = Series(A, name="A") tm.assert_series_equal(result, expected) + + def test_where_categorical_filtering(self): + # GH#22609 Verify filtering operations on DataFrames with categorical Series + df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) + df["b"] = 
df["b"].astype("category") + + result = df.where(df["a"] > 0) + expected = df.copy() + expected.loc[0, :] = np.nan + + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 71b40585f0c2f..11e076f313540 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,13 +3,30 @@ import numpy as np import pytest -import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat import pandas._testing as tm +import pandas.core.common as com from pandas.tseries.offsets import BDay +@pytest.fixture +def four_level_index_dataframe(): + arr = np.array( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], + ] + ) + index = MultiIndex( + levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], + codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], + names=["one", "two", "three", "four"], + ) + return DataFrame(arr, index=index, columns=list("ABCDE")) + + class TestXS: def test_xs(self, float_frame, datetime_frame): idx = float_frame.index[5] @@ -53,13 +70,13 @@ def test_xs_corner(self): df["E"] = 3.0 xs = df.xs(0) - exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) + exp = Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) tm.assert_series_equal(xs, exp) # no columns but Index(dtype=object) df = DataFrame(index=["a", "b", "c"]) result = df.xs("a") - expected = Series([], name="a", index=pd.Index([]), dtype=np.float64) + expected = Series([], name="a", index=Index([]), dtype=np.float64) tm.assert_series_equal(result, expected) def test_xs_duplicates(self): @@ -93,3 +110,190 @@ def test_xs_view(self): dm.xs(2)[:] = 10 assert (dm.xs(2) == 10).all() + + +class TestXSWithMultiIndex: + def test_xs_integer_key(self): + # see GH#2107 + dates = range(20111201, 20111205) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) + df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) + + result = df.xs(20111201, level="date") + expected = df.loc[20111201, :] + tm.assert_frame_equal(result, expected) + + def test_xs_level(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs("two", level="second") + expected = df[df.index.get_level_values(1) == "two"] + expected.index = Index(["foo", "bar", "baz", "qux"], name="first") + tm.assert_frame_equal(result, expected) + + def test_xs_level_eq_2(self): + arr = np.random.randn(3, 5) + index = MultiIndex( + levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], + codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], + ) + df = DataFrame(arr, index=index) + expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) + result = df.xs("c", level=2) + tm.assert_frame_equal(result, expected) + + def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): + # this is a copy in 0.14 + df = multiindex_dataframe_random_data + result = df.xs("two", level="second") + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + + def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): + # this is a copy in 0.14 + df = four_level_index_dataframe + result = 
df.xs(("a", 4), level=["one", "four"]) + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + + @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) + def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): + # see GH#13719 + frame = multiindex_dataframe_random_data + df = concat([frame] * 2) + assert df.index.is_unique is False + expected = concat([frame.xs("one", level="second")] * 2) + + result = df.xs(key, level=level) + tm.assert_frame_equal(result, expected) + + def test_xs_missing_values_in_index(self): + # see GH#6574 + # missing values in returned index should be preserved + acc = [ + ("a", "abcde", 1), + ("b", "bbcde", 2), + ("y", "yzcde", 25), + ("z", "xbcde", 24), + ("z", None, 26), + ("z", "zbcde", 25), + ("z", "ybcde", 26), + ] + df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) + expected = DataFrame( + {"cnt": [24, 26, 25, 26]}, + index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), + ) + + result = df.xs("z", level="a1") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "key, level, exp_arr, exp_index", + [ + ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), + ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), + ], + ) + def test_xs_named_levels_axis_eq_1(self, key, level, exp_arr, exp_index): + # see GH#2903 + arr = np.random.randn(4, 4) + index = MultiIndex( + levels=[["a", "b"], ["bar", "foo", "hello", "world"]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + names=["lvl0", "lvl1"], + ) + df = DataFrame(arr, columns=index) + result = df.xs(key, level=level, axis=1) + expected = DataFrame(exp_arr(arr), columns=exp_index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer", + [ + lambda df: df.xs(("a", 4), level=["one", "four"]), + lambda df: df.xs("a").xs(4, level="four"), + ], + ) + def test_xs_level_multiple(self, indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] + expected_index = MultiIndex( + levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] + ) + expected = DataFrame( + expected_values, index=expected_index, columns=list("ABCDE") + ) + result = indexer(df) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")] + ) + def test_xs_level0(self, indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + ] + expected_index = MultiIndex( + levels=[["b", "q"], [10.0032, 20.0], [4, 5]], + codes=[[0, 1], [0, 1], [1, 0]], + names=["two", "three", "four"], + ) + expected = DataFrame( + expected_values, index=expected_index, columns=list("ABCDE") + ) + + result = indexer(df) + tm.assert_frame_equal(result, expected) + + def test_xs_values(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")).values + expected = df.values[4] + tm.assert_almost_equal(result, expected) + + def test_xs_loc_equality(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")) + expected = df.loc[("bar", "two")] + 
tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("klass", [DataFrame, Series])
+    def test_xs_IndexSlice_argument_not_implemented(self, klass):
+        # GH#35301
+
+        index = MultiIndex(
+            levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
+            codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+        )
+
+        obj = DataFrame(np.random.randn(6, 4), index=index)
+        if klass is Series:
+            obj = obj[0]
+
+        msg = (
+            "Expected label or tuple of labels, got "
+            r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)"
+        )
+        with pytest.raises(TypeError, match=msg):
+            obj.xs(IndexSlice[("foo", "qux", 0), :])
+
+    @pytest.mark.parametrize("klass", [DataFrame, Series])
+    def test_xs_levels_raises(self, klass):
+        obj = DataFrame({"A": [1, 2, 3]})
+        if klass is Series:
+            obj = obj["A"]
+
+        msg = "Index must be a MultiIndex"
+        with pytest.raises(TypeError, match=msg):
+            obj.xs(0, level="as")
diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
index d19b59debfdea..5dd4f7f8f8800 100644
--- a/pandas/tests/frame/methods/test_align.py
+++ b/pandas/tests/frame/methods/test_align.py
@@ -1,12 +1,39 @@
 import numpy as np
 import pytest
+import pytz
 
 import pandas as pd
-from pandas import DataFrame, Index, Series
+from pandas import DataFrame, Index, Series, date_range
 import pandas._testing as tm
 
 
 class TestDataFrameAlign:
+    def test_frame_align_aware(self):
+        idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
+        idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
+        df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
+        df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
+        new1, new2 = df1.align(df2)
+        assert df1.index.tz == new1.index.tz
+        assert df2.index.tz == new2.index.tz
+
+        # different timezones convert to UTC
+
+        # frame with frame
+        df1_central = df1.tz_convert("US/Central")
+        new1, new2 = df1.align(df1_central)
+        assert new1.index.tz == pytz.UTC
+        assert new2.index.tz == pytz.UTC
+
+        # frame with Series
+        new1, new2 = df1.align(df1_central[0], axis=0)
+        assert new1.index.tz == pytz.UTC
+        assert new2.index.tz == pytz.UTC
+
+        # Series with frame; the result must be assigned, otherwise the
+        # asserts below re-check the previous alignment
+        new1, new2 = df1[0].align(df1_central, axis=0)
+        assert new1.index.tz == pytz.UTC
+        assert new2.index.tz == pytz.UTC
+
     def test_align_float(self, float_frame):
         af, bf = float_frame.align(float_frame)
         assert af._mgr is not float_frame._mgr
@@ -133,8 +160,8 @@ def test_align_mixed_int(self, mixed_int_frame):
         "l_ordered,r_ordered,expected",
         [
             [True, True, pd.CategoricalIndex],
-            [True, False, pd.Index],
-            [False, True, pd.Index],
+            [True, False, Index],
+            [False, True, Index],
             [False, False, pd.CategoricalIndex],
         ],
     )
@@ -169,9 +196,9 @@ def test_align_multiindex(self):
         midx = pd.MultiIndex.from_product(
             [range(2), range(3), range(2)], names=("a", "b", "c")
         )
-        idx = pd.Index(range(2), name="b")
-        df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx)
-        df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx)
+        idx = Index(range(2), name="b")
+        df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
+        df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)
 
         # these must be the same results (but flipped)
         res1l, res1r = df1.align(df2, join="left")
@@ -180,7 +207,7 @@
         expl = df1
         tm.assert_frame_equal(expl, res1l)
         tm.assert_frame_equal(expl, res2r)
-        expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
+        expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
         tm.assert_frame_equal(expr, res1r)
         tm.assert_frame_equal(expr, res2l)
@@ -190,24 +217,24 @@ def
test_align_multiindex(self): exp_idx = pd.MultiIndex.from_product( [range(2), range(2), range(2)], names=("a", "b", "c") ) - expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) tm.assert_frame_equal(expl, res1l) tm.assert_frame_equal(expl, res2r) - expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) + expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx) tm.assert_frame_equal(expr, res1r) tm.assert_frame_equal(expr, res2l) def test_align_series_combinations(self): - df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) - s = pd.Series([1, 2, 4], index=list("ABD"), name="x") + df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = Series([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame( + exp1 = DataFrame( {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, index=list("ABCDE"), ) - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") + exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 9fc3629e794e2..c08a77d00a96d 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -7,6 +7,18 @@ class TestDataFrameAppend: + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_append_multiindex(self, multiindex_dataframe_random_data, klass): + obj = multiindex_dataframe_random_data + if klass is Series: + obj = obj["A"] + + a = obj[:5] + b = obj[5:] + + result = a.append(b) + tm.assert_equal(result, obj) + def test_append_empty_list(self): # GH 28769 df = DataFrame() @@ -177,9 +189,9 @@ def test_append_dtypes(self): def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): # GH 30238 tz = tz_naive_fixture - df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) + df = DataFrame([pd.Timestamp(timestamp, tz=tz)]) result = df.append(df.iloc[0]).iloc[-1] - expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) + expected = Series(pd.Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -193,7 +205,7 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): ], ) def test_other_dtypes(self, data, dtype): - df = pd.DataFrame(data, dtype=dtype) + df = DataFrame(data, dtype=dtype) result = df.append(df.iloc[0]).iloc[-1] - expected = pd.Series(data, name=0, dtype=dtype) + expected = Series(data, name=0, dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 40b0ec0c0d811..cdcd922949bcf 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -2,13 +2,31 @@ import numpy as np -from pandas import DataFrame, DatetimeIndex, Series, date_range +from pandas import DataFrame, DatetimeIndex, Series, date_range, to_datetime import pandas._testing as tm from pandas.tseries import offsets class TestAsFreq: + def test_asfreq_resample_set_correct_freq(self): + # GH#5613 + # we test if .asfreq() and .resample() set the correct value for .freq + df = DataFrame( + {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} + ) + df = df.set_index(to_datetime(df.date)) + + # testing the settings before calling .asfreq() 
and .resample() + assert df.index.freq is None + assert df.index.inferred_freq == "D" + + # does .asfreq() set .freq correctly? + assert df.asfreq("D").index.freq == "D" + + # does .resample() set .freq correctly? + assert df.resample("D").asfreq().index.freq == "D" + def test_asfreq(self, datetime_frame): offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) rule_monthly = datetime_frame.asfreq("BM") diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index ca62b56664518..2da6c6e3f0a51 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm @@ -100,7 +99,7 @@ def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - expected = pd.DataFrame(res, columns=original.columns, index=original.index) + expected = DataFrame(res, columns=original.columns, index=original.index) if inplace: result = original tm.assert_frame_equal(result, expected, check_exact=True) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 78f265d32f8df..d1f38d90547fd 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -18,7 +18,7 @@ def test_combine_first_mixed(self): b = Series(range(2), index=range(5, 7)) g = DataFrame({"A": a, "B": b}) - exp = pd.DataFrame( + exp = DataFrame( {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] ) combined = f.combine_first(g) @@ -169,13 +169,13 @@ def test_combine_first_mixed_bug(self): def test_combine_first_align_nan(self): # GH 7509 (not fixed) - dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) - dfb = pd.DataFrame([[4], [5]], columns=["b"]) + dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = DataFrame([[4], [5]], columns=["b"]) assert dfa["a"].dtype == "datetime64[ns]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) - exp = pd.DataFrame( + exp = DataFrame( {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, columns=["a", "b"], ) @@ -185,7 +185,7 @@ def test_combine_first_align_nan(self): assert res["b"].dtype == "float64" res = dfa.iloc[:0].combine_first(dfb) - exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # ToDo: this must be datetime64 assert res["a"].dtype == "float64" @@ -195,21 +195,21 @@ def test_combine_first_align_nan(self): def test_combine_first_timezone(self): # see gh-7630 data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") - df1 = pd.DataFrame( + df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") - df2 = pd.DataFrame( + df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) - exp = pd.DataFrame( + exp = DataFrame( { "UTCdatetime": [ pd.Timestamp("2010-01-01 01:01", tz="UTC"), @@ -230,9 +230,9 @@ def test_combine_first_timezone(self): # see gh-10567 dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") - df1 = pd.DataFrame({"DATE": 
dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) @@ -241,11 +241,11 @@ def test_combine_first_timezone(self): dts1 = pd.DatetimeIndex( ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" ) - df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex( ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" ) - df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) + df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.DatetimeIndex( @@ -259,14 +259,14 @@ def test_combine_first_timezone(self): ], tz="US/Eastern", ) - exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) @@ -274,9 +274,9 @@ def test_combine_first_timezone(self): assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-01", "2015-01-03") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) exp_dts = [ @@ -284,41 +284,41 @@ def test_combine_first_timezone(self): pd.Timestamp("2015-01-02", tz="US/Eastern"), pd.Timestamp("2015-01-03"), ] - exp = pd.DataFrame({"DATE": exp_dts}) + exp = DataFrame({"DATE": exp_dts}) tm.assert_frame_equal(res, exp) assert res["DATE"].dtype == "object" def test_combine_first_timedelta(self): data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) - df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7]) data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) - df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) + df2 = DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.TimedeltaIndex( ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] ) - exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["TD"].dtype == "timedelta64[ns]" def test_combine_first_period(self): data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") - df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7]) data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") - df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) + df2 = DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.PeriodIndex( ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" ) - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == data1.dtype # different freq dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") - df2 = pd.DataFrame({"P": 
dts2}, index=[2, 4, 5]) + df2 = DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = [ @@ -329,15 +329,15 @@ def test_combine_first_period(self): pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M"), ] - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == "object" def test_combine_first_int(self): # GH14687 - integer series that do no align exactly - df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") - df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") + df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = DataFrame({"a": [1, 4]}, dtype="int64") res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) @@ -346,10 +346,10 @@ def test_combine_first_int(self): @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 - df1 = pd.DataFrame({"isNum": [val]}) - df2 = pd.DataFrame({"isBool": [True]}) + df1 = DataFrame({"isNum": [val]}) + df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/frame/methods/test_convert.py b/pandas/tests/frame/methods/test_convert.py new file mode 100644 index 0000000000000..50add248f9614 --- /dev/null +++ b/pandas/tests/frame/methods/test_convert.py @@ -0,0 +1,54 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestConvert: + def test_convert_objects(self, float_string_frame): + + oops = float_string_frame.T.T + converted = oops._convert(datetime=True) + tm.assert_frame_equal(converted, float_string_frame) + assert converted["A"].dtype == np.float64 + + # force numeric conversion + float_string_frame["H"] = "1." + float_string_frame["I"] = "1" + + # add in some items that will be nan + length = len(float_string_frame) + float_string_frame["J"] = "1." 
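+        # A rough sketch of the coercion exercised below (assuming
+        # to_numeric-style semantics for _convert(numeric=True)):
+        #     pd.to_numeric(Series(["1.", "garbled"]), errors="coerce")
+        # would yield [1.0, NaN]; the "garbled" entries set on the first
+        # five rows are therefore expected to become NaN on conversion.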
+ float_string_frame["K"] = "1" + float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled" + converted = float_string_frame._convert(datetime=True, numeric=True) + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + assert converted["J"].dtype == "float64" + assert converted["K"].dtype == "float64" + assert len(converted["J"].dropna()) == length - 5 + assert len(converted["K"].dropna()) == length - 5 + + # via astype + converted = float_string_frame.copy() + converted["H"] = converted["H"].astype("float64") + converted["I"] = converted["I"].astype("int64") + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + + # via astype, but errors + converted = float_string_frame.copy() + with pytest.raises(ValueError, match="invalid literal"): + converted["H"].astype("int32") + + # mixed in a single column + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) + result = df._convert(datetime=True, numeric=True) + expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) + tm.assert_frame_equal(result, expected) + + def test_convert_objects_no_conversion(self): + mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) + mixed2 = mixed1._convert(datetime=True) + tm.assert_frame_equal(mixed1, mixed2) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py new file mode 100644 index 0000000000000..cb0da59bc1afa --- /dev/null +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -0,0 +1,28 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestConvertDtypes: + @pytest.mark.parametrize( + "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] + ) + def test_convert_dtypes(self, convert_integer, expected): + # Specific types are tested in tests/series/test_dtypes.py + # Just check that it works for DataFrame here + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + } + ) + result = df.convert_dtypes(True, True, convert_integer, False) + expected = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=expected), + "b": pd.Series(["x", "y", "z"], dtype="string"), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py index 13a93e3efc48c..d738c7139093c 100644 --- a/pandas/tests/frame/methods/test_count.py +++ b/pandas/tests/frame/methods/test_count.py @@ -1,8 +1,29 @@ -from pandas import DataFrame, Series +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series import pandas._testing as tm class TestDataFrameCount: + def test_count_multiindex(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + frame = frame.copy() + frame.index.names = ["a", "b"] + + result = frame.count(level="b") + expected = frame.count(level=1) + tm.assert_frame_equal(result, expected, check_names=False) + + result = frame.count(level="a") + expected = frame.count(level=0) + tm.assert_frame_equal(result, expected, check_names=False) + + msg = "Level x not found" + with pytest.raises(KeyError, match=msg): + frame.count(level="x") + def test_count(self): # corner case frame = DataFrame() @@ -34,3 +55,85 @@ def test_count_objects(self, float_string_frame): tm.assert_series_equal(dm.count(), df.count()) tm.assert_series_equal(dm.count(1), df.count(1)) + + def 
test_count_level_corner(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + ser = frame["A"][:0] + result = ser.count(level=0) + expected = Series(0, index=ser.index.levels[0], name="A") + tm.assert_series_equal(result, expected) + + df = frame[:0] + result = df.count(level=0) + expected = ( + DataFrame( + index=ser.index.levels[0].set_names(["first"]), columns=df.columns + ) + .fillna(0) + .astype(np.int64) + ) + tm.assert_frame_equal(result, expected) + + def test_count_index_with_nan(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame( + { + "Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24.0, 5, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + + # count on row labels + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame( + index=Index(["John", "Myla"], name="Person"), + columns=Index(["Age"]), + data=[2, 2], + ) + tm.assert_frame_equal(res, expected) + + # count on column labels + res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) + expected = DataFrame( + columns=Index(["John", "Myla"], name="Person"), + index=Index(["Age"]), + data=[[2, 2]], + ) + tm.assert_frame_equal(res, expected) + + def test_count_level( + self, + multiindex_year_month_day_dataframe_random_data, + multiindex_dataframe_random_data, + ): + ymd = multiindex_year_month_day_dataframe_random_data + frame = multiindex_dataframe_random_data + + def _check_counts(frame, axis=0): + index = frame._get_axis(axis) + for i in range(index.nlevels): + result = frame.count(axis=axis, level=i) + expected = frame.groupby(axis=axis, level=i).count() + expected = expected.reindex_like(result).astype("i8") + tm.assert_frame_equal(result, expected) + + frame.iloc[1, [1, 2]] = np.nan + frame.iloc[7, [0, 1]] = np.nan + ymd.iloc[1, [1, 2]] = np.nan + ymd.iloc[7, [0, 1]] = np.nan + + _check_counts(frame) + _check_counts(ymd) + _check_counts(frame.T, axis=1) + _check_counts(ymd.T, axis=1) + + # can't call with level on regular DataFrame + df = tm.makeTimeDataFrame() + with pytest.raises(TypeError, match="hierarchical"): + df.count(level=0) + + frame["D"] = "foo" + result = frame.count(level=0, numeric_only=True) + tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index f307acd8c2178..7eeeb245534f5 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -74,10 +74,10 @@ def test_cov_ddof(self, test_ddof): ) def test_cov_nullable_integer(self, other_column): # https://github.com/pandas-dev/pandas/issues/33803 - data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) + data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) result = data.cov() arr = np.array([[0.5, 0.5], [0.5, 1.0]]) - expected = pd.DataFrame(arr, columns=["a", "b"], index=["a", "b"]) + expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -155,7 +155,7 @@ def test_corr_int_and_boolean(self): def test_corr_cov_independent_index_column(self): # GH#14617 - df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + df = DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) for method in ["cov", "corr"]: result = getattr(df, method)() assert result.index is not result.columns @@ -163,7 +163,7 @@ def 
test_corr_cov_independent_index_column(self): def test_corr_invalid_method(self): # GH#22298 - df = pd.DataFrame(np.random.normal(size=(10, 2))) + df = DataFrame(np.random.normal(size=(10, 2))) msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " with pytest.raises(ValueError, match=msg): df.corr(method="____") @@ -186,15 +186,15 @@ def test_corr_int(self): @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) def test_corr_nullable_integer(self, nullable_column, other_column, method): # https://github.com/pandas-dev/pandas/issues/33803 - data = pd.DataFrame({"a": nullable_column, "b": other_column}) + data = DataFrame({"a": nullable_column, "b": other_column}) result = data.corr(method=method) - expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) + expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) def test_corr_item_cache(self): # Check that corr does not lead to incorrect entries in item_cache - df = pd.DataFrame({"A": range(10)}) + df = DataFrame({"A": range(10)}) df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache @@ -275,26 +275,26 @@ def test_corrwith_matches_corrcoef(self): def test_corrwith_mixed_dtypes(self): # GH#18570 - df = pd.DataFrame( + df = DataFrame( {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} ) - s = pd.Series([0, 6, 7, 3]) + s = Series([0, 6, 7, 3]) result = df.corrwith(s) corrs = [df["a"].corr(s), df["b"].corr(s)] - expected = pd.Series(data=corrs, index=["a", "b"]) + expected = Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) def test_corrwith_index_intersection(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=True).index.sort_values() expected = df1.columns.intersection(df2.columns).sort_values() tm.assert_index_equal(result, expected) def test_corrwith_index_union(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=False).index.sort_values() expected = df1.columns.union(df2.columns).sort_values() @@ -302,18 +302,18 @@ def test_corrwith_index_union(self): def test_corrwith_dup_cols(self): # GH#21925 - df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df1 = DataFrame(np.vstack([np.arange(10)] * 3).T) df2 = df1.copy() df2 = pd.concat((df2, df2[0]), axis=1) result = df1.corrwith(df2) - expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + expected = Series(np.ones(4), index=[0, 0, 1, 2]) tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_corrwith_spearman(self): # GH#21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) + df = DataFrame(np.random.random(size=(100, 3))) result = df.corrwith(df ** 2, method="spearman") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @@ -321,7 +321,7 @@ def test_corrwith_spearman(self): @td.skip_if_no_scipy def test_corrwith_kendall(self): # GH#21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) + df = 
DataFrame(np.random.random(size=(100, 3))) result = df.corrwith(df ** 2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 0b70bead375da..0358bc3c04539 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -34,9 +34,9 @@ def test_describe_bool_in_mixed_frame(self): def test_describe_empty_object(self): # GH#27183 - df = pd.DataFrame({"A": [None, None]}, dtype=object) + df = DataFrame({"A": [None, None]}, dtype=object) result = df.describe() - expected = pd.DataFrame( + expected = DataFrame( {"A": [0, 0, np.nan, np.nan]}, dtype=object, index=["count", "unique", "top", "freq"], @@ -48,7 +48,7 @@ def test_describe_empty_object(self): def test_describe_bool_frame(self): # GH#13891 - df = pd.DataFrame( + df = DataFrame( { "bool_data_1": [False, False, True, True], "bool_data_2": [False, True, True, True], @@ -61,7 +61,7 @@ def test_describe_bool_frame(self): ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame( + df = DataFrame( { "bool_data": [False, False, True, True, False], "int_data": [0, 1, 2, 3, 4], @@ -74,7 +74,7 @@ def test_describe_bool_frame(self): ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame( + df = DataFrame( {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} ) result = df.describe() @@ -119,7 +119,7 @@ def test_describe_empty_categorical_column(self): # GH#26397 # Ensure the index of an an empty categorical DataFrame column # also contains (count, unique, top, freq) - df = pd.DataFrame({"empty_col": Categorical([])}) + df = DataFrame({"empty_col": Categorical([])}) result = df.describe() expected = DataFrame( {"empty_col": [0, 0, np.nan, np.nan]}, @@ -198,7 +198,7 @@ def test_describe_timedelta_values(self): # GH#6145 t1 = pd.timedelta_range("1 days", freq="D", periods=5) t2 = pd.timedelta_range("1 hours", freq="H", periods=5) - df = pd.DataFrame({"t1": t1, "t2": t2}) + df = DataFrame({"t1": t1, "t2": t2}) expected = DataFrame( { @@ -249,7 +249,7 @@ def test_describe_tz_values(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) + df = DataFrame({"s1": s1, "s2": s2}) expected = DataFrame( { @@ -271,9 +271,9 @@ def test_describe_tz_values(self, tz_naive_fixture): tm.assert_frame_equal(result, expected) def test_datetime_is_numeric_includes_datetime(self): - df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]}) + df = DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]}) result = df.describe(datetime_is_numeric=True) - expected = pd.DataFrame( + expected = DataFrame( { "a": [ 3, @@ -297,10 +297,10 @@ def test_describe_tz_values2(self): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) + df = DataFrame({"s1": s1, "s2": s2}) s1_ = s1.describe() - s2_ = pd.Series( + s2_ = Series( [ 5, 5, @@ -334,7 +334,7 @@ def test_describe_tz_values2(self): def test_describe_percentiles_integer_idx(self): # GH#26660 - df = pd.DataFrame({"x": [1]}) + df = DataFrame({"x": [1]}) pct = np.linspace(0, 1, 10 + 1) result = df.describe(percentiles=pct) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index e160d5d24d40a..8affcce478cf4 100644 --- 
a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -8,7 +8,7 @@ class TestDataFrameDiff: def test_diff_requires_integer(self): - df = pd.DataFrame(np.random.randn(2, 2)) + df = DataFrame(np.random.randn(2, 2)) with pytest.raises(ValueError, match="periods must be an integer"): df.diff(1.5) @@ -33,10 +33,10 @@ def test_diff(self, datetime_frame): tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) # GH#10907 - df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) + df = DataFrame({"y": Series([2]), "z": Series([3])}) df.insert(0, "x", 1) result = df.diff(axis=1) - expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}) + expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)}) tm.assert_frame_equal(result, expected) def test_diff_timedelta64_with_nat(self): @@ -44,12 +44,10 @@ def test_diff_timedelta64_with_nat(self): arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") arr[:, 0] = np.timedelta64("NaT", "ns") - df = pd.DataFrame(arr) + df = DataFrame(arr) result = df.diff(1, axis=0) - expected = pd.DataFrame( - {0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]} - ) + expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]}) tm.assert_equal(result, expected) result = df.diff(0) @@ -65,20 +63,20 @@ def test_diff_timedelta64_with_nat(self): def test_diff_datetime_axis0_with_nat(self, tz): # GH#32441 dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz) - ser = pd.Series(dti) + ser = Series(dti) df = ser.to_frame() result = df.diff() ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]) - expected = pd.Series(ex_index).to_frame() + expected = Series(ex_index).to_frame() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_with_nat_zero_periods(self, tz): # diff on NaT values should give NaT, not timedelta64(0) dti = pd.date_range("2016-01-01", periods=4, tz=tz) - ser = pd.Series(dti) + ser = Series(dti) df = ser.to_frame() df[1] = ser.copy() @@ -176,7 +174,7 @@ def test_diff_axis(self): def test_diff_period(self): # GH#32995 Don't pass an incorrect axis pi = pd.date_range("2016-01-01", periods=3).to_period("D") - df = pd.DataFrame({"A": pi}) + df = DataFrame({"A": pi}) result = df.diff(1, axis=1) @@ -185,24 +183,24 @@ def test_diff_period(self): def test_diff_axis1_mixed_dtypes(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 - df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) - expected = pd.DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) result = df.diff(axis=1) tm.assert_frame_equal(result, expected) # GH#21437 mixed-float-dtypes - df = pd.DataFrame( + df = DataFrame( {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")} ) result = df.diff(axis=1) - expected = pd.DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) + expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) tm.assert_frame_equal(result, expected) def test_diff_axis1_mixed_dtypes_large_periods(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 - df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) expected = df * np.nan @@ -211,19 +209,19 @@ def 
test_diff_axis1_mixed_dtypes_large_periods(self): def test_diff_axis1_mixed_dtypes_negative_periods(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 - df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) - expected = pd.DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) + expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) result = df.diff(axis=1, periods=-1) tm.assert_frame_equal(result, expected) def test_diff_sparse(self): # GH#28813 .diff() should work for sparse dataframes as well - sparse_df = pd.DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]") + sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]") result = sparse_df.diff() - expected = pd.DataFrame( + expected = DataFrame( [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0) ) @@ -234,7 +232,7 @@ def test_diff_sparse(self): [ ( 0, - pd.DataFrame( + DataFrame( { "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0], "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan], @@ -246,7 +244,7 @@ def test_diff_sparse(self): ), ( 1, - pd.DataFrame( + DataFrame( { "a": np.repeat(np.nan, 8), "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0], @@ -260,7 +258,7 @@ def test_diff_sparse(self): ) def test_diff_integer_na(self, axis, expected): # GH#24171 IntegerNA Support for DataFrame.diff() - df = pd.DataFrame( + df = DataFrame( { "a": np.repeat([0, 1, np.nan, 2], 2), "b": np.tile([0, 1, np.nan, 2], 2), @@ -278,7 +276,7 @@ def test_diff_readonly(self): # https://github.com/pandas-dev/pandas/issues/35559 arr = np.random.randn(5, 2) arr.flags.writeable = False - df = pd.DataFrame(arr) + df = DataFrame(arr) result = df.diff() - expected = pd.DataFrame(np.array(df)).diff() + expected = DataFrame(np.array(df)).diff() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index aa44a2427dc8f..09c10861e87c2 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -21,7 +21,7 @@ def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): # GH 8594 mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) with pytest.raises(KeyError, match=msg): s.drop(labels, level=level) @@ -34,7 +34,7 @@ def test_drop_errors_ignore(labels, level): # GH 8594 mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) expected_s = s.drop(labels, level=level, errors="ignore") tm.assert_series_equal(s, expected_s) @@ -47,7 +47,7 @@ def test_drop_with_non_unique_datetime_index_and_invalid_keys(): # GH 30399 # define dataframe with unique datetime index - df = pd.DataFrame( + df = DataFrame( np.random.randn(5, 3), columns=["a", "b", "c"], index=pd.date_range("2012", freq="H", periods=5), @@ -141,14 +141,14 @@ def test_drop(self): tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 - nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) + nu_df = nu_df.set_index(Index(["X", "Y", "X"])) nu_df.columns = list("abc") tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) tm.assert_frame_equal(nu_df.drop(["X", "Y"], 
axis=0), nu_df.loc[[], :]) # inplace cache issue # GH#5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df = DataFrame(np.random.randn(10, 3), columns=list("abc")) expected = df[~(df.b > 0)] return_value = df.drop(labels=df[df.b > 0].index, inplace=True) assert return_value is None @@ -252,15 +252,15 @@ def test_raise_on_drop_duplicate_index(self, actual): def test_drop_empty_list(self, index, drop_labels): # GH#21494 expected_index = [i for i in index if i not in drop_labels] - frame = pd.DataFrame(index=index).drop(drop_labels) - tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + frame = DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, DataFrame(index=expected_index)) @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) def test_drop_non_empty_list(self, index, drop_labels): # GH# 21494 with pytest.raises(KeyError, match="not found in axis"): - pd.DataFrame(index=index).drop(drop_labels) + DataFrame(index=index).drop(drop_labels) def test_mixed_depth_drop(self): arrays = [ @@ -420,3 +420,24 @@ def test_drop_preserve_names(self): result = df.drop([(0, 2)]) assert result.index.names == ("one", "two") + + @pytest.mark.parametrize( + "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"] + ) + @pytest.mark.parametrize("inplace", [False, True]) + def test_inplace_drop_and_operation(self, operation, inplace): + # GH#30484 + df = DataFrame({"x": range(5)}) + expected = df.copy() + df["y"] = range(5) + y = df["y"] + + with tm.assert_produces_warning(None): + if inplace: + df.drop("y", axis=1, inplace=inplace) + else: + df = df.drop("y", axis=1, inplace=inplace) + + # Perform operation and check result + getattr(y, operation)(1) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py new file mode 100644 index 0000000000000..9cbfee5e663ae --- /dev/null +++ b/pandas/tests/frame/methods/test_dropna.py @@ -0,0 +1,209 @@ +import datetime + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameMissingData: + def test_dropEmptyRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.randn(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + original = Series(mat, index=float_frame.index, name="foo") + expected = original.dropna() + inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna(how="all") + # check that original was preserved + tm.assert_series_equal(frame["foo"], original) + return_value = inplace_frame1.dropna(how="all", inplace=True) + tm.assert_series_equal(smaller_frame["foo"], expected) + tm.assert_series_equal(inplace_frame1["foo"], expected) + assert return_value is None + + smaller_frame = frame.dropna(how="all", subset=["foo"]) + return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + tm.assert_series_equal(smaller_frame["foo"], expected) + tm.assert_series_equal(inplace_frame2["foo"], expected) + assert return_value is None + + def test_dropIncompleteRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.randn(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + frame["bar"] = 5 + original = Series(mat, index=float_frame.index, name="foo") + inp_frame1, inp_frame2 = frame.copy(), frame.copy() + + smaller_frame 
= frame.dropna()
+        tm.assert_series_equal(frame["foo"], original)
+        return_value = inp_frame1.dropna(inplace=True)
+
+        exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
+        tm.assert_series_equal(smaller_frame["foo"], exp)
+        tm.assert_series_equal(inp_frame1["foo"], exp)
+        assert return_value is None
+
+        samesize_frame = frame.dropna(subset=["bar"])
+        tm.assert_series_equal(frame["foo"], original)
+        assert (frame["bar"] == 5).all()
+        return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
+        tm.assert_index_equal(samesize_frame.index, float_frame.index)
+        tm.assert_index_equal(inp_frame2.index, float_frame.index)
+        assert return_value is None
+
+    def test_dropna(self):
+        df = DataFrame(np.random.randn(6, 4))
+        df[2][:2] = np.nan
+
+        dropped = df.dropna(axis=1)
+        expected = df.loc[:, [0, 1, 3]]
+        inp = df.copy()
+        return_value = inp.dropna(axis=1, inplace=True)
+        tm.assert_frame_equal(dropped, expected)
+        tm.assert_frame_equal(inp, expected)
+        assert return_value is None
+
+        dropped = df.dropna(axis=0)
+        expected = df.loc[list(range(2, 6))]
+        inp = df.copy()
+        return_value = inp.dropna(axis=0, inplace=True)
+        tm.assert_frame_equal(dropped, expected)
+        tm.assert_frame_equal(inp, expected)
+        assert return_value is None
+
+        # threshold
+        dropped = df.dropna(axis=1, thresh=5)
+        expected = df.loc[:, [0, 1, 3]]
+        inp = df.copy()
+        return_value = inp.dropna(axis=1, thresh=5, inplace=True)
+        tm.assert_frame_equal(dropped, expected)
+        tm.assert_frame_equal(inp, expected)
+        assert return_value is None
+
+        dropped = df.dropna(axis=0, thresh=4)
+        expected = df.loc[range(2, 6)]
+        inp = df.copy()
+        return_value = inp.dropna(axis=0, thresh=4, inplace=True)
+        tm.assert_frame_equal(dropped, expected)
+        tm.assert_frame_equal(inp, expected)
+        assert return_value is None
+
+        dropped = df.dropna(axis=1, thresh=4)
+        tm.assert_frame_equal(dropped, df)
+
+        dropped = df.dropna(axis=1, thresh=3)
+        tm.assert_frame_equal(dropped, df)
+
+        # subset
+        dropped = df.dropna(axis=0, subset=[0, 1, 3])
+        inp = df.copy()
+        return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
+        tm.assert_frame_equal(dropped, df)
+        tm.assert_frame_equal(inp, df)
+        assert return_value is None
+
+        # all
+        dropped = df.dropna(axis=1, how="all")
+        tm.assert_frame_equal(dropped, df)
+
+        df[2] = np.nan
+        dropped = df.dropna(axis=1, how="all")
+        expected = df.loc[:, [0, 1, 3]]
+        tm.assert_frame_equal(dropped, expected)
+
+        # bad input
+        msg = "No axis named 3 for object type DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            df.dropna(axis=3)
+
+    def test_drop_and_dropna_caching(self):
+        # test that the cacher updates
+        original = Series([1, 2, np.nan], name="A")
+        expected = Series([1, 2], dtype=original.dtype, name="A")
+        df = DataFrame({"A": original.values.copy()})
+        df2 = df.copy()
+        df["A"].dropna()
+        tm.assert_series_equal(df["A"], original)
+
+        ser = df["A"]
+        return_value = ser.dropna(inplace=True)
+        tm.assert_series_equal(ser, expected)
+        tm.assert_series_equal(df["A"], original)
+        assert return_value is None
+
+        df2["A"].drop([1])
+        tm.assert_series_equal(df2["A"], original)
+
+        ser = df2["A"]
+        return_value = ser.drop([1], inplace=True)
+        tm.assert_series_equal(ser, original.drop([1]))
+        tm.assert_series_equal(df2["A"], original)
+        assert return_value is None
+
+    def test_dropna_corner(self, float_frame):
+        # bad input
+        msg = "invalid how option: foo"
+        with pytest.raises(ValueError, match=msg):
+            float_frame.dropna(how="foo")
+        msg = "must specify how or thresh"
+        with pytest.raises(TypeError, match=msg):
+
float_frame.dropna(how=None) + # non-existent column - 8303 + with pytest.raises(KeyError, match=r"^\['X'\]$"): + float_frame.dropna(subset=["A", "X"]) + + def test_dropna_multiple_axes(self): + df = DataFrame( + [ + [1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9], + ] + ) + + # GH20987 + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=[0, 1]) + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=(0, 1)) + + inp = df.copy() + with pytest.raises(TypeError, match="supplying multiple axes"): + inp.dropna(how="all", axis=(0, 1), inplace=True) + + def test_dropna_tz_aware_datetime(self): + # GH13407 + df = DataFrame() + dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) + dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) + df["Time"] = [dt1] + result = df.dropna(axis=0) + expected = DataFrame({"Time": [dt1]}) + tm.assert_frame_equal(result, expected) + + # Ex2 + df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) + result = df.dropna(axis=0) + expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) + tm.assert_frame_equal(result, expected) + + def test_dropna_categorical_interval_index(self): + # GH 25087 + ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) + ci = pd.CategoricalIndex(ii) + df = DataFrame({"A": list("abc")}, index=ci) + + expected = df + result = df.dropna() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py new file mode 100644 index 0000000000000..0105eef435121 --- /dev/null +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -0,0 +1,128 @@ +from datetime import timedelta + +import numpy as np + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import DataFrame, Series, date_range, option_context +import pandas._testing as tm + + +def _check_cast(df, v): + """ + Check if all dtypes of df are equal to v + """ + assert all(s.dtype.name == v for _, s in df.items()) + + +class TestDataFrameDataTypes: + def test_empty_frame_dtypes(self): + empty_df = DataFrame() + tm.assert_series_equal(empty_df.dtypes, Series(dtype=object)) + + nocols_df = DataFrame(index=[1, 2, 3]) + tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object)) + + norows_df = DataFrame(columns=list("abc")) + tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc"))) + + norows_int_df = DataFrame(columns=list("abc")).astype(np.int32) + tm.assert_series_equal( + norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc")) + ) + + df = DataFrame(dict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3]) + ex_dtypes = Series(dict([("a", np.int64), ("b", np.bool_), ("c", np.float64)])) + tm.assert_series_equal(df.dtypes, ex_dtypes) + + # same but for empty slice of df + tm.assert_series_equal(df[:0].dtypes, ex_dtypes) + + def test_datetime_with_tz_dtypes(self): + tzframe = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) + tzframe.iloc[1, 1] = pd.NaT + tzframe.iloc[1, 2] = pd.NaT + result = tzframe.dtypes.sort_index() + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype("ns", "US/Eastern"), + DatetimeTZDtype("ns", "CET"), + ], + ["A", "B", "C"], + ) + + tm.assert_series_equal(result, expected) + + def test_dtypes_are_correct_after_column_slice(self): 
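+        # Sketch of the invariant under test: a column slice should report
+        # dtypes for the sliced columns only, without perturbing the parent
+        # frame's dtypes, e.g. (illustrative):
+        #     df.iloc[:, 2:].dtypes   # only "c"
+        #     df.dtypes               # still a, b, c -> float64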
+ # GH6525 + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + tm.assert_series_equal( + df.dtypes, + Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), + ) + tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series(dict([("c", np.float_)]))) + tm.assert_series_equal( + df.dtypes, + Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), + ) + + def test_dtypes_gh8722(self, float_string_frame): + float_string_frame["bool"] = float_string_frame["A"] > 0 + result = float_string_frame.dtypes + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) + tm.assert_series_equal(result, expected) + + # compat, GH 8722 + with option_context("use_inf_as_na", True): + df = DataFrame([[1]]) + result = df.dtypes + tm.assert_series_equal(result, Series({0: np.dtype("int64")})) + + def test_dtypes_timedeltas(self): + df = DataFrame( + dict( + A=Series(date_range("2012-1-1", periods=3, freq="D")), + B=Series([timedelta(days=i) for i in range(3)]), + ) + ) + result = df.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) + tm.assert_series_equal(result, expected) + + df["C"] = df["A"] + df["B"] + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) + tm.assert_series_equal(result, expected) + + # mixed int types + df["D"] = 1 + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py new file mode 100644 index 0000000000000..c024390297fec --- /dev/null +++ b/pandas/tests/frame/methods/test_equals.py @@ -0,0 +1,23 @@ +from pandas import DataFrame +import pandas._testing as tm + + +class TestEquals: + def test_dataframe_not_equal(self): + # see GH#28839 + df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False + + def test_equals_different_blocks(self): + # GH#9330 + df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) + df1 = df0.reset_index()[["A", "B", "C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype + + # do the real tests + tm.assert_frame_equal(df0, df1) + assert df0.equals(df1) + assert df1.equals(df0) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/methods/test_fillna.py similarity index 62% rename from pandas/tests/frame/test_missing.py rename to pandas/tests/frame/methods/test_fillna.py index 5d3f8e3a2f7c1..9fa1aa65379c5 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,214 +1,22 @@ -import datetime - -import dateutil import numpy as np import pytest -import pandas as pd -from pandas import Categorical, DataFrame, Series, Timestamp, date_range +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + PeriodIndex, + Series, + TimedeltaIndex, + Timestamp, + date_range, +) import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float -class TestDataFrameMissingData: - def test_dropEmptyRows(self, float_frame): - N = len(float_frame.index) - mat = np.random.randn(N) - 
mat[:5] = np.nan - - frame = DataFrame({"foo": mat}, index=float_frame.index) - original = Series(mat, index=float_frame.index, name="foo") - expected = original.dropna() - inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() - - smaller_frame = frame.dropna(how="all") - # check that original was preserved - tm.assert_series_equal(frame["foo"], original) - return_value = inplace_frame1.dropna(how="all", inplace=True) - tm.assert_series_equal(smaller_frame["foo"], expected) - tm.assert_series_equal(inplace_frame1["foo"], expected) - assert return_value is None - - smaller_frame = frame.dropna(how="all", subset=["foo"]) - return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) - tm.assert_series_equal(smaller_frame["foo"], expected) - tm.assert_series_equal(inplace_frame2["foo"], expected) - assert return_value is None - - def test_dropIncompleteRows(self, float_frame): - N = len(float_frame.index) - mat = np.random.randn(N) - mat[:5] = np.nan - - frame = DataFrame({"foo": mat}, index=float_frame.index) - frame["bar"] = 5 - original = Series(mat, index=float_frame.index, name="foo") - inp_frame1, inp_frame2 = frame.copy(), frame.copy() - - smaller_frame = frame.dropna() - tm.assert_series_equal(frame["foo"], original) - return_value = inp_frame1.dropna(inplace=True) - - exp = Series(mat[5:], index=float_frame.index[5:], name="foo") - tm.assert_series_equal(smaller_frame["foo"], exp) - tm.assert_series_equal(inp_frame1["foo"], exp) - assert return_value is None - - samesize_frame = frame.dropna(subset=["bar"]) - tm.assert_series_equal(frame["foo"], original) - assert (frame["bar"] == 5).all() - return_value = inp_frame2.dropna(subset=["bar"], inplace=True) - tm.assert_index_equal(samesize_frame.index, float_frame.index) - tm.assert_index_equal(inp_frame2.index, float_frame.index) - assert return_value is None - - def test_dropna(self): - df = DataFrame(np.random.randn(6, 4)) - df[2][:2] = np.nan - - dropped = df.dropna(axis=1) - expected = df.loc[:, [0, 1, 3]] - inp = df.copy() - return_value = inp.dropna(axis=1, inplace=True) - tm.assert_frame_equal(dropped, expected) - tm.assert_frame_equal(inp, expected) - assert return_value is None - - dropped = df.dropna(axis=0) - expected = df.loc[list(range(2, 6))] - inp = df.copy() - return_value = inp.dropna(axis=0, inplace=True) - tm.assert_frame_equal(dropped, expected) - tm.assert_frame_equal(inp, expected) - assert return_value is None - - # threshold - dropped = df.dropna(axis=1, thresh=5) - expected = df.loc[:, [0, 1, 3]] - inp = df.copy() - return_value = inp.dropna(axis=1, thresh=5, inplace=True) - tm.assert_frame_equal(dropped, expected) - tm.assert_frame_equal(inp, expected) - assert return_value is None - - dropped = df.dropna(axis=0, thresh=4) - expected = df.loc[range(2, 6)] - inp = df.copy() - return_value = inp.dropna(axis=0, thresh=4, inplace=True) - tm.assert_frame_equal(dropped, expected) - tm.assert_frame_equal(inp, expected) - assert return_value is None - - dropped = df.dropna(axis=1, thresh=4) - tm.assert_frame_equal(dropped, df) - - dropped = df.dropna(axis=1, thresh=3) - tm.assert_frame_equal(dropped, df) - - # subset - dropped = df.dropna(axis=0, subset=[0, 1, 3]) - inp = df.copy() - return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) - tm.assert_frame_equal(dropped, df) - tm.assert_frame_equal(inp, df) - assert return_value is None - - # all - dropped = df.dropna(axis=1, how="all") - tm.assert_frame_equal(dropped, df) - - df[2] = np.nan - dropped = df.dropna(axis=1, how="all") - 
expected = df.loc[:, [0, 1, 3]] - tm.assert_frame_equal(dropped, expected) - - # bad input - msg = "No axis named 3 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.dropna(axis=3) - - def test_drop_and_dropna_caching(self): - # tst that cacher updates - original = Series([1, 2, np.nan], name="A") - expected = Series([1, 2], dtype=original.dtype, name="A") - df = pd.DataFrame({"A": original.values.copy()}) - df2 = df.copy() - df["A"].dropna() - tm.assert_series_equal(df["A"], original) - - ser = df["A"] - return_value = ser.dropna(inplace=True) - tm.assert_series_equal(ser, expected) - tm.assert_series_equal(df["A"], original) - assert return_value is None - - df2["A"].drop([1]) - tm.assert_series_equal(df2["A"], original) - - ser = df2["A"] - return_value = ser.drop([1], inplace=True) - tm.assert_series_equal(ser, original.drop([1])) - tm.assert_series_equal(df2["A"], original) - assert return_value is None - - def test_dropna_corner(self, float_frame): - # bad input - msg = "invalid how option: foo" - with pytest.raises(ValueError, match=msg): - float_frame.dropna(how="foo") - msg = "must specify how or thresh" - with pytest.raises(TypeError, match=msg): - float_frame.dropna(how=None) - # non-existent column - 8303 - with pytest.raises(KeyError, match=r"^\['X'\]$"): - float_frame.dropna(subset=["A", "X"]) - - def test_dropna_multiple_axes(self): - df = DataFrame( - [ - [1, np.nan, 2, 3], - [4, np.nan, 5, 6], - [np.nan, np.nan, np.nan, np.nan], - [7, np.nan, 8, 9], - ] - ) - - # GH20987 - with pytest.raises(TypeError, match="supplying multiple axes"): - df.dropna(how="all", axis=[0, 1]) - with pytest.raises(TypeError, match="supplying multiple axes"): - df.dropna(how="all", axis=(0, 1)) - - inp = df.copy() - with pytest.raises(TypeError, match="supplying multiple axes"): - inp.dropna(how="all", axis=(0, 1), inplace=True) - - def test_dropna_tz_aware_datetime(self): - # GH13407 - df = DataFrame() - dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) - dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) - df["Time"] = [dt1] - result = df.dropna(axis=0) - expected = DataFrame({"Time": [dt1]}) - tm.assert_frame_equal(result, expected) - - # Ex2 - df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) - result = df.dropna(axis=0) - expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) - tm.assert_frame_equal(result, expected) - - def test_dropna_categorical_interval_index(self): - # GH 25087 - ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) - ci = pd.CategoricalIndex(ii) - df = pd.DataFrame({"A": list("abc")}, index=ci) - - expected = df - result = df.dropna() - tm.assert_frame_equal(result, expected) - +class TestFillNA: def test_fillna_datetime(self, datetime_frame): tf = datetime_frame tf.loc[tf.index[:5], "A"] = np.nan @@ -251,7 +59,7 @@ def test_fillna_mixed_float(self, mixed_float_frame): _check_mixed_float(result, dtype=dict(C=None)) def test_fillna_empty(self): - # empty frame (GH #2778) + # empty frame (GH#2778) df = DataFrame(columns=["x"]) for m in ["pad", "backfill"]: df.x.fillna(method=m, inplace=True) @@ -290,8 +98,8 @@ def test_fillna_datelike(self): # GH#6344 df = DataFrame( { - "Date": [pd.NaT, Timestamp("2014-1-1")], - "Date2": [Timestamp("2013-1-1"), pd.NaT], + "Date": [NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), NaT], } ) @@ -303,23 +111,23 @@ def test_fillna_datelike(self): def test_fillna_tzaware(self): # with timezone # GH#15855 - df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 
00:00:00+01:00"), pd.NaT]}) - exp = pd.DataFrame( + df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]}) + exp = DataFrame( { "A": [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), ] } ) tm.assert_frame_equal(df.fillna(method="pad"), exp) - df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]}) - exp = pd.DataFrame( + df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = DataFrame( { "A": [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), ] } ) @@ -328,16 +136,16 @@ def test_fillna_tzaware(self): def test_fillna_tzaware_different_column(self): # with timezone in another column # GH#15522 - df = pd.DataFrame( + df = DataFrame( { - "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "A": date_range("20130101", periods=4, tz="US/Eastern"), "B": [1, 2, np.nan, np.nan], } ) result = df.fillna(method="pad") - expected = pd.DataFrame( + expected = DataFrame( { - "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "A": date_range("20130101", periods=4, tz="US/Eastern"), "B": [1.0, 2.0, 2.0, 2.0], } ) @@ -378,7 +186,7 @@ def test_na_actions_categorical(self): # make sure that fillna takes missing values into account c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) - df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) + df = DataFrame({"cats": c, "vals": [1, 2, 3]}) cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) @@ -387,7 +195,7 @@ def test_na_actions_categorical(self): tm.assert_frame_equal(res, df_exp) def test_fillna_categorical_nan(self): - # GH 14021 + # GH#14021 # np.nan should always be a valid filler cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) @@ -408,34 +216,32 @@ def test_fillna_categorical_nan(self): result = df.vals.fillna(np.nan) tm.assert_series_equal(result, df.vals) - idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT] + idx = DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT] ) df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + tm.assert_frame_equal(df.fillna(value=NaT), df) - idx = pd.PeriodIndex( - ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M" - ) + idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M") df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + tm.assert_frame_equal(df.fillna(value=NaT), df) - idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT]) + idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT]) df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + tm.assert_frame_equal(df.fillna(value=NaT), df) def test_fillna_downcast(self): - # GH 15277 + # GH#15277 # infer int64 from float64 - df = pd.DataFrame({"a": [1.0, np.nan]}) + df = DataFrame({"a": [1.0, np.nan]}) result = df.fillna(0, downcast="infer") - expected = pd.DataFrame({"a": [1, 0]}) + expected = DataFrame({"a": [1, 0]}) tm.assert_frame_equal(result, expected) # infer int64 from float64 when fillna value is a dict - df = pd.DataFrame({"a": [1.0, np.nan]}) + df = DataFrame({"a": [1.0, np.nan]}) 
result = df.fillna({"a": 0}, downcast="infer") - expected = pd.DataFrame({"a": [1, 0]}) + expected = DataFrame({"a": [1, 0]}) tm.assert_frame_equal(result, expected) def test_fillna_dtype_conversion(self): @@ -463,8 +269,8 @@ def test_fillna_dtype_conversion(self): tm.assert_frame_equal(result, expected) def test_fillna_datetime_columns(self): - # GH 7095 - df = pd.DataFrame( + # GH#7095 + df = DataFrame( { "A": [-1, -2, np.nan], "B": date_range("20130101", periods=3), @@ -474,7 +280,7 @@ def test_fillna_datetime_columns(self): index=date_range("20130110", periods=3), ) result = df.fillna("?") - expected = pd.DataFrame( + expected = DataFrame( { "A": [-1, -2, "?"], "B": date_range("20130101", periods=3), @@ -485,24 +291,24 @@ def test_fillna_datetime_columns(self): ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame( + df = DataFrame( { "A": [-1, -2, np.nan], - "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT], "C": ["foo", "bar", None], "D": ["foo2", "bar2", None], }, index=date_range("20130110", periods=3), ) result = df.fillna("?") - expected = pd.DataFrame( + expected = DataFrame( { "A": [-1, -2, "?"], - "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"], "C": ["foo", "bar", "?"], "D": ["foo2", "bar2", "?"], }, - index=pd.date_range("20130110", periods=3), + index=date_range("20130110", periods=3), ) tm.assert_frame_equal(result, expected) @@ -631,7 +437,7 @@ def test_fillna_dict_series(self): df.fillna(df.max(1), axis=1) def test_fillna_dataframe(self): - # GH 8377 + # GH#8377 df = DataFrame( { "a": [np.nan, 1, 2, np.nan, np.nan], diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 569b2fe21d1c2..af77db4058b43 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -133,7 +133,7 @@ def test_filter_corner(self): def test_filter_regex_non_string(self): # GH#5798 trying to filter on non-string columns should drop, # not raise - df = pd.DataFrame(np.random.random((3, 2)), columns=["STRING", 123]) + df = DataFrame(np.random.random((3, 2)), columns=["STRING", 123]) result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py new file mode 100644 index 0000000000000..a824a615b5c29 --- /dev/null +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -0,0 +1,42 @@ +from datetime import datetime + +from pandas import DataFrame +import pandas._testing as tm + + +class TestInferObjects: + def test_infer_objects(self): + # GH#11221 + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + df = df.iloc[1:].infer_objects() + + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + # reconstruct frame to verify inference is same + result = df.reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py new file mode 100644 index 0000000000000..0fca4e988b775 --- /dev/null +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest + +from pandas import Categorical, DataFrame + + +@pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + } + ), + True, + ), + # multi-extension + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["a", "b"])}), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["b", "c"])}), + False, + ), + ], +) +def test_is_homogeneous_type(data, expected): + assert data._is_homogeneous_type is expected diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 35d45bd00131b..5e50e63016f26 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -87,7 +87,7 @@ def test_isin_df(self): def test_isin_tuples(self): # GH#16394 - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) df["C"] = list(zip(df["A"], df["B"])) result = df["C"].isin([(1, "a")]) tm.assert_series_equal(result, Series([True, False, False], name="C")) @@ -124,10 +124,10 @@ def test_isin_dupe_self(self): tm.assert_frame_equal(result, expected) def test_isin_against_series(self): - df = pd.DataFrame( + df = DataFrame( {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] ) - s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) + s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) expected["A"].loc["a"] = True expected.loc["d"] = True @@ -193,14 +193,23 @@ def test_isin_empty_datetimelike(self): @pytest.mark.parametrize( "values", [ - pd.DataFrame({"a": [1, 2, 3]}, dtype="category"), - pd.Series([1, 2, 3], dtype="category"), + DataFrame({"a": [1, 2, 3]}, dtype="category"), + Series([1, 2, 3], dtype="category"), ], ) def test_isin_category_frame(self, values): # GH#34256 - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = DataFrame({"a": [True, True, True], "b": [False, False, False]}) result = df.isin(values) tm.assert_frame_equal(result, expected) + + def test_isin_read_only(self): + # https://github.com/pandas-dev/pandas/issues/37174 + arr = np.array([1, 2, 3]) + arr.setflags(write=False) + df = DataFrame([1, 2, 3]) + result = df.isin(arr) + expected = DataFrame([True, True, True]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/methods/test_join.py similarity index 75% rename from pandas/tests/frame/test_join.py rename to pandas/tests/frame/methods/test_join.py index 4d6e675c6765f..eba92cc71a6d0 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, period_range +from pandas 
import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm @@ -222,6 +222,31 @@ def test_suppress_future_warning_with_sort_kw(sort_kw): class TestDataFrameJoin: + def test_join(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + a = frame.loc[frame.index[:5], ["A"]] + b = frame.loc[frame.index[2:], ["B", "C"]] + + joined = a.join(b, how="outer").reindex(frame.index) + expected = frame.copy() + expected.values[np.isnan(joined.values)] = np.nan + + assert not np.isnan(joined.values).all() + + # TODO what should join do with names ? + tm.assert_frame_equal(joined, expected, check_names=False) + + def test_join_segfault(self): + # GH#1532 + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) + # it works! + for how in ["left", "right", "outer"]: + df1.join(df2, how=how) + def test_join_str_datetime(self): str_dates = ["20120209", "20120222"] dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -235,7 +260,7 @@ def test_join_str_datetime(self): def test_join_multiindex_leftright(self): # GH 10741 - df1 = pd.DataFrame( + df1 = DataFrame( [ ["a", "x", 0.471780], ["a", "y", 0.774908], @@ -250,11 +275,11 @@ def test_join_multiindex_leftright(self): columns=["first", "second", "value1"], ).set_index(["first", "second"]) - df2 = pd.DataFrame( - [["a", 10], ["b", 20]], columns=["first", "value2"] - ).set_index(["first"]) + df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index( + ["first"] + ) - exp = pd.DataFrame( + exp = DataFrame( [ [0.471780, 10], [0.774908, 10], @@ -277,7 +302,7 @@ def test_join_multiindex_leftright(self): exp_idx = pd.MultiIndex.from_product( [["a", "b"], ["x", "y", "z"]], names=["first", "second"] ) - exp = pd.DataFrame( + exp = DataFrame( [ [0.471780, 10], [0.774908, 10], @@ -292,3 +317,48 @@ def test_join_multiindex_leftright(self): tm.assert_frame_equal(df1.join(df2, how="right"), exp) tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) + + def test_merge_join_different_levels(self): + # GH#9455 + + # first dataframe + df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]]) + + # second dataframe + columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")]) + df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) + + # merge + columns = ["a", "b", ("c", "c1")] + expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]]) + with tm.assert_produces_warning(UserWarning): + result = pd.merge(df1, df2, on="a") + tm.assert_frame_equal(result, expected) + + # join, see discussion in GH#12219 + columns = ["a", "b", ("a", ""), ("c", "c1")] + expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]]) + with tm.assert_produces_warning(UserWarning): + result = df1.join(df2, on="a") + tm.assert_frame_equal(result, expected) + + def test_frame_join_tzaware(self): + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") + expected = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, expected) + assert result.index.tz.zone == "US/Central" diff --git a/pandas/tests/frame/methods/test_matmul.py 
b/pandas/tests/frame/methods/test_matmul.py new file mode 100644 index 0000000000000..c34bf991ffc4c --- /dev/null +++ b/pandas/tests/frame/methods/test_matmul.py @@ -0,0 +1,82 @@ +import operator + +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +class TestMatMul: + def test_matmul(self): + # matmul test is for GH#10259 + a = DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] + ) + + # DataFrame @ DataFrame + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # DataFrame @ Series + result = operator.matmul(a, b.one) + expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + # np.array @ DataFrame + result = operator.matmul(a.values, b) + assert isinstance(result, DataFrame) + assert result.columns.equals(b.columns) + assert result.index.equals(Index(range(3))) + expected = np.dot(a.values, b.values) + tm.assert_almost_equal(result.values, expected) + + # nested list @ DataFrame (__rmatmul__) + result = operator.matmul(a.values.tolist(), b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_almost_equal(result.values, expected.values) + + # mixed dtype DataFrame @ DataFrame + a["q"] = a.q.round().astype(int) + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # different dtypes DataFrame @ DataFrame + a = a.astype(int) + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # unaligned + df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) + df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) + + with pytest.raises(ValueError, match="aligned"): + operator.matmul(df, df2) + + def test_matmul_message_shapes(self): + # GH#21581 exception message should reflect original shapes, + # not transposed shapes + a = np.random.rand(10, 4) + b = np.random.rand(5, 3) + + df = DataFrame(b) + + msg = r"shapes \(10, 4\) and \(5, 3\) not aligned" + with pytest.raises(ValueError, match=msg): + a @ df + with pytest.raises(ValueError, match=msg): + a.tolist() @ df diff --git a/pandas/tests/frame/methods/test_pop.py b/pandas/tests/frame/methods/test_pop.py index fccb3f10dde45..2926e29e61d56 100644 --- a/pandas/tests/frame/methods/test_pop.py +++ b/pandas/tests/frame/methods/test_pop.py @@ -1,4 +1,6 @@ -from pandas import DataFrame, Series +import numpy as np + +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm @@ -38,3 +40,28 @@ def test_pop_non_unique_cols(self): assert "b" in df.columns assert "a" not in df.columns assert len(df.index) == 2 + + def test_mixed_depth_pop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + df1 = df.copy() + df2 = df.copy() + result = df1.pop("a") + expected = 
df2.pop(("a", "", "")) + tm.assert_series_equal(expected, result, check_names=False) + tm.assert_frame_equal(df1, df2) + assert result.name == "a" + + expected = df1["top"] + df1 = df1.drop(["top"], axis=1) + result = df2.pop("top") + tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(df1, df2) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0b8f1e0495155..5cdd65b8cf6e2 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -11,17 +11,17 @@ class TestDataFrameQuantile: "df,expected", [ [ - pd.DataFrame( + DataFrame( { - 0: pd.Series(pd.arrays.SparseArray([1, 2])), - 1: pd.Series(pd.arrays.SparseArray([3, 4])), + 0: Series(pd.arrays.SparseArray([1, 2])), + 1: Series(pd.arrays.SparseArray([3, 4])), } ), - pd.Series([1.5, 3.5], name=0.5), + Series([1.5, 3.5], name=0.5), ], [ - pd.DataFrame(pd.Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), - pd.Series([1.0], name=0.5), + DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), + Series([1.0], name=0.5), ], ], ) @@ -78,11 +78,11 @@ def test_quantile_date_range(self): # GH 2460 dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") - ser = pd.Series(dti) - df = pd.DataFrame(ser) + ser = Series(dti) + df = DataFrame(ser) result = df.quantile(numeric_only=False) - expected = pd.Series( + expected = Series( ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) @@ -307,7 +307,7 @@ def test_quantile_box(self): res = df.quantile(0.5, numeric_only=False) - exp = pd.Series( + exp = Series( [ pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-02", tz="US/Eastern"), @@ -319,7 +319,7 @@ def test_quantile_box(self): tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = pd.DataFrame( + exp = DataFrame( [ [ pd.Timestamp("2011-01-02"), @@ -376,7 +376,7 @@ def test_quantile_box(self): ) res = df.quantile(0.5, numeric_only=False) - exp = pd.Series( + exp = Series( [ pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-02"), @@ -391,7 +391,7 @@ def test_quantile_box(self): tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = pd.DataFrame( + exp = DataFrame( [ [ pd.Timestamp("2011-01-02"), @@ -506,14 +506,14 @@ def test_quantile_empty_no_rows(self): def test_quantile_empty_no_columns(self): # GH#23925 _get_numeric_data may drop all columns - df = pd.DataFrame(pd.date_range("1/1/18", periods=5)) + df = DataFrame(pd.date_range("1/1/18", periods=5)) df.columns.name = "captain tightpants" result = df.quantile(0.5) - expected = pd.Series([], index=[], name=0.5, dtype=np.float64) + expected = Series([], index=[], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) result = df.quantile([0.5]) - expected = pd.DataFrame([], index=[0.5], columns=[]) + expected = DataFrame([], index=[0.5], columns=[]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py new file mode 100644 index 0000000000000..bc3750a196c5f --- /dev/null +++ b/pandas/tests/frame/methods/test_reindex.py @@ -0,0 +1,866 @@ +from datetime import datetime +import inspect +from itertools import permutations + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + date_range, + isna, +) 
+import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT +import pandas.core.common as com + + +class TestDataFrameSelectReindex: + # These are specific reindex-based tests; other indexing tests should go in + # test_indexing + + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": expected_values}, index=target) + actual = df.reindex(target, method=method) + tm.assert_frame_equal(expected, actual) + + actual = df.reindex(target, method=method, tolerance=1) + tm.assert_frame_equal(expected, actual) + actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) + tm.assert_frame_equal(expected, actual) + + e2 = expected[::-1] + actual = df.reindex(target[::-1], method=method) + tm.assert_frame_equal(e2, actual) + + new_order = [3, 0, 2, 1] + e2 = expected.iloc[new_order] + actual = df.reindex(target[new_order], method=method) + tm.assert_frame_equal(e2, actual) + + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) + actual = df[::-1].reindex(target, method=switched_method) + tm.assert_frame_equal(expected, actual) + + def test_reindex_methods_nearest_special(self): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + 
expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=0.2) + tm.assert_frame_equal(expected, actual) + + expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz(self, tz_aware_fixture): + # GH26683 + tz = tz_aware_fixture + idx = pd.date_range("2019-01-01", periods=5, tz=tz) + df = DataFrame({"x": list(range(5))}, index=idx) + + expected = df.head(3) + actual = df.reindex(idx[:3], method="nearest") + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz_empty_frame(self): + # https://github.com/pandas-dev/pandas/issues/31964 + dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) + df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) + expected = DataFrame(index=dti) + result = df.reindex(dti, method="nearest") + tm.assert_frame_equal(result, expected) + + def test_reindex_frame_add_nat(self): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) + + result = df.reindex(range(15)) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) + + mask = com.isna(result)["B"] + assert mask[-5:].all() + assert not mask[:-5].any() + + def test_reindex_limit(self): + # GH 28631 + data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] + exp_data = [ + ["A", "A", "A"], + ["B", "B", "B"], + ["C", "C", "C"], + ["D", "D", "D"], + ["D", "D", "D"], + [np.nan, np.nan, np.nan], + ] + df = DataFrame(data) + result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + def test_reindex_level(self): + icol = ["jim", "joe", "jolie"] + + def verify_first_level(df, level, idx, check_index_type=True): + def f(val): + return np.nonzero((df[level] == val).to_numpy())[0] + + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + def verify(df, level, idx, indexer, check_index_type=True): + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + + target = [ + ["C", "B", "A"], + ["F", "C", "A", "D"], + ["A"], + ["A", "B", "C"], + ["C", "A", "B"], + ["C", "B"], + ["C", "A"], + ["A", "B"], + ["B", "A", "C"], + ] + + for idx in target: + verify_first_level(df, "jim", idx) + + # reindex by these causes different MultiIndex levels + for idx in [["D", "F"], ["A", "C", "B"]]: + verify_first_level(df, "jim", idx, check_index_type=False) + + verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) + verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) + verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) + verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, "joe", list("edwq"), [0, 4, 5]) + verify(df, "joe", list("wq"), [], check_index_type=False) + + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 
7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + + for idx in permutations(df["jim"].unique()): + for i in range(3): + verify_first_level(df, "jim", idx[: i + 1]) + + i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] + verify(df, "joe", ["1st", "2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] + verify(df, "joe", ["3rd", "2nd", "1st"], i) + + i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] + verify(df, "joe", ["2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] + verify(df, "joe", ["3rd", "1st"], i) + + def test_non_monotonic_reindex_methods(self): + dr = date_range("2013-08-01", periods=6, freq="B") + data = np.random.randn(6, 1) + df = DataFrame(data, index=dr, columns=list("A")) + df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) + # index is not monotonic increasing or decreasing + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="pad") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="ffill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="bfill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="nearest") + + def test_reindex_sparse(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + + def test_reindex(self, float_frame): + datetime_series = tm.makeTimeSeries(nper=30) + + newFrame = float_frame.reindex(datetime_series.index) + + for col in newFrame.columns: + for idx, val in newFrame[col].items(): + if idx in float_frame.index: + if np.isnan(val): + assert np.isnan(float_frame[col][idx]) + else: + assert val == float_frame[col][idx] + else: + assert np.isnan(val) + + for col, series in newFrame.items(): + assert tm.equalContents(series.index, newFrame.index) + emptyFrame = float_frame.reindex(Index([])) + assert len(emptyFrame.index) == 0 + + # Cython code should be unit-tested directly + nonContigFrame = float_frame.reindex(datetime_series.index[::2]) + + for col in nonContigFrame.columns: + for idx, val in nonContigFrame[col].items(): + if idx in float_frame.index: + if np.isnan(val): + assert np.isnan(float_frame[col][idx]) + else: + assert val == float_frame[col][idx] + else: + assert np.isnan(val) + + for col, series in nonContigFrame.items(): + assert tm.equalContents(series.index, nonContigFrame.index) + + # corner cases + + # Same index, copies values but not index if copy=False + newFrame = float_frame.reindex(float_frame.index, copy=False) + assert newFrame.index is float_frame.index + + # length zero + newFrame = float_frame.reindex([]) + assert newFrame.empty + assert len(newFrame.columns) == len(float_frame.columns) + + 
# length zero with columns reindexed with non-empty index + newFrame = float_frame.reindex([]) + newFrame = newFrame.reindex(float_frame.index) + assert len(newFrame.index) == len(float_frame.index) + assert len(newFrame.columns) == len(float_frame.columns) + + # pass non-Index + newFrame = float_frame.reindex(list(datetime_series.index)) + expected = datetime_series.index._with_freq(None) + tm.assert_index_equal(newFrame.index, expected) + + # copy with no axes + result = float_frame.reindex() + tm.assert_frame_equal(result, float_frame) + assert result is not float_frame + + def test_reindex_nan(self): + df = DataFrame( + [[1, 2], [3, 5], [7, 11], [9, 23]], + index=[2, np.nan, 1, 5], + columns=["joe", "jim"], + ) + + i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] + tm.assert_frame_equal(df.reindex(i), df.iloc[j]) + + df.index = df.index.astype("object") + tm.assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) + + # GH10388 + df = DataFrame( + { + "other": ["a", "b", np.nan, "c"], + "date": ["2015-03-22", np.nan, "2012-01-08", np.nan], + "amount": [2, 3, 4, 5], + } + ) + + df["date"] = pd.to_datetime(df.date) + df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1) + + left = df.set_index(["delta", "other", "date"]).reset_index() + right = df.reindex(columns=["delta", "other", "date", "amount"]) + tm.assert_frame_equal(left, right) + + def test_reindex_name_remains(self): + s = Series(np.random.rand(10)) + df = DataFrame(s, index=np.arange(len(s))) + i = Series(np.arange(10), name="iname") + + df = df.reindex(i) + assert df.index.name == "iname" + + df = df.reindex(Index(np.arange(10), name="tmpname")) + assert df.index.name == "tmpname" + + s = Series(np.random.rand(10)) + df = DataFrame(s.T, index=np.arange(len(s))) + i = Series(np.arange(10), name="iname") + df = df.reindex(columns=i) + assert df.columns.name == "iname" + + def test_reindex_int(self, int_frame): + smaller = int_frame.reindex(int_frame.index[::2]) + + assert smaller["A"].dtype == np.int64 + + bigger = smaller.reindex(int_frame.index) + assert bigger["A"].dtype == np.float64 + + smaller = int_frame.reindex(columns=["A", "B"]) + assert smaller["A"].dtype == np.int64 + + def test_reindex_columns(self, float_frame): + new_frame = float_frame.reindex(columns=["A", "B", "E"]) + + tm.assert_series_equal(new_frame["B"], float_frame["B"]) + assert np.isnan(new_frame["E"]).all() + assert "C" not in new_frame + + # Length zero + new_frame = float_frame.reindex(columns=[]) + assert new_frame.empty + + def test_reindex_columns_method(self): + + # GH 14992, reindexing over columns ignored method + df = DataFrame( + data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) + + # default method + result = df.reindex(columns=range(6)) + expected = DataFrame( + data=[ + [np.nan, 11, 12, np.nan, 13, np.nan], + [np.nan, 21, 22, np.nan, 23, np.nan], + [np.nan, 31, 32, np.nan, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + # method='ffill' + result = df.reindex(columns=range(6), method="ffill") + expected = DataFrame( + data=[ + [np.nan, 11, 12, 12, 13, 13], + [np.nan, 21, 22, 22, 23, 23], + [np.nan, 31, 32, 32, 33, 33], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + # method='bfill' + result = df.reindex(columns=range(6), method="bfill") + expected = DataFrame( + data=[ + [11, 11, 12, 13, 13, np.nan], + [21, 
21, 22, 23, 23, np.nan], + [31, 31, 32, 33, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + def test_reindex_axes(self): + # GH 3317, reindexing by both axes loses freq of the index + df = DataFrame( + np.ones((3, 3)), + index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], + columns=["a", "b", "c"], + ) + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + some_cols = ["a", "b"] + + index_freq = df.reindex(index=time_freq).index.freq + both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq + seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq + assert index_freq == both_freq + assert index_freq == seq_freq + + def test_reindex_fill_value(self): + df = DataFrame(np.random.randn(10, 4)) + + # axis=0 + result = df.reindex(list(range(15))) + assert np.isnan(result.values[-5:]).all() + + result = df.reindex(range(15), fill_value=0) + expected = df.reindex(range(15)).fillna(0) + tm.assert_frame_equal(result, expected) + + # axis=1 + result = df.reindex(columns=range(5), fill_value=0.0) + expected = df.copy() + expected[4] = 0.0 + tm.assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value=0) + expected = df.copy() + expected[4] = 0 + tm.assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value="foo") + expected = df.copy() + expected[4] = "foo" + tm.assert_frame_equal(result, expected) + + # other dtypes + df["foo"] = "foo" + result = df.reindex(range(15), fill_value=0) + expected = df.reindex(range(15)).fillna(0) + tm.assert_frame_equal(result, expected) + + def test_reindex_dups(self): + + # GH4746, reindex on duplicate index error messages + arr = np.random.randn(10) + df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]) + + # set index is ok + result = df.copy() + result.index = list(range(len(df))) + expected = DataFrame(arr, index=list(range(len(df)))) + tm.assert_frame_equal(result, expected) + + # reindex fails + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df.reindex(index=list(range(len(df)))) + + def test_reindex_axis_style(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = DataFrame( + {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3] + ) + result = df.reindex([0, 1, 3]) + tm.assert_frame_equal(result, expected) + + result = df.reindex([0, 1, 3], axis=0) + tm.assert_frame_equal(result, expected) + + result = df.reindex([0, 1, 3], axis="index") + tm.assert_frame_equal(result, expected) + + def test_reindex_positional_warns(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]}) + with tm.assert_produces_warning(FutureWarning): + result = df.reindex([0, 1], ["A", "B", "C"]) + + tm.assert_frame_equal(result, expected) + + def test_reindex_axis_style_raises(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex([0, 1], ["A"], axis=1) + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex([0, 1], ["A"], axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], 
axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(columns=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], columns=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify all"): + df.reindex([0, 1], [0], ["A"]) + + # Mixing styles + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="columns") + + # Duplicates + with pytest.raises(TypeError, match="multiple values"): + df.reindex([0, 1], labels=[0, 1]) + + def test_reindex_single_named_indexer(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) + result = df.reindex([0, 1], columns=["A"]) + expected = DataFrame({"A": [1, 2]}) + tm.assert_frame_equal(result, expected) + + def test_reindex_api_equivalence(self): + # https://github.com/pandas-dev/pandas/issues/12392 + # equivalence of the labels/axis and index/columns API's + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) + for res in [res2, res3, res4, res5]: + tm.assert_frame_equal(res1, res) + + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + with tm.assert_produces_warning(FutureWarning) as m: + res1 = df.reindex(["b", "a"], ["e", "d"]) + assert "reindex" in str(m[0].message) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + def test_reindex_boolean(self): + frame = DataFrame( + np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] + ) + + reindexed = frame.reindex(np.arange(10)) + assert reindexed.values.dtype == np.object_ + assert isna(reindexed[0][1]) + + reindexed = frame.reindex(columns=range(3)) + assert reindexed.values.dtype == np.object_ + assert isna(reindexed[1]).all() + + def test_reindex_objects(self, float_string_frame): + reindexed = float_string_frame.reindex(columns=["foo", "A", "B"]) + assert "foo" in reindexed + + reindexed = float_string_frame.reindex(columns=["A", "B"]) + assert "foo" not in reindexed + + def test_reindex_corner(self, int_frame): + index = Index(["a", "b", "c"]) + dm = DataFrame({}).reindex(index=[1, 2, 3]) + reindexed = dm.reindex(columns=index) + tm.assert_index_equal(reindexed.columns, index) + + # ints are weird + smaller = int_frame.reindex(columns=["A", "B", "E"]) + assert smaller["E"].dtype == np.float64 + + def test_reindex_with_nans(self): + df = DataFrame( + [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], + columns=["a", "b"], + index=[100.0, 101.0, np.nan, 102.0, 103.0], + ) + + result = df.reindex(index=[101.0, 102.0, 103.0]) + expected = df.iloc[[1, 3, 4]] + tm.assert_frame_equal(result, expected) + + result = df.reindex(index=[103.0]) + expected = df.iloc[[4]] + 
tm.assert_frame_equal(result, expected) + + result = df.reindex(index=[101.0]) + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + def test_reindex_multi(self): + df = DataFrame(np.random.randn(3, 3)) + + result = df.reindex(index=range(4), columns=range(4)) + expected = df.reindex(list(range(4))).reindex(columns=range(4)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(index=range(4), columns=range(4)) + expected = df.reindex(list(range(4))).reindex(columns=range(4)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(index=range(2), columns=range(2)) + expected = df.reindex(range(2)).reindex(columns=range(2)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"]) + + result = df.reindex(index=[0, 1], columns=["a", "b"]) + expected = df.reindex([0, 1]).reindex(columns=["a", "b"]) + + tm.assert_frame_equal(result, expected) + + def test_reindex_multi_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = pd.MultiIndex.from_product( + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) + df = DataFrame({"a": range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + tm.assert_frame_equal(result, expected) + + def test_reindex_with_categoricalindex(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + }, + index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + ) + + # reindexing + # convert to a regular index + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list("cabe") + + result = df.reindex(Categorical(["a", "e"], categories=cats)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": 
Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # give back the type of categorical that we received + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + ) + # passed duplicate indexers are not allowed + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df2.reindex(["a", "b"]) + + # args NotImplemented ATM + msg = r"argument {} is not implemented for CategoricalIndex\.reindex" + with pytest.raises(NotImplementedError, match=msg.format("method")): + df.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + df.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + df.reindex(["a"], limit=2) + + def test_reindex_signature(self): + sig = inspect.signature(DataFrame.reindex) + parameters = set(sig.parameters) + assert parameters == { + "self", + "labels", + "index", + "columns", + "axis", + "limit", + "copy", + "level", + "method", + "fill_value", + "tolerance", + } diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index eb908e9472fe2..857dd0ad7268b 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -1,13 +1,37 @@ from collections import ChainMap +import inspect import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex +from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm class TestRename: + def test_rename_signature(self): + sig = inspect.signature(DataFrame.rename) + parameters = set(sig.parameters) + assert parameters == { + "self", + "mapper", + "index", + "columns", + "axis", + "inplace", + "copy", + "level", + "errors", + } + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_rename_mi(self, klass): + obj = klass( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) + obj.rename(str.lower) + def test_rename(self, float_frame): mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a9cf840470ae0..2c909ab2f8227 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -553,13 +553,13 @@ def test_regex_replace_dict_nested(self, mix_abc): def test_regex_replace_dict_nested_non_first_character(self): # GH 25259 - df = pd.DataFrame({"first": ["abc", "bca", "cab"]}) - expected = pd.DataFrame({"first": [".bc", "bc.", "c.b"]}) + df = DataFrame({"first": ["abc", "bca", "cab"]}) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) result = df.replace({"a": "."}, regex=True) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): - df = pd.DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + df = DataFrame({"Type": ["Q", "T", 
"Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) @@ -669,11 +669,11 @@ def test_replace(self, datetime_frame): # GH 11698 # test for mixed data types. - df = pd.DataFrame( + df = DataFrame( [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] ) df1 = df.replace("-", np.nan) - expected_df = pd.DataFrame( + expected_df = DataFrame( [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] ) tm.assert_frame_equal(df1, expected_df) @@ -711,8 +711,8 @@ def test_replace_list(self): def test_replace_with_empty_list(self): # GH 21977 - s = pd.Series([["a", "b"], [], np.nan, [1]]) - df = pd.DataFrame({"col": s}) + s = Series([["a", "b"], [], np.nan, [1]]) + df = DataFrame({"col": s}) expected = df result = df.replace([], np.nan) tm.assert_frame_equal(result, expected) @@ -1162,7 +1162,7 @@ def test_replace_with_dict_with_bool_keys(self): def test_replace_dict_strings_vs_ints(self): # GH#34789 - df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + df = DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) result = df.replace({"replace_string": "test"}) tm.assert_frame_equal(result, df) @@ -1196,14 +1196,14 @@ def test_nested_dict_overlapping_keys_replace_str(self): tm.assert_frame_equal(result, expected) def test_replace_swapping_bug(self): - df = pd.DataFrame({"a": [True, False, True]}) + df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) - df = pd.DataFrame({"a": [0, 1, 0]}) + df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) def test_replace_period(self): @@ -1221,7 +1221,7 @@ def test_replace_period(self): } } - df = pd.DataFrame( + df = DataFrame( [ "out_augmented_AUG_2012.json", "out_augmented_SEP_2013.json", @@ -1255,7 +1255,7 @@ def test_replace_datetime(self): } } - df = pd.DataFrame( + df = DataFrame( [ "out_augmented_AUG_2012.json", "out_augmented_SEP_2013.json", @@ -1453,9 +1453,9 @@ def test_replace_commutative(self, df, to_replace, exp): # DataFrame.replace() overwrites when values are non-numeric # also added to data frame whilst issue was for series - df = pd.DataFrame(df) + df = DataFrame(df) - expected = pd.DataFrame(exp) + expected = DataFrame(exp) result = df.replace(to_replace) tm.assert_frame_equal(result, expected) @@ -1471,22 +1471,22 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = pd.DataFrame(["a"]) + df = DataFrame(["a"]) result = df.replace({"a": replacer, "b": replacer}) - expected = pd.DataFrame([replacer]) + expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) def test_replace_after_convert_dtypes(self): # GH31517 - df = pd.DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64") + df = DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64") result = df.replace(1, 10) - expected = pd.DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") + expected = DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") tm.assert_frame_equal(result, expected) def test_replace_invalid_to_replace(self): # GH 18634 # API: replace() should raise an exception if invalid argument is given - df = pd.DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", 
"f "]}) + df = DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]}) msg = ( r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" @@ -1498,17 +1498,17 @@ def test_replace_invalid_to_replace(self): @pytest.mark.parametrize("value", [np.nan, pd.NA]) def test_replace_no_replacement_dtypes(self, dtype, value): # https://github.com/pandas-dev/pandas/issues/32988 - df = pd.DataFrame(np.eye(2), dtype=dtype) + df = DataFrame(np.eye(2), dtype=dtype) result = df.replace(to_replace=[None, -np.inf, np.inf], value=value) tm.assert_frame_equal(result, df) @pytest.mark.parametrize("replacement", [np.nan, 5]) def test_replace_with_duplicate_columns(self, replacement): # GH 24798 - result = pd.DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [7, 8, 9]}) + result = DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [7, 8, 9]}) result.columns = list("AAB") - expected = pd.DataFrame( + expected = DataFrame( {"A": [1, 2, 3], "A1": [4, 5, 6], "B": [replacement, 8, 9]} ) expected.columns = list("AAB") @@ -1525,9 +1525,9 @@ def test_replace_period_ignore_float(self): Regression test for GH#34871: if df.replace(1.0, 0.0) is called on a df with a Period column the old, faulty behavior is to raise TypeError. """ - df = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3}) + df = DataFrame({"Per": [pd.Period("2020-01")] * 3}) result = df.replace(1.0, 0.0) - expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3}) + expected = DataFrame({"Per": [pd.Period("2020-01")] * 3}) tm.assert_frame_equal(expected, result) def test_replace_value_category_type(self): @@ -1545,7 +1545,7 @@ def test_replace_value_category_type(self): "col5": ["obj1", "obj2", "obj3", "obj4"], } # explicitly cast columns as category and order them - input_df = pd.DataFrame(data=input_dict).astype( + input_df = DataFrame(data=input_dict).astype( {"col2": "category", "col4": "category"} ) input_df["col2"] = input_df["col2"].cat.reorder_categories( @@ -1564,7 +1564,7 @@ def test_replace_value_category_type(self): "col5": ["obj9", "obj2", "obj3", "obj4"], } # explicitly cast columns as category and order them - expected = pd.DataFrame(data=expected_dict).astype( + expected = DataFrame(data=expected_dict).astype( {"col2": "category", "col4": "category"} ) expected["col2"] = expected["col2"].cat.reorder_categories( @@ -1594,14 +1594,14 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d # create input dataframe input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} # explicitly cast columns as category - input_df = pd.DataFrame(data=input_dict).astype( + input_df = DataFrame(data=input_dict).astype( {"col1": "category", "col2": "category", "col3": "category"} ) # create expected dataframe expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]} # explicitly cast columns as category - expected = pd.DataFrame(data=expected_dict).astype( + expected = DataFrame(data=expected_dict).astype( {"col1": "category", "col2": "category", "col3": "category"} ) @@ -1612,23 +1612,23 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d def test_replace_with_compiled_regex(self): # https://github.com/pandas-dev/pandas/issues/35680 - df = pd.DataFrame(["a", "b", "c"]) + df = DataFrame(["a", "b", "c"]) regex = re.compile("^a$") result = df.replace({regex: "z"}, regex=True) - expected = pd.DataFrame(["z", "b", "c"]) + expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) def 
test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 - df = pd.DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = pd.DataFrame({"a": ["x", "x"]}) + expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): # GH: 16784 columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} - df1 = pd.DataFrame({"positive": np.ones(3)}) + df1 = DataFrame({"positive": np.ones(3)}) result = df1.replace(columns_values_map) - expected = pd.DataFrame({"positive": np.ones(3)}) + expected = DataFrame({"positive": np.ones(3)}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index b88ef0e6691cb..3be45e2d48e19 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -1,8 +1,11 @@ from datetime import datetime +from itertools import product import numpy as np import pytest +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype + import pandas as pd from pandas import ( DataFrame, @@ -18,6 +21,30 @@ class TestResetIndex: + def test_set_reset(self): + + idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == np.dtype("uint64") + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + + def test_set_index_reset_index_dt64tz(self): + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + def test_reset_index_tz(self, tz_aware_fixture): # GH 3950 # reset_index with single level @@ -301,6 +328,220 @@ def test_reset_index_range(self): ) tm.assert_frame_equal(result, expected) + def test_reset_index_multiindex_columns(self): + levels = [["A", ""], ["B", "b"]] + df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + result = df[["B"]].rename_axis("A").reset_index() + tm.assert_frame_equal(result, df) + + # GH#16120: already existing column + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.rename_axis("A").reset_index() + + # GH#16164: multiindex (tuple) full key + result = df.set_index([("A", "")]).reset_index() + tm.assert_frame_equal(result, df) + + # with additional (unnamed) index level + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) + ) + expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) + result = df.set_index([("B", "b")], append=True).reset_index() + tm.assert_frame_equal(result, expected) + + # with index name which is a too long tuple... + msg = "Item must have length equal to number of levels." + with pytest.raises(ValueError, match=msg): + df.rename_axis([("C", "c", "i")]).reset_index() + + # or too short... 
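# ---------------------------------------------------------------------
# A minimal, hedged sketch of the reset_index(col_fill=..., col_level=...)
# behaviour the surrounding test exercises. The frame and names here are
# hypothetical illustrations, not taken from the test suite.
import pandas as pd

dfx = pd.DataFrame({"x": [1, 2]}, index=pd.Index([10, 20], name="idx"))
dfx.columns = pd.MultiIndex.from_tuples([("x", "i")])
# With MultiIndex columns, reset_index places the index name "idx" into one
# column level (col_level, default 0) and fills the remaining levels with
# col_fill (default ""):
out = dfx.reset_index(col_fill="meta")
assert list(out.columns) == [("idx", "meta"), ("x", "i")]
# Below, the test continues with an index-name tuple that is too short.
# ---------------------------------------------------------------------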
+ levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) + expected = pd.concat([idx_col, df2], axis=1) + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") + tm.assert_frame_equal(result, expected) + + # ... which is incompatible with col_fill=None + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) + + # with col_level != 0 + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") + tm.assert_frame_equal(result, expected) + + def test_reset_index_datetime(self, tz_naive_fixture): + # GH#3950 + tz = tz_naive_fixture + idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx = MultiIndex.from_arrays([idx1, idx2]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime(2011, 1, 1), + datetime(2011, 1, 2), + datetime(2011, 1, 3), + datetime(2011, 1, 4), + datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + + tm.assert_frame_equal(df.reset_index(), expected) + + idx3 = pd.date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) + idx = MultiIndex.from_arrays([idx1, idx2, idx3]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime(2011, 1, 1), + datetime(2011, 1, 2), + datetime(2011, 1, 3), + datetime(2011, 1, 4), + datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "idx3": [ + datetime(2012, 1, 1), + datetime(2012, 2, 1), + datetime(2012, 3, 1), + datetime(2012, 4, 1), + datetime(2012, 5, 1), + ], + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + expected["idx3"] = expected["idx3"].apply( + lambda d: Timestamp(d, tz="Europe/Paris") + ) + tm.assert_frame_equal(df.reset_index(), expected) + + # GH#7793 + idx = MultiIndex.from_product( + [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)] + ) + df = DataFrame( + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + ] + * 2, + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + expected["level_1"] = expected["level_1"].apply( + lambda d: Timestamp(d, freq="D", tz=tz) + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_period(self): + # GH#7746 + idx = MultiIndex.from_product( + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + 
+ [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", "c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + columns=["month", "feature", "a"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_delevel_infer_dtype(self): + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) + index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) + df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) + deleveled = df.reset_index() + assert is_integer_dtype(deleveled["prm1"]) + assert is_float_dtype(deleveled["prm2"]) + + def test_reset_index_with_drop( + self, multiindex_year_month_day_dataframe_random_data + ): + ymd = multiindex_year_month_day_dataframe_random_data + + deleveled = ymd.reset_index(drop=True) + assert len(deleveled.columns) == len(ymd.columns) + assert deleveled.index.name == ymd.index.name + + @pytest.mark.parametrize( + "ix_data, exp_data", + [ + ( + [(pd.NaT, 1), (pd.NaT, 2)], + {"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timestamp("2020-01-01"), 2)], + {"a": [pd.NaT, pd.Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + ), + ], + ) + def test_reset_index_nat_multiindex(self, ix_data, exp_data): + # GH#36541: that reset_index() does not raise ValueError + ix = MultiIndex.from_tuples(ix_data, names=["a", "b"]) + result = DataFrame({"x": [11, 12]}, index=ix) + result = result.reset_index() + + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "array, dtype", diff --git a/pandas/tests/frame/methods/test_round.py b/pandas/tests/frame/methods/test_round.py index 3051f27882fb8..5cf5aea8846c5 100644 --- a/pandas/tests/frame/methods/test_round.py +++ b/pandas/tests/frame/methods/test_round.py @@ -168,7 +168,7 @@ def test_round_mixed_type(self): def test_round_with_duplicate_columns(self): # GH#11611 - df = pd.DataFrame( + df = DataFrame( np.random.random([3, 3]), columns=["A", "B", "C"], index=["first", "second", "third"], @@ -178,7 +178,7 @@ def test_round_with_duplicate_columns(self): rounded = dfs.round() tm.assert_index_equal(rounded.index, dfs.index) - decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) + decimals = Series([1, 0, 2], index=["A", "B", "A"]) msg = "Index of decimals must be unique" with pytest.raises(ValueError, match=msg): df.round(decimals) @@ -195,7 +195,7 @@ def test_round_builtin(self): def test_round_nonunique_categorical(self): # See GH#21809 idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) - df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) + df = DataFrame(np.random.rand(6, 3), columns=list("abc")) expected = df.round(3) expected.index = idx diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index ebe7eabd53b46..aea8caff5936b 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -3,11 +3,32 @@ import numpy as np import pytest -from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series, date_range +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + date_range, + period_range, + to_datetime, +) import pandas._testing as tm class TestSetIndex: + def test_set_index_multiindex(self): + # segfault in GH#3308 + d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} + df = 
DataFrame(d) + tuples = [(0, 1), (0, 2), (1, 2)] + df["tuples"] = tuples + + index = MultiIndex.from_tuples(df["tuples"]) + # it works! + df.set_index(index) + def test_set_index_empty_column(self): # GH#1971 df = DataFrame( @@ -352,6 +373,127 @@ def test_construction_with_categorical_index(self): idf = idf.reset_index().set_index("B") tm.assert_index_equal(idf.index, ci) + def test_set_index_preserve_categorical_dtype(self): + # GH#13743, GH#13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) + + def test_set_index_datetime(self): + # GH#3950 + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + "value": range(6), + } + ) + df.index = to_datetime(df.pop("datetime"), utc=True) + df.index = df.index.tz_convert("US/Pacific") + + expected = DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", + ) + expected = expected.tz_localize("UTC").tz_convert("US/Pacific") + + df = df.set_index("label", append=True) + tm.assert_index_equal(df.index.levels[0], expected) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) + assert df.index.names == ["datetime", "label"] + + df = df.swaplevel(0, 1) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) + tm.assert_index_equal(df.index.levels[1], expected) + assert df.index.names == ["label", "datetime"] + + df = DataFrame(np.random.random(6)) + idx1 = DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Eastern", + ) + idx2 = DatetimeIndex( + [ + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + ], + tz="US/Eastern", + ) + idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") + idx3 = idx3._with_freq(None) + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Eastern", + ) + expected2 = DatetimeIndex( + ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" + ) + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + # GH#7092 + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + + def test_set_index_period(self): + # GH#6631 + df = DataFrame(np.random.random(6)) + idx1 = period_range("2011-01-01", periods=3, freq="M") + idx1 = idx1.append(idx1) + idx2 = period_range("2013-01-01 09:00", periods=2, freq="H") + idx2 = idx2.append(idx2).append(idx2) + idx3 = period_range("2005", periods=6, freq="A") + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, 
append=True) + + expected1 = period_range("2011-01-01", periods=3, freq="M") + expected2 = period_range("2013-01-01 09:00", periods=2, freq="H") + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + class TestSetIndexInvalid: def test_set_index_verify_integrity(self, frame_of_index_cols): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 8f6902eca816f..2e21ce8ec2256 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -92,8 +92,8 @@ def test_shift_bool(self): def test_shift_categorical(self): # GH#9416 - s1 = pd.Series(["a", "b", "c"], dtype="category") - s2 = pd.Series(["A", "B", "C"], dtype="category") + s1 = Series(["a", "b", "c"], dtype="category") + s2 = Series(["A", "B", "C"], dtype="category") df = DataFrame({"one": s1, "two": s2}) rs = df.shift(1) xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) @@ -131,7 +131,7 @@ def test_shift_duplicate_columns(self): shifted = [] for columns in column_lists: - df = pd.DataFrame(data.copy(), columns=columns) + df = DataFrame(data.copy(), columns=columns) for s in range(5): df.iloc[:, s] = df.iloc[:, s].shift(s + 1) df.columns = range(5) @@ -147,8 +147,8 @@ def test_shift_duplicate_columns(self): def test_shift_axis1_multiple_blocks(self): # GH#35488 - df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3))) - df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2))) + df1 = DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = DataFrame(np.random.randint(1000, size=(5, 2))) df3 = pd.concat([df1, df2], axis=1) assert len(df3._mgr.blocks) == 2 @@ -274,23 +274,21 @@ def test_datetime_frame_shift_with_freq_error(self, datetime_frame): def test_shift_dt64values_int_fill_deprecated(self): # GH#31971 - ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) df = ser.to_frame() with tm.assert_produces_warning(FutureWarning): result = df.shift(1, fill_value=0) - expected = pd.Series([pd.Timestamp(0), ser[0]]).to_frame() + expected = Series([pd.Timestamp(0), ser[0]]).to_frame() tm.assert_frame_equal(result, expected) # axis = 1 - df2 = pd.DataFrame({"A": ser, "B": ser}) + df2 = DataFrame({"A": ser, "B": ser}) df2._consolidate_inplace() with tm.assert_produces_warning(FutureWarning): result = df2.shift(1, axis=1, fill_value=0) - expected = pd.DataFrame( - {"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]} - ) + expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index a106702aff807..16d451a12efc0 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -2,7 +2,16 @@ import pytest import pandas as pd -from pandas import CategoricalDtype, DataFrame, Index, IntervalIndex, MultiIndex, Series +from pandas import ( + CategoricalDtype, + CategoricalIndex, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + Series, + Timestamp, +) import pandas._testing as tm @@ -352,7 +361,7 @@ def test_sort_index_multiindex(self, level): expected_mi = 
MultiIndex.from_tuples( [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") ) - expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) + expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) tm.assert_frame_equal(result, expected) @@ -360,7 +369,7 @@ def test_sort_index_multiindex(self, level): expected_mi = MultiIndex.from_tuples( [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") ) - expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) + expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) tm.assert_frame_equal(result, expected) @@ -487,7 +496,7 @@ def test_sort_index_categorical_multiindex(self): columns=["a"], index=MultiIndex( levels=[ - pd.CategoricalIndex( + CategoricalIndex( ["c", "a", "b"], categories=["c", "a", "b"], ordered=True, @@ -662,6 +671,100 @@ def test_sort_index_level_mixed(self): sorted_after.drop([("foo", "three")], axis=1), ) + def test_sort_index_preserve_levels(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + result = frame.sort_index() + assert result.index.names == frame.index.names + + @pytest.mark.parametrize( + "gen,extra", + [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ], + ) + def test_sort_index_multilevel_repr_8017(self, gen, extra): + + np.random.seed(0) + data = np.random.randn(3, 4) + + columns = MultiIndex.from_tuples([("red", i) for i in gen]) + df = DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. 
only one header line for "red"
+        assert str(df2).splitlines()[0].split() == ["red"]
+
+        # GH 8017
+        # sorting fails after columns added
+
+        # construct single-dtype then sort
+        result = df.copy().sort_index(axis=1)
+        expected = df.iloc[:, [0, 2, 1, 3]]
+        tm.assert_frame_equal(result, expected)
+
+        result = df2.sort_index(axis=1)
+        expected = df2.iloc[:, [0, 2, 1, 4, 3]]
+        tm.assert_frame_equal(result, expected)
+
+        # setitem then sort
+        result = df.copy()
+        result[("red", extra)] = "world"
+
+        result = result.sort_index(axis=1)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "categories",
+        [
+            pytest.param(["a", "b", "c"], id="str"),
+            pytest.param(
+                [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)],
+                id="pd.Interval",
+            ),
+        ],
+    )
+    def test_sort_index_with_categories(self, categories):
+        # GH#23452
+        df = DataFrame(
+            {"foo": range(len(categories))},
+            index=CategoricalIndex(
+                data=categories, categories=categories, ordered=True
+            ),
+        )
+        df.index = df.index.reorder_categories(df.index.categories[::-1])
+        result = df.sort_index()
+        expected = DataFrame(
+            {"foo": reversed(range(len(categories)))},
+            index=CategoricalIndex(
+                data=categories[::-1], categories=categories[::-1], ordered=True
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
 class TestDataFrameSortIndexKey:
     def test_sort_multi_index_key(self):
@@ -736,14 +839,14 @@ def test_sort_multi_index_key_str(self):
         tm.assert_frame_equal(result, expected)

     def test_changes_length_raises(self):
-        df = pd.DataFrame({"A": [1, 2, 3]})
+        df = DataFrame({"A": [1, 2, 3]})
         with pytest.raises(ValueError, match="change the shape"):
             df.sort_index(key=lambda x: x[:1])

     def test_sort_index_multiindex_sparse_column(self):
         # GH 29735, testing that sort_index on a multiindexed frame with sparse
         # columns fills with 0.
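# ---------------------------------------------------------------------
# A hedged aside on the sparse behaviour referenced above (toy data, not
# from the test): a SparseDtype carries an explicit fill_value, so slots
# that are absent from storage read back as that value rather than NaN.
import numpy as np
import pandas as pd

sparse_vals = pd.array([1.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0))
ser = pd.Series(sparse_vals)
assert ser.sparse.fill_value == 0.0
assert np.asarray(ser).tolist() == [1.0, 0.0, 0.0, 0.0]  # densifies on demand
# The GH 29735 test below builds its expected frame from such arrays.
# ---------------------------------------------------------------------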
- expected = pd.DataFrame( + expected = DataFrame( { i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) for i in range(0, 4) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 0ca232ec433e7..7a1f0a35e1486 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import Categorical, DataFrame, NaT, Timestamp, date_range import pandas._testing as tm @@ -130,7 +132,7 @@ def test_sort_values_multicolumn_uint64(self): # GH#9918 # uint64 multicolumn sort - df = pd.DataFrame( + df = DataFrame( { "a": pd.Series([18446637057563306014, 1162265347240853609]), "b": pd.Series([1, 2]), @@ -139,7 +141,7 @@ def test_sort_values_multicolumn_uint64(self): df["a"] = df["a"].astype(np.uint64) result = df.sort_values(["a", "b"]) - expected = pd.DataFrame( + expected = DataFrame( { "a": pd.Series([18446637057563306014, 1162265347240853609]), "b": pd.Series([1, 2]), @@ -355,14 +357,14 @@ def test_sort_nat(self): Timestamp(x) for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] ] - df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] d4 = [ Timestamp(x) for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] ] - expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) sorted_df = df.sort_values(by=["a", "b"]) tm.assert_frame_equal(sorted_df, expected) @@ -381,7 +383,7 @@ def test_sort_values_na_position_with_categories(self): reversed_category_indices = sorted(category_indices, reverse=True) reversed_na_indices = sorted(na_indices) - df = pd.DataFrame( + df = DataFrame( { column_name: pd.Categorical( ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True @@ -461,19 +463,19 @@ def test_sort_values_nat(self): Timestamp(x) for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] ] - df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] d4 = [ Timestamp(x) for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] ] - expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) sorted_df = df.sort_values(by=["a", "b"]) tm.assert_frame_equal(sorted_df, expected) def test_sort_values_na_position_with_categories_raises(self): - df = pd.DataFrame( + df = DataFrame( { "c": pd.Categorical( ["A", np.nan, "B", np.nan, "C"], @@ -525,7 +527,7 @@ def test_sort_values_ignore_index( def test_sort_values_nat_na_position_default(self): # GH 13230 - expected = pd.DataFrame( + expected = DataFrame( { "A": [1, 2, 3, 4, 4], "date": pd.DatetimeIndex( @@ -666,7 +668,7 @@ def test_sort_values_key_empty(self, sort_by_key): df.sort_index(key=sort_by_key) def test_changes_length_raises(self): - df = pd.DataFrame({"A": [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 3]}) with pytest.raises(ValueError, match="change the shape"): df.sort_values("A", key=lambda x: x[:1]) @@ -696,7 +698,7 @@ def test_sort_values_key_dict_axis(self): def test_sort_values_key_casts_to_categorical(self, ordered): # 
https://github.com/pandas-dev/pandas/issues/36383
         categories = ["c", "b", "a"]
-        df = pd.DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})
+        df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})
         def sorter(key):
             if key.name == "y":
@@ -706,8 +708,95 @@ def sorter(key):
             return key
         result = df.sort_values(by=["x", "y"], key=sorter)
-        expected = pd.DataFrame(
+        expected = DataFrame(
             {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0])
         )
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.fixture
+def df_none():
+    return DataFrame(
+        {
+            "outer": ["a", "a", "a", "b", "b", "b"],
+            "inner": [1, 2, 2, 2, 1, 1],
+            "A": np.arange(6, 0, -1),
+            ("B", 5): ["one", "one", "two", "two", "one", "one"],
+        }
+    )
+
+
+@pytest.fixture(params=[["outer"], ["outer", "inner"]])
+def df_idx(request, df_none):
+    levels = request.param
+    return df_none.set_index(levels)
+
+
+@pytest.fixture(
+    params=[
+        "inner",  # index level
+        ["outer"],  # list of index level
+        "A",  # column
+        [("B", 5)],  # list of column
+        ["inner", "outer"],  # two index levels
+        [("B", 5), "outer"],  # index level and column
+        ["A", ("B", 5)],  # two columns
+        ["inner", "outer", "A"],  # two index levels and a column
+    ]
+)
+def sort_names(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def ascending(request):
+    return request.param
+
+
+class TestSortValuesLevelAsStr:
+    def test_sort_index_level_and_column_label(
+        self, df_none, df_idx, sort_names, ascending
+    ):
+        # GH#14353
+
+        # Get index levels from df_idx
+        levels = df_idx.index.names
+
+        # Compute expected by sorting on columns and then setting the index
+        expected = df_none.sort_values(
+            by=sort_names, ascending=ascending, axis=0
+        ).set_index(levels)
+
+        # Compute result by sorting on a mix of columns and index levels
+        result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_sort_column_level_and_index_label(
+        self, df_none, df_idx, sort_names, ascending
+    ):
+        # GH#14353
+
+        # Get levels from df_idx
+        levels = df_idx.index.names
+
+        # Compute expected by sorting on axis=0, setting index levels, and then
+        # transposing. For some cases this will result in a frame with
+        # multiple column levels
+        expected = (
+            df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
+            .set_index(levels)
+            .T
+        )
+
+        # Compute result by transposing and sorting on axis=1.
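# ---------------------------------------------------------------------
# A hedged illustration (toy frame, hypothetical names) of the GH#14353
# feature under test: sort_values(by=...) accepts index level names
# alongside column labels.
import pandas as pd

toy = pd.DataFrame({"outer": list("bbaa"), "A": [2, 1, 4, 3]}).set_index("outer")
sorted_toy = toy.sort_values(by=["outer", "A"])  # level name + column label
assert list(sorted_toy["A"]) == [3, 4, 1, 2]
# The test resumes below: transpose first, then sort along axis=1.
# ---------------------------------------------------------------------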
+ result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + + if len(levels) > 1: + # Accessing multi-level columns that are not lexsorted raises a + # performance warning + with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_swaplevel.py b/pandas/tests/frame/methods/test_swaplevel.py new file mode 100644 index 0000000000000..5511ac7d6b1b2 --- /dev/null +++ b/pandas/tests/frame/methods/test_swaplevel.py @@ -0,0 +1,36 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestSwaplevel: + def test_swaplevel(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + swapped = frame["A"].swaplevel() + swapped2 = frame["A"].swaplevel(0) + swapped3 = frame["A"].swaplevel(0, 1) + swapped4 = frame["A"].swaplevel("first", "second") + assert not swapped.index.equals(frame.index) + tm.assert_series_equal(swapped, swapped2) + tm.assert_series_equal(swapped, swapped3) + tm.assert_series_equal(swapped, swapped4) + + back = swapped.swaplevel() + back2 = swapped.swaplevel(0) + back3 = swapped.swaplevel(0, 1) + back4 = swapped.swaplevel("second", "first") + assert back.index.equals(frame.index) + tm.assert_series_equal(back, back2) + tm.assert_series_equal(back, back3) + tm.assert_series_equal(back, back4) + + ft = frame.T + swapped = ft.swaplevel("first", "second", axis=1) + exp = frame.swaplevel("first", "second").T + tm.assert_frame_equal(swapped, exp) + + msg = "Can only swap levels on a hierarchical axis." + with pytest.raises(TypeError, match=msg): + DataFrame(range(3)).swaplevel() diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py similarity index 99% rename from pandas/tests/frame/test_to_csv.py rename to pandas/tests/frame/methods/test_to_csv.py index b085704e8b06f..5bf1ce508dfc4 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -133,9 +133,9 @@ def test_to_csv_from_csv4(self): with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path: # GH 10833 (TimedeltaIndex formatting) dt = pd.Timedelta(seconds=1) - df = pd.DataFrame( + df = DataFrame( {"dt_data": [i * dt for i in range(3)]}, - index=pd.Index([i * dt for i in range(3)], name="dt_index"), + index=Index([i * dt for i in range(3)], name="dt_index"), ) df.to_csv(path) @@ -1257,7 +1257,7 @@ def test_to_csv_quoting(self): # xref gh-7791: make sure the quoting parameter is passed through # with multi-indexes - df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) df = df.set_index(["a", "b"]) expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] @@ -1270,7 +1270,7 @@ def test_period_index_date_overflow(self): dates = ["1990-01-01", "2000-01-01", "3005-01-01"] index = pd.PeriodIndex(dates, freq="D") - df = pd.DataFrame([4, 5, 6], index=index) + df = DataFrame([4, 5, 6], index=index) result = df.to_csv() expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] @@ -1288,7 +1288,7 @@ def test_period_index_date_overflow(self): dates = ["1990-01-01", pd.NaT, "3005-01-01"] index = pd.PeriodIndex(dates, freq="D") - df = pd.DataFrame([4, 5, 6], index=index) + df = DataFrame([4, 5, 6], index=index) result = df.to_csv() expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] @@ -1298,7 +1298,7 @@ def test_period_index_date_overflow(self): def 
test_multi_index_header(self): # see gh-5539 columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) df.columns = columns header = ["a", "b", "c", "d"] @@ -1310,8 +1310,8 @@ def test_multi_index_header(self): def test_to_csv_single_level_multi_index(self): # see gh-26303 - index = pd.Index([(1,), (2,), (3,)]) - df = pd.DataFrame([[1, 2, 3]], columns=index) + index = Index([(1,), (2,), (3,)]) + df = DataFrame([[1, 2, 3]], columns=index) df = df.reindex(columns=[(1,), (3,)]) expected = ",1,3\n0,1,3\n" result = df.to_csv(line_terminator="\n") @@ -1319,7 +1319,7 @@ def test_to_csv_single_level_multi_index(self): def test_gz_lineend(self): # GH 25311 - df = pd.DataFrame({"a": [1, 2]}) + df = DataFrame({"a": [1, 2]}) expected_rows = ["a", "1", "2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) with tm.ensure_clean("__test_gz_lineend.csv.gz") as path: diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py new file mode 100644 index 0000000000000..436e70fbb0e56 --- /dev/null +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -0,0 +1,52 @@ +import numpy as np + +from pandas import DataFrame +from pandas.core.arrays import PandasArray + + +class TestToDictOfBlocks: + def test_copy_blocks(self, float_frame): + # GH#9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the default copy=True, change a column + blocks = df._to_dict_of_blocks(copy=True) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did not change the original DataFrame + assert not _df[column].equals(df[column]) + + def test_no_copy_blocks(self, float_frame): + # GH#9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the copy=False, change a column + blocks = df._to_dict_of_blocks(copy=False) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did change the original DataFrame + assert _df[column].equals(df[column]) + + +def test_to_dict_of_blocks_item_cache(): + # Calling to_dict_of_blocks should not poison item_cache + df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object)) + mgr = df._mgr + assert len(mgr.blocks) == 3 # i.e. 
not consolidated
+
+    ser = df["b"]  # populates item_cache["b"]
+
+    df._to_dict_of_blocks()
+
+    # Check that to_dict_of_blocks didn't break the link between ser and df
+    ser.values[0] = "foo"
+    assert df.loc[0, "b"] == "foo"
+
+    assert df["b"] is ser
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
new file mode 100644
index 0000000000000..564a659724768
--- /dev/null
+++ b/pandas/tests/frame/methods/test_values.py
@@ -0,0 +1,192 @@
+import numpy as np
+
+from pandas import DataFrame, NaT, Timestamp, date_range
+import pandas._testing as tm
+
+
+class TestDataFrameValues:
+    def test_values(self, float_frame):
+        float_frame.values[:, 0] = 5.0
+        assert (float_frame.values[:, 0] == 5).all()
+
+    def test_more_values(self, float_string_frame):
+        values = float_string_frame.values
+        assert values.shape[1] == len(float_string_frame.columns)
+
+    def test_values_mixed_dtypes(self, float_frame, float_string_frame):
+        frame = float_frame
+        arr = frame.values
+
+        frame_cols = frame.columns
+        for i, row in enumerate(arr):
+            for j, value in enumerate(row):
+                col = frame_cols[j]
+                if np.isnan(value):
+                    assert np.isnan(frame[col][i])
+                else:
+                    assert value == frame[col][i]
+
+        # mixed type
+        arr = float_string_frame[["foo", "A"]].values
+        assert arr[0, 0] == "bar"
+
+        df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
+        arr = df.values
+        assert arr[0, 0] == 1j
+
+    def test_values_duplicates(self):
+        df = DataFrame(
+            [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
+        )
+
+        result = df.values
+        expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_frame_values_with_tz(self):
+        tz = "US/Central"
+        df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
+        result = df.values
+        expected = np.array(
+            [
+                [Timestamp("2000-01-01", tz=tz)],
+                [Timestamp("2000-01-02", tz=tz)],
+                [Timestamp("2000-01-03", tz=tz)],
+                [Timestamp("2000-01-04", tz=tz)],
+            ]
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+        # two columns, homogeneous
+
+        df["B"] = df["A"]
+        result = df.values
+        expected = np.concatenate([expected, expected], axis=1)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # three columns, heterogeneous
+        est = "US/Eastern"
+        df["C"] = df["A"].dt.tz_convert(est)
+
+        new = np.array(
+            [
+                [Timestamp("2000-01-01T01:00:00", tz=est)],
+                [Timestamp("2000-01-02T01:00:00", tz=est)],
+                [Timestamp("2000-01-03T01:00:00", tz=est)],
+                [Timestamp("2000-01-04T01:00:00", tz=est)],
+            ]
+        )
+        expected = np.concatenate([expected, new], axis=1)
+        result = df.values
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_interleave_with_tzaware(self, timezone_frame):
+
+        # interleave with object
+        result = timezone_frame.assign(D="foo").values
+        expected = np.array(
+            [
+                [
+                    Timestamp("2013-01-01 00:00:00"),
+                    Timestamp("2013-01-02 00:00:00"),
+                    Timestamp("2013-01-03 00:00:00"),
+                ],
+                [
+                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
+                    NaT,
+                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
+                ],
+                [
+                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
+                    NaT,
+                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
+                ],
+                ["foo", "foo", "foo"],
+            ],
+            dtype=object,
+        ).T
+        tm.assert_numpy_array_equal(result, expected)
+
+        # interleave with only datetime64[ns]
+        result = timezone_frame.values
+        expected = np.array(
+            [
+                [
+                    Timestamp("2013-01-01 00:00:00"),
+                    Timestamp("2013-01-02 00:00:00"),
+                    Timestamp("2013-01-03 00:00:00"),
+                ],
+                [
Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + tm.assert_numpy_array_equal(result, expected) + + def test_values_interleave_non_unique_cols(self): + df = DataFrame( + [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) + + df_unique = df.copy() + df_unique.columns = ["x", "y"] + assert df_unique.values.shape == df.values.shape + tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) + tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) + + def test_values_numeric_cols(self, float_frame): + float_frame["foo"] = "bar" + + values = float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + def test_values_lcd(self, mixed_float_frame, mixed_int_frame): + + # mixed lcd + values = mixed_float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_float_frame[["A", "B", "C"]].values + assert values.dtype == np.float32 + + values = mixed_float_frame[["C"]].values + assert values.dtype == np.float16 + + # GH#10364 + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_int_frame[["A", "D"]].values + assert values.dtype == np.int64 + + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C"]].values + assert values.dtype == np.float64 + + # as B and C are both unsigned, no forcing to float is needed + values = mixed_int_frame[["B", "C"]].values + assert values.dtype == np.uint64 + + values = mixed_int_frame[["A", "C"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C", "D"]].values + assert values.dtype == np.int64 + + values = mixed_int_frame[["A"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C"]].values + assert values.dtype == np.uint8 diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index deac1792737a1..e4a0d37e3a017 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,5 +1,4 @@ from datetime import datetime -import inspect import numpy as np import pytest @@ -137,34 +136,6 @@ def test_dti_set_index_reindex(self): # Renaming - def test_reindex_api_equivalence(self): - # equivalence of the labels/axis and index/columns API's - df = DataFrame( - [[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=["a", "b", "c"], - columns=["d", "e", "f"], - ) - - res1 = df.reindex(["b", "a"]) - res2 = df.reindex(index=["b", "a"]) - res3 = df.reindex(labels=["b", "a"]) - res4 = df.reindex(labels=["b", "a"], axis=0) - res5 = df.reindex(["b", "a"], axis=0) - for res in [res2, res3, res4, res5]: - tm.assert_frame_equal(res1, res) - - res1 = df.reindex(columns=["e", "d"]) - res2 = df.reindex(["e", "d"], axis=1) - res3 = df.reindex(labels=["e", "d"], axis=1) - for res in [res2, res3]: - tm.assert_frame_equal(res1, res) - - res1 = df.reindex(index=["b", "a"], columns=["e", "d"]) - res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) - res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) - for res in [res2, res3]: - tm.assert_frame_equal(res1, res) - def test_assign_columns(self, float_frame): float_frame["hi"] = "there" @@ -173,38 +144,6 @@ 
def test_assign_columns(self, float_frame): tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) - def test_rename_signature(self): - sig = inspect.signature(DataFrame.rename) - parameters = set(sig.parameters) - assert parameters == { - "self", - "mapper", - "index", - "columns", - "axis", - "inplace", - "copy", - "level", - "errors", - } - - def test_reindex_signature(self): - sig = inspect.signature(DataFrame.reindex) - parameters = set(sig.parameters) - assert parameters == { - "self", - "labels", - "index", - "columns", - "axis", - "limit", - "copy", - "level", - "method", - "fill_value", - "tolerance", - } - class TestIntervalIndex: def test_setitem(self): @@ -256,21 +195,6 @@ def test_set_reset_index(self): class TestCategoricalIndex: - def test_set_index_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame( - { - "A": [1, 2, 1, 1, 2], - "B": [10, 16, 22, 28, 34], - "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), - "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), - } - ) - for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: - result = df.set_index(cols).reset_index() - result = result.reindex(columns=df.columns) - tm.assert_frame_equal(result, df) - @pytest.mark.parametrize( "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) ) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ee136533b0775..f2847315f4959 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1,6 +1,5 @@ from datetime import timedelta from decimal import Decimal -import operator import numpy as np import pytest @@ -132,9 +131,9 @@ def wrapper(x): r1 = getattr(all_na, opname)(axis=1) if opname in ["sum", "prod"]: unit = 1 if opname == "prod" else 0 # result for empty sum/prod - expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) + expected = Series(unit, index=r0.index, dtype=r0.dtype) tm.assert_series_equal(r0, expected) - expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) + expected = Series(unit, index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) @@ -464,10 +463,10 @@ def test_nunique(self): @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 - df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) + df = DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) with tm.assert_produces_warning(FutureWarning): result = df.mean() - expected = pd.Series([1.0], index=["A"]) + expected = Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -475,11 +474,11 @@ def test_mean_excludes_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. 
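# ---------------------------------------------------------------------
# A short, hedged illustration (toy frame, not from the suite) of the
# exclusion rule these reduction tests pin down: datetime-like columns
# are dropped from mean() unless explicitly included.
import pandas as pd

frame = pd.DataFrame({"x": [1.0, 3.0], "t": pd.to_datetime(["2000", "2001"])})
assert list(frame.mean(numeric_only=True).index) == ["x"]
# Below: the all-datetime case, where the default mean() comes back empty.
# ---------------------------------------------------------------------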
- df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) + df = DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) with tm.assert_produces_warning(FutureWarning): result = df.mean() - expected = pd.Series(dtype=np.float64) + expected = Series(dtype=np.float64) tm.assert_series_equal(result, expected) def test_mean_mixed_string_decimal(self): @@ -499,10 +498,10 @@ def test_mean_mixed_string_decimal(self): {"A": 5, "B": None, "C": Decimal("1223.00")}, ] - df = pd.DataFrame(d) + df = DataFrame(d) result = df.mean() - expected = pd.Series([2.7, 681.6], index=["A", "C"]) + expected = Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) def test_var_std(self, datetime_frame): @@ -754,6 +753,22 @@ def test_operators_timedelta64(self): assert df["off1"].dtype == "timedelta64[ns]" assert df["off2"].dtype == "timedelta64[ns]" + def test_std_timedelta64_skipna_false(self): + # GH#37392 + tdi = pd.timedelta_range("1 Day", periods=10) + df = DataFrame({"A": tdi, "B": tdi}) + df.iloc[-2, -1] = pd.NaT + + result = df.std(skipna=False) + expected = Series( + [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]" + ) + tm.assert_series_equal(result, expected) + + result = df.std(axis=1, skipna=False) + expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)]) + tm.assert_series_equal(result, expected) + def test_sum_corner(self): empty_frame = DataFrame() @@ -767,47 +782,45 @@ def test_sum_corner(self): @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) def test_sum_prod_nanops(self, method, unit): idx = ["a", "b", "c"] - df = pd.DataFrame( - {"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]} - ) + df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) # The default result = getattr(df, method) - expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") + expected = Series([unit, unit, unit], index=idx, dtype="float64") # min_count=1 result = getattr(df, method)(min_count=1) - expected = pd.Series([unit, unit, np.nan], index=idx) + expected = Series([unit, unit, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count=0 result = getattr(df, method)(min_count=0) - expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") + expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) result = getattr(df.iloc[1:], method)(min_count=1) - expected = pd.Series([unit, np.nan, np.nan], index=idx) + expected = Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count > 1 - df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) + df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) result = getattr(df, method)(min_count=5) - expected = pd.Series(result, index=["A", "B"]) + expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) result = getattr(df, method)(min_count=6) - expected = pd.Series(result, index=["A", "B"]) + expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) def test_sum_nanops_timedelta(self): # prod isn't defined on timedeltas idx = ["a", "b", "c"] - df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) + df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) df2 = df.apply(pd.to_timedelta) # 0 by default result = df2.sum() - expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx) + expected = Series([0, 0, 0], dtype="m8[ns]", index=idx) 
tm.assert_series_equal(result, expected) # min_count=0 @@ -816,7 +829,7 @@ def test_sum_nanops_timedelta(self): # min_count=1 result = df2.sum(min_count=1) - expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx) + expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) def test_sum_object(self, float_frame): @@ -833,12 +846,12 @@ def test_sum_bool(self, float_frame): def test_sum_mixed_datetime(self): # GH#30886 - df = pd.DataFrame( + df = DataFrame( {"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]} ).reindex([2, 3, 4]) result = df.sum() - expected = pd.Series({"B": 7.0}) + expected = Series({"B": 7.0}) tm.assert_series_equal(result, expected) def test_mean_corner(self, float_frame, float_string_frame): @@ -862,7 +875,7 @@ def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True - df = pd.DataFrame( + df = DataFrame( { "A": np.arange(3), "B": pd.date_range("2016-01-01", periods=3), @@ -871,17 +884,17 @@ def test_mean_datetimelike(self): } ) result = df.mean(numeric_only=True) - expected = pd.Series({"A": 1.0}) + expected = Series({"A": 1.0}) tm.assert_series_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # in the future datetime columns will be included result = df.mean() - expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) + expected = Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) def test_mean_datetimelike_numeric_only_false(self): - df = pd.DataFrame( + df = DataFrame( { "A": np.arange(3), "B": pd.date_range("2016-01-01", periods=3), @@ -891,7 +904,7 @@ def test_mean_datetimelike_numeric_only_false(self): # datetime(tz) and timedelta work result = df.mean(numeric_only=False) - expected = pd.Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) + expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) # mean of period is not allowed @@ -903,9 +916,9 @@ def test_mean_datetimelike_numeric_only_false(self): def test_mean_extensionarray_numeric_only_true(self): # https://github.com/pandas-dev/pandas/issues/33256 arr = np.random.randint(1000, size=(10, 5)) - df = pd.DataFrame(arr, dtype="Int64") + df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) - expected = pd.DataFrame(arr).mean() + expected = DataFrame(arr).mean() tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1056,28 +1069,28 @@ def test_any_all_bool_only(self): (np.any, {"A": [False, False], "B": [False, True]}, True), (np.all, {"A": [False, False], "B": [False, True]}, False), # other types - (np.all, {"A": pd.Series([0.0, 1.0], dtype="float")}, False), - (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), - (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), - (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), - pytest.param(np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False), - pytest.param(np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True), - pytest.param(np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True), - pytest.param(np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True), - pytest.param(np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False), - pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True), - pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), - pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), - 
(np.all, {"A": pd.Series([0, 1], dtype="category")}, False), - (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), - (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), - (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), + (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False), + (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True), + (np.all, {"A": Series([0, 1], dtype=int)}, False), + (np.any, {"A": Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False), + pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False), + pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), + pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), + pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), + (np.all, {"A": Series([0, 1], dtype="category")}, False), + (np.any, {"A": Series([0, 1], dtype="category")}, True), + (np.all, {"A": Series([1, 2], dtype="category")}, True), + (np.any, {"A": Series([1, 2], dtype="category")}, True), # Mix GH#21484 pytest.param( np.all, { - "A": pd.Series([10, 20], dtype="M8[ns]"), - "B": pd.Series([10, 20], dtype="m8[ns]"), + "A": Series([10, 20], dtype="M8[ns]"), + "B": Series([10, 20], dtype="m8[ns]"), }, True, ), @@ -1115,82 +1128,6 @@ def test_any_all_level_axis_none_raises(self, method): with pytest.raises(ValueError, match=xpr): getattr(df, method)(axis=None, level="out") - # --------------------------------------------------------------------- - # Matrix-like - - def test_matmul(self): - # matmul test is for GH 10259 - a = DataFrame( - np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] - ) - b = DataFrame( - np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] - ) - - # DataFrame @ DataFrame - result = operator.matmul(a, b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] - ) - tm.assert_frame_equal(result, expected) - - # DataFrame @ Series - result = operator.matmul(a, b.one) - expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"]) - tm.assert_series_equal(result, expected) - - # np.array @ DataFrame - result = operator.matmul(a.values, b) - assert isinstance(result, DataFrame) - assert result.columns.equals(b.columns) - assert result.index.equals(pd.Index(range(3))) - expected = np.dot(a.values, b.values) - tm.assert_almost_equal(result.values, expected) - - # nested list @ DataFrame (__rmatmul__) - result = operator.matmul(a.values.tolist(), b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] - ) - tm.assert_almost_equal(result.values, expected.values) - - # mixed dtype DataFrame @ DataFrame - a["q"] = a.q.round().astype(int) - result = operator.matmul(a, b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] - ) - tm.assert_frame_equal(result, expected) - - # different dtypes DataFrame @ DataFrame - a = a.astype(int) - result = operator.matmul(a, b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] - ) - tm.assert_frame_equal(result, expected) - - # unaligned - df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), 
index=range(5), columns=[1, 2, 3]) - - with pytest.raises(ValueError, match="aligned"): - operator.matmul(df, df2) - - def test_matmul_message_shapes(self): - # GH#21581 exception message should reflect original shapes, - # not transposed shapes - a = np.random.rand(10, 4) - b = np.random.rand(5, 3) - - df = DataFrame(b) - - msg = r"shapes \(10, 4\) and \(5, 3\) not aligned" - with pytest.raises(ValueError, match=msg): - a @ df - with pytest.raises(ValueError, match=msg): - a.tolist() @ df - # --------------------------------------------------------------------- # Unsorted @@ -1211,34 +1148,59 @@ def test_series_broadcasting(self): class TestDataFrameReductions: def test_min_max_dt64_with_NaT(self): # Both NaT and Timestamp are in DataFrame. - df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) + df = DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) res = df.min() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + exp = Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + exp = Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) # GH12941, only NaTs are in DataFrame. - df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) + df = DataFrame({"foo": [pd.NaT, pd.NaT]}) res = df.min() - exp = pd.Series([pd.NaT], index=["foo"]) + exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() - exp = pd.Series([pd.NaT], index=["foo"]) + exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): + # GH#36907 + tz = tz_naive_fixture + df = DataFrame( + { + "a": [ + Timestamp("2020-01-01 08:00:00", tz=tz), + Timestamp("1920-02-01 09:00:00", tz=tz), + ], + "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + } + ) + + res = df.min(axis=1, skipna=False) + expected = Series([df.loc[0, "a"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + + res = df.max(axis=1, skipna=False) + expected = Series([df.loc[0, "b"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in # min/max calls on empty Series/DataFrames. See GH:33704 for more # information - df = pd.DataFrame(dict(x=pd.to_datetime([]))) - expected_dt_series = pd.Series(pd.to_datetime([])) + df = DataFrame(dict(x=pd.to_datetime([]))) + expected_dt_series = Series(pd.to_datetime([])) # check axis 0 assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT) assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT) @@ -1250,8 +1212,8 @@ def test_min_max_dt64_api_consistency_with_NaT(self): def test_min_max_dt64_api_consistency_empty_df(self): # check DataFrame/Series api consistency when calling min/max on an empty # DataFrame/Series. 
- df = pd.DataFrame(dict(x=[])) - expected_float_series = pd.Series([], dtype=float) + df = DataFrame(dict(x=[])) + expected_float_series = Series([], dtype=float) # check axis 0 assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) @@ -1272,13 +1234,73 @@ def test_preserve_timezone(self, initial: str, method): result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + def test_frame_any_all_with_level(self): + df = DataFrame( + {"data": [False, False, True, False, True, False, True]}, + index=[ + ["one", "one", "two", "one", "two", "two", "two"], + [0, 1, 0, 2, 1, 2, 3], + ], + ) + + result = df.any(level=0) + ex = DataFrame({"data": [False, True]}, index=["one", "two"]) + tm.assert_frame_equal(result, ex) + + result = df.all(level=0) + ex = DataFrame({"data": [False, False]}, index=["one", "two"]) + tm.assert_frame_equal(result, ex) + + def test_frame_any_with_timedelta(self): + # GH#17667 + df = DataFrame( + { + "a": Series([0, 0]), + "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]), + } + ) + + result = df.any(axis=0) + expected = Series(data=[False, True], index=["a", "t"]) + tm.assert_series_equal(result, expected) + + result = df.any(axis=1) + expected = Series(data=[False, True]) + tm.assert_series_equal(result, expected) + + +def test_sum_timedelta64_skipna_false(): + # GH#17235 + arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) + arr[-1, -1] = "Nat" + + df = DataFrame(arr) + + result = df.sum(skipna=False) + expected = Series([pd.Timedelta(seconds=12), pd.NaT]) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=0, skipna=False) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=1, skipna=False) + expected = Series( + [ + pd.Timedelta(seconds=1), + pd.Timedelta(seconds=5), + pd.Timedelta(seconds=9), + pd.NaT, + ] + ) + tm.assert_series_equal(result, expected) + def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 - df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = DataFrame([["a", 1]], columns=list("ab")) df = df.astype({"b": "Int64"}) result = df.sum() - expected = pd.Series(["a", 1], index=["a", "b"]) + expected = Series(["a", 1], index=["a", "b"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index f5d1808f367e7..34d56672e5536 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -39,13 +39,6 @@ def test_getitem_pop_assign_name(self, float_frame): s2 = s.loc[:] assert s2.name == "B" - def test_get_value(self, float_frame): - for idx in float_frame.index: - for col in float_frame.columns: - result = float_frame._get_value(idx, col) - expected = float_frame[col][idx] - tm.assert_almost_equal(result, expected) - def test_add_prefix_suffix(self, float_frame): with_prefix = float_frame.add_prefix("foo#") expected = pd.Index([f"foo#{c}" for c in float_frame.columns]) @@ -102,14 +95,14 @@ def test_column_contains_raises(self, float_frame): def test_tab_completion(self): # DataFrame whose columns are identifiers shall have them in __dir__. 
- df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD")) + df = DataFrame([list("abcd"), list("efgh")], columns=list("ABCD")) for key in list("ABCD"): assert key in dir(df) assert isinstance(df.__getitem__("A"), pd.Series) # DataFrame whose first-level columns are identifiers shall have # them in __dir__. - df = pd.DataFrame( + df = DataFrame( [list("abcd"), list("efgh")], columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))), ) @@ -315,54 +308,33 @@ def test_sequence_like_with_categorical(self): def test_len(self, float_frame): assert len(float_frame) == len(float_frame.index) - def test_values_mixed_dtypes(self, float_frame, float_string_frame): - frame = float_frame - arr = frame.values - - frame_cols = frame.columns - for i, row in enumerate(arr): - for j, value in enumerate(row): - col = frame_cols[j] - if np.isnan(value): - assert np.isnan(frame[col][i]) - else: - assert value == frame[col][i] - - # mixed type - arr = float_string_frame[["foo", "A"]].values - assert arr[0, 0] == "bar" - - df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) - arr = df.values - assert arr[0, 0] == 1j - # single block corner case arr = float_frame[["A", "B"]].values expected = float_frame.reindex(columns=["A", "B"]).values tm.assert_almost_equal(arr, expected) def test_to_numpy(self): - df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) expected = np.array([[1, 3], [2, 4.5]]) result = df.to_numpy() tm.assert_numpy_array_equal(result, expected) def test_to_numpy_dtype(self): - df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) expected = np.array([[1, 3], [2, 4]], dtype="int64") result = df.to_numpy(dtype="int64") tm.assert_numpy_array_equal(result, expected) def test_to_numpy_copy(self): arr = np.random.randn(4, 3) - df = pd.DataFrame(arr) + df = DataFrame(arr) assert df.values.base is arr assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 - df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) + df = DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) result = df.to_numpy(dtype=str) expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) tm.assert_numpy_array_equal(result, expected) @@ -394,18 +366,6 @@ def test_class_axis(self): assert pydoc.getdoc(DataFrame.index) assert pydoc.getdoc(DataFrame.columns) - def test_more_values(self, float_string_frame): - values = float_string_frame.values - assert values.shape[1] == len(float_string_frame.columns) - - def test_repr_with_mi_nat(self, float_string_frame): - df = DataFrame( - {"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]] - ) - result = repr(df) - expected = " X\nNaT a 1\n2013-01-01 b 2" - assert result == expected - def test_items_names(self, float_string_frame): for k, v in float_string_frame.items(): assert v.name == k @@ -447,10 +407,6 @@ def test_with_datetimelikes(self): expected = Series({np.dtype("object"): 10}) tm.assert_series_equal(result, expected) - def test_values(self, float_frame): - float_frame.values[:, 0] = 5.0 - assert (float_frame.values[:, 0] == 5).all() - def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) series = cp["A"] @@ -529,7 +485,7 @@ async def test_tab_complete_warning(self, ip): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter - code = "import 
pandas as pd; df = pd.DataFrame()" + code = "from pandas import DataFrame; df = DataFrame()" await ip.run_code(code) # TODO: remove it when Ipython updates @@ -547,7 +503,7 @@ async def test_tab_complete_warning(self, ip): list(ip.Completer.completions("df.", 1)) def test_attrs(self): - df = pd.DataFrame({"A": [2, 3]}) + df = DataFrame({"A": [2, 3]}) assert df.attrs == {} df.attrs["version"] = 1 @@ -556,7 +512,7 @@ def test_attrs(self): @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags(self, allows_duplicate_labels): - df = pd.DataFrame({"A": [1, 2]}) + df = DataFrame({"A": [1, 2]}) result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) if allows_duplicate_labels is None: # We don't update when it's not provided diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 94f813fd08128..0ee74beea4858 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -14,6 +14,32 @@ from pandas.core.computation.expressions import _MIN_ELEMENTS, NUMEXPR_INSTALLED from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int + +class DummyElement: + def __init__(self, value, dtype): + self.value = value + self.dtype = np.dtype(dtype) + + def __array__(self): + return np.array(self.value, dtype=self.dtype) + + def __str__(self) -> str: + return f"DummyElement({self.value}, {self.dtype})" + + def __repr__(self) -> str: + return str(self) + + def astype(self, dtype, copy=False): + self.dtype = dtype + return self + + def view(self, dtype): + return type(self)(self.value.view(dtype), dtype) + + def any(self, axis=None): + return bool(self.value) + + # ------------------------------------------------------------------- # Comparisons @@ -23,7 +49,7 @@ class TestFrameComparisons: def test_frame_in_list(self): # GH#12689 this should raise at the DataFrame level, not blocks - df = pd.DataFrame(np.random.randn(6, 4), columns=list("ABCD")) + df = DataFrame(np.random.randn(6, 4), columns=list("ABCD")) msg = "The truth value of a DataFrame is ambiguous" with pytest.raises(ValueError, match=msg): df in [None] @@ -35,7 +61,7 @@ def check(df, df2): # we expect the result to match Series comparisons for # == and !=, inequalities should raise result = x == y - expected = pd.DataFrame( + expected = DataFrame( {col: x[col] == y[col] for col in x.columns}, index=x.index, columns=x.columns, @@ -43,7 +69,7 @@ def check(df, df2): tm.assert_frame_equal(result, expected) result = x != y - expected = pd.DataFrame( + expected = DataFrame( {col: x[col] != y[col] for col in x.columns}, index=x.index, columns=x.columns, @@ -71,15 +97,15 @@ def check(df, df2): # GH4968 # invalid date/int comparisons - df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"]) + df = DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"]) df["dates"] = pd.date_range("20010101", periods=len(df)) df2 = df.copy() df2["dates"] = df["a"] check(df, df2) - df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame( + df = DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"]) + df2 = DataFrame( { "a": pd.date_range("20010101", periods=len(df)), "b": pd.date_range("20100101", periods=len(df)), @@ -90,7 +116,7 @@ def check(df, df2): def test_timestamp_compare(self): # make sure we can compare Timestamps on the right AND left hand side # GH#4982 - df = pd.DataFrame( + df = DataFrame( { "dates1": pd.date_range("20010101", periods=10), "dates2": 
pd.date_range("20010102", periods=10), @@ -129,8 +155,8 @@ def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError # (this appears to be fixed before GH#22163, not sure when) - df = pd.DataFrame([["1989-08-01", 1], ["1989-08-01", 2]]) - other = pd.DataFrame([["a", "b"], ["c", "d"]]) + df = DataFrame([["1989-08-01", 1], ["1989-08-01", 2]]) + other = DataFrame([["a", "b"], ["c", "d"]]) result = df == other assert not result.any().any() @@ -142,9 +168,9 @@ def test_df_boolean_comparison_error(self): # GH#4576, GH#22880 # comparing DataFrame against list/tuple with len(obj) matching # len(df.columns) is supported as of GH#22800 - df = pd.DataFrame(np.arange(6).reshape((3, 2))) + df = DataFrame(np.arange(6).reshape((3, 2))) - expected = pd.DataFrame([[False, False], [True, False], [False, False]]) + expected = DataFrame([[False, False], [True, False], [False, False]]) result = df == (2, 2) tm.assert_frame_equal(result, expected) @@ -153,15 +179,13 @@ def test_df_boolean_comparison_error(self): tm.assert_frame_equal(result, expected) def test_df_float_none_comparison(self): - df = pd.DataFrame( - np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"] - ) + df = DataFrame(np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"]) result = df.__eq__(None) assert not result.any().any() def test_df_string_comparison(self): - df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) + df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) mask_a = df.a > 1 tm.assert_frame_equal(df[mask_a], df.loc[1:1, :]) tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :]) @@ -176,8 +200,8 @@ class TestFrameFlexComparisons: def test_bool_flex_frame(self): data = np.random.randn(5, 3) other_data = np.random.randn(5, 3) - df = pd.DataFrame(data) - other = pd.DataFrame(other_data) + df = DataFrame(data) + other = DataFrame(other_data) ndim_5 = np.ones(df.shape + (1, 3)) # Unaligned @@ -212,12 +236,12 @@ def _test_seq(df, idx_ser, col_ser): col_eq = df.eq(col_ser) idx_ne = df.ne(idx_ser, axis=0) col_ne = df.ne(col_ser) - tm.assert_frame_equal(col_eq, df == pd.Series(col_ser)) + tm.assert_frame_equal(col_eq, df == Series(col_ser)) tm.assert_frame_equal(col_eq, -col_ne) tm.assert_frame_equal(idx_eq, -idx_ne) tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T) tm.assert_frame_equal(col_eq, df.eq(list(col_ser))) - tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0)) + tm.assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0)) tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0)) idx_gt = df.gt(idx_ser, axis=0) @@ -225,7 +249,7 @@ def _test_seq(df, idx_ser, col_ser): idx_le = df.le(idx_ser, axis=0) col_le = df.le(col_ser) - tm.assert_frame_equal(col_gt, df > pd.Series(col_ser)) + tm.assert_frame_equal(col_gt, df > Series(col_ser)) tm.assert_frame_equal(col_gt, -col_le) tm.assert_frame_equal(idx_gt, -idx_le) tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T) @@ -234,13 +258,13 @@ def _test_seq(df, idx_ser, col_ser): col_ge = df.ge(col_ser) idx_lt = df.lt(idx_ser, axis=0) col_lt = df.lt(col_ser) - tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser)) + tm.assert_frame_equal(col_ge, df >= Series(col_ser)) tm.assert_frame_equal(col_ge, -col_lt) tm.assert_frame_equal(idx_ge, -idx_lt) tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T) - idx_ser = pd.Series(np.random.randn(5)) - col_ser = pd.Series(np.random.randn(3)) + idx_ser = Series(np.random.randn(5)) + col_ser = Series(np.random.randn(3)) _test_seq(df, idx_ser, col_ser) # 
list/tuple @@ -265,8 +289,8 @@ def test_bool_flex_frame_complex_dtype(self): # complex arr = np.array([np.nan, 1, 6, np.nan]) arr2 = np.array([2j, np.nan, 7, None]) - df = pd.DataFrame({"a": arr}) - df2 = pd.DataFrame({"a": arr2}) + df = DataFrame({"a": arr}) + df2 = DataFrame({"a": arr2}) msg = "|".join( [ @@ -288,7 +312,7 @@ def test_bool_flex_frame_complex_dtype(self): assert rs.values.all() arr3 = np.array([2j, np.nan, None]) - df3 = pd.DataFrame({"a": arr3}) + df3 = DataFrame({"a": arr3}) with pytest.raises(TypeError, match=msg): # inequalities are not well-defined for complex numbers @@ -302,16 +326,16 @@ def test_bool_flex_frame_complex_dtype(self): def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = pd.DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = pd.DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) result = df1.ne(df2) - exp = pd.DataFrame({"col": [False, True, False]}) + exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) def test_flex_comparison_nat(self): # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT, # and _definitely_ not be NaN - df = pd.DataFrame([pd.NaT]) + df = DataFrame([pd.NaT]) result = df == pd.NaT # result.iloc[0, 0] is a np.bool_ object @@ -329,33 +353,33 @@ def test_flex_comparison_nat(self): @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types(self, opname): # GH 15077, non-empty DataFrame - df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) + df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 result = getattr(df, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): # GH 15077 empty DataFrame - df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) + df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 empty = df.iloc[:0] result = getattr(empty, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) + tm.assert_series_equal(result, Series([2], index=[np.dtype(bool)])) def test_df_flex_cmp_ea_dtype_with_ndarray_series(self): ii = pd.IntervalIndex.from_breaks([1, 2, 3]) - df = pd.DataFrame({"A": ii, "B": ii}) + df = DataFrame({"A": ii, "B": ii}) - ser = pd.Series([0, 0]) + ser = Series([0, 0]) res = df.eq(ser, axis=0) - expected = pd.DataFrame({"A": [False, False], "B": [False, False]}) + expected = DataFrame({"A": [False, False], "B": [False, False]}) tm.assert_frame_equal(res, expected) - ser2 = pd.Series([1, 2], index=["A", "B"]) + ser2 = Series([1, 2], index=["A", "B"]) res2 = df.eq(ser2, axis=1) tm.assert_frame_equal(res2, expected) @@ -368,12 +392,12 @@ class TestFrameFlexArithmetic: def test_floordiv_axis0(self): # make sure we df.floordiv(ser, axis=0) matches column-wise result arr = np.arange(3) - ser = pd.Series(arr) - df = pd.DataFrame({"A": ser, "B": ser}) + ser = Series(arr) + df = DataFrame({"A": ser, "B": ser}) result = df.floordiv(ser, axis=0) - expected = pd.DataFrame({col: df[col] // ser for col in df.columns}) + expected = DataFrame({col: df[col] // ser for col in df.columns}) tm.assert_frame_equal(result, expected) @@ -387,13 +411,13 @@ def 
test_floordiv_axis0_numexpr_path(self, opname): op = getattr(operator, opname) arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 - df = pd.DataFrame(arr) + df = DataFrame(arr) df["C"] = 1.0 ser = df[0] result = getattr(df, opname)(ser, axis=0) - expected = pd.DataFrame({col: op(df[col], ser) for col in df.columns}) + expected = DataFrame({col: op(df[col], ser) for col in df.columns}) tm.assert_frame_equal(result, expected) result2 = getattr(df, opname)(ser.values, axis=0) @@ -403,25 +427,25 @@ def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) tdi = pd.timedelta_range("1", periods=10) - tser = pd.Series(tdi) - df = pd.DataFrame({0: dti, 1: tdi}) + tser = Series(tdi) + df = DataFrame({0: dti, 1: tdi}) result = df.add(tser, axis=0) - expected = pd.DataFrame({0: dti + tdi, 1: tdi + tdi}) + expected = DataFrame({0: dti + tdi, 1: tdi + tdi}) tm.assert_frame_equal(result, expected) def test_df_add_flex_filled_mixed_dtypes(self): # GH 19611 dti = pd.date_range("2016-01-01", periods=3) - ser = pd.Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]") - df = pd.DataFrame({"A": dti, "B": ser}) - other = pd.DataFrame({"A": ser, "B": ser}) + ser = Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]") + df = DataFrame({"A": dti, "B": ser}) + other = DataFrame({"A": ser, "B": ser}) fill = pd.Timedelta(days=1).to_timedelta64() result = df.add(other, fill_value=fill) - expected = pd.DataFrame( + expected = DataFrame( { - "A": pd.Series( + "A": Series( ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]" ), "B": ser * 2, @@ -531,22 +555,22 @@ def test_arith_flex_series(self, simple_frame): tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T) # broadcasting issue in GH 7325 - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64") - expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) + df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64") + expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64") - expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) + df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64") + expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) def test_arith_flex_zero_len_raises(self): # GH 19522 passing fill_value to frame flex arith methods should # raise even in the zero-length special cases - ser_len0 = pd.Series([], dtype=object) - df_len0 = pd.DataFrame(columns=["A", "B"]) - df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + ser_len0 = Series([], dtype=object) + df_len0 = DataFrame(columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) with pytest.raises(NotImplementedError, match="fill_value"): df.add(ser_len0, fill_value="E") @@ -557,7 +581,7 @@ def test_arith_flex_zero_len_raises(self): def test_flex_add_scalar_fill_value(self): # GH#12723 dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") - df = pd.DataFrame({"foo": dat}, index=range(6)) + df = DataFrame({"foo": dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) @@ -568,22 +592,22 @@ class TestFrameArithmetic: def test_td64_op_nat_casting(self): # Make sure we don't accidentally treat timedelta64(NaT) as 
datetime64 # when calling dispatch_to_series in DataFrame arithmetic - ser = pd.Series(["NaT", "NaT"], dtype="timedelta64[ns]") - df = pd.DataFrame([[1, 2], [3, 4]]) + ser = Series(["NaT", "NaT"], dtype="timedelta64[ns]") + df = DataFrame([[1, 2], [3, 4]]) result = df * ser - expected = pd.DataFrame({0: ser, 1: ser}) + expected = DataFrame({0: ser, 1: ser}) tm.assert_frame_equal(result, expected) def test_df_add_2d_array_rowlike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) + df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) - expected = pd.DataFrame( + expected = DataFrame( [[2, 4], [4, 6], [6, 8]], columns=df.columns, index=df.index, @@ -599,12 +623,12 @@ def test_df_add_2d_array_rowlike_broadcasts(self): def test_df_add_2d_array_collike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) + df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) - expected = pd.DataFrame( + expected = DataFrame( [[1, 2], [5, 6], [9, 10]], columns=df.columns, index=df.index, @@ -622,7 +646,7 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) + df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) @@ -633,7 +657,7 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): getattr(df.loc["C"], opname)(rowlike.squeeze()), ] - expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) + expected = DataFrame(exvals, columns=df.columns, index=df.index) result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) @@ -643,7 +667,7 @@ def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) + df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) @@ -659,7 +683,7 @@ def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): # DataFrame op will return all-float. 
So we upcast `expected` dtype = np.common_type(*[x.values for x in exvals.values()]) - expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype) + expected = DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype) result = getattr(df, opname)(collike) tm.assert_frame_equal(result, expected) @@ -667,7 +691,7 @@ def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): def test_df_bool_mul_int(self): # GH 22047, GH 22163 multiplication by 1 should result in int dtype, # not object dtype - df = pd.DataFrame([[False, True], [False, False]]) + df = DataFrame([[False, True], [False, False]]) result = df * 1 # On appveyor this comes back as np.int32 instead of np.int64, @@ -681,14 +705,14 @@ def test_df_bool_mul_int(self): def test_arith_mixed(self): - left = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) + left = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) result = left + left - expected = pd.DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]}) + expected = DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]}) tm.assert_frame_equal(result, expected) def test_arith_getitem_commute(self): - df = pd.DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) + df = DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) def _test_op(df, op): result = op(df, 1) @@ -723,35 +747,35 @@ def _test_op(df, op): ) def test_arith_alignment_non_pandas_object(self, values): # GH#17901 - df = pd.DataFrame({"A": [1, 1], "B": [1, 1]}) - expected = pd.DataFrame({"A": [2, 2], "B": [3, 3]}) + df = DataFrame({"A": [1, 1], "B": [1, 1]}) + expected = DataFrame({"A": [2, 2], "B": [3, 3]}) result = df + values tm.assert_frame_equal(result, expected) def test_arith_non_pandas_object(self): - df = pd.DataFrame( + df = DataFrame( np.arange(1, 10, dtype="f8").reshape(3, 3), columns=["one", "two", "three"], index=["a", "b", "c"], ) val1 = df.xs("a").values - added = pd.DataFrame(df.values + val1, index=df.index, columns=df.columns) + added = DataFrame(df.values + val1, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val1, added) - added = pd.DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) + added = DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val1, axis=0), added) val2 = list(df["two"]) - added = pd.DataFrame(df.values + val2, index=df.index, columns=df.columns) + added = DataFrame(df.values + val2, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val2, added) - added = pd.DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) + added = DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val2, axis="index"), added) val3 = np.random.rand(*df.shape) - added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns) + added = DataFrame(df.values + val3, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val3), added) def test_operations_with_interval_categories_index(self, all_arithmetic_operators): @@ -759,15 +783,15 @@ def test_operations_with_interval_categories_index(self, all_arithmetic_operator op = all_arithmetic_operators ind = pd.CategoricalIndex(pd.interval_range(start=0.0, end=2.0)) data = [1, 2] - df = pd.DataFrame([data], columns=ind) + df = DataFrame([data], columns=ind) num = 10 result = getattr(df, op)(num) - expected = pd.DataFrame([[getattr(n, op)(num) for n in data]], columns=ind) + expected = DataFrame([[getattr(n, op)(num) for n in data]], columns=ind) 
         tm.assert_frame_equal(result, expected)
 
     def test_frame_with_frame_reindex(self):
         # GH#31623
-        df = pd.DataFrame(
+        df = DataFrame(
             {
                 "foo": [pd.Timestamp("2019"), pd.Timestamp("2020")],
                 "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")],
@@ -778,42 +802,113 @@
 
         result = df - df2
 
-        expected = pd.DataFrame(
+        expected = DataFrame(
             {"foo": [pd.Timedelta(0), pd.Timedelta(0)], "bar": [np.nan, np.nan]},
             columns=["bar", "foo"],
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "value, dtype",
+        [
+            (1, "i8"),
+            (1.0, "f8"),
+            (2 ** 63, "f8"),
+            (1j, "complex128"),
+            (2 ** 63, "complex128"),
+            (True, "bool"),
+            (np.timedelta64(20, "ns"), "<m8[ns]"),
+            (np.datetime64(20, "ns"), "<M8[ns]"),
 
         expected = float_frame.values.copy()
         expected[expected > 1] = 2
 
         float_frame[float_frame > 1] = 2
         tm.assert_almost_equal(expected, float_frame.values)
 
-    def test_values_numeric_cols(self, float_frame):
-        float_frame["foo"] = "bar"
-
-        values = float_frame[["A", "B", "C", "D"]].values
-        assert values.dtype == np.float64
-
-    def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
-
-        # mixed lcd
-        values = mixed_float_frame[["A", "B", "C", "D"]].values
-        assert values.dtype == np.float64
-
-        values = mixed_float_frame[["A", "B", "C"]].values
-        assert values.dtype == np.float32
-
-        values = mixed_float_frame[["C"]].values
-        assert values.dtype == np.float16
-
-        # GH 10364
-        # B uint64 forces float because there are other signed int types
-        values = mixed_int_frame[["A", "B", "C", "D"]].values
-        assert values.dtype == np.float64
-
-        values = mixed_int_frame[["A", "D"]].values
-        assert values.dtype == np.int64
-
-        # B uint64 forces float because there are other signed int types
-        values = mixed_int_frame[["A", "B", "C"]].values
-        assert values.dtype == np.float64
-
-        # as B and C are both unsigned, no forcing to float is needed
-        values = mixed_int_frame[["B", "C"]].values
-        assert values.dtype == np.uint64
-
-        values = mixed_int_frame[["A", "C"]].values
-        assert values.dtype == np.int32
-
-        values = mixed_int_frame[["C", "D"]].values
-        assert values.dtype == np.int64
-
-        values = mixed_int_frame[["A"]].values
-        assert values.dtype == np.int32
-
-        values = mixed_int_frame[["C"]].values
-        assert values.dtype == np.uint8
-
     def test_constructor_with_convert(self):
         # this is actually mostly a test of lib.maybe_convert_objects
         # #2845
@@ -300,47 +254,6 @@ def f(dtype):
         if not compat.is_platform_windows():
             f("M8[ns]")
 
-    def test_equals_different_blocks(self):
-        # GH 9330
-        df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
-        df1 = df0.reset_index()[["A", "B", "C"]]
-        # this assert verifies that the above operations have
-        # induced a block rearrangement
-        assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
-
-        # do the real tests
-        tm.assert_frame_equal(df0, df1)
-        assert df0.equals(df1)
-        assert df1.equals(df0)
-
-    def test_copy_blocks(self, float_frame):
-        # API/ENH 9607
-        df = DataFrame(float_frame, copy=True)
-        column = df.columns[0]
-
-        # use the default copy=True, change a column
-        blocks = df._to_dict_of_blocks(copy=True)
-        for dtype, _df in blocks.items():
-            if column in _df:
-                _df.loc[:, column] = _df[column] + 1
-
-        # make sure we did not change the original DataFrame
-        assert not _df[column].equals(df[column])
-
-    def test_no_copy_blocks(self, float_frame):
-        # API/ENH 9607
-        df = DataFrame(float_frame, copy=True)
-        column = df.columns[0]
-
-        # use the copy=False, change a column
-        blocks = df._to_dict_of_blocks(copy=False)
-        for dtype, _df in blocks.items():
-            if column in _df:
-                _df.loc[:, column] = _df[column] + 1
-
-        # make sure we did change the original DataFrame
-        assert _df[column].equals(df[column])
-
     def test_copy(self, float_frame, float_string_frame):
         cop = float_frame.copy()
         cop["E"] = cop["A"]
@@ -469,88 +382,6 @@ def test_get_numeric_data_extension_dtype(self):
         expected = df.loc[:, ["A", "C"]]
         tm.assert_frame_equal(result, expected)
 
-    def test_convert_objects(self, float_string_frame):
-
-        oops = float_string_frame.T.T
-        converted = oops._convert(datetime=True)
-        tm.assert_frame_equal(converted, float_string_frame)
-        assert converted["A"].dtype == np.float64
-
-        # force numeric conversion
-        float_string_frame["H"] = "1."
-        float_string_frame["I"] = "1"
-
-        # add in some items that will be nan
-        length = len(float_string_frame)
-        float_string_frame["J"] = "1."
-        float_string_frame["K"] = "1"
-        float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled"
-        converted = float_string_frame._convert(datetime=True, numeric=True)
-        assert converted["H"].dtype == "float64"
-        assert converted["I"].dtype == "int64"
-        assert converted["J"].dtype == "float64"
-        assert converted["K"].dtype == "float64"
-        assert len(converted["J"].dropna()) == length - 5
-        assert len(converted["K"].dropna()) == length - 5
-
-        # via astype
-        converted = float_string_frame.copy()
-        converted["H"] = converted["H"].astype("float64")
-        converted["I"] = converted["I"].astype("int64")
-        assert converted["H"].dtype == "float64"
-        assert converted["I"].dtype == "int64"
-
-        # via astype, but errors
-        converted = float_string_frame.copy()
-        with pytest.raises(ValueError, match="invalid literal"):
-            converted["H"].astype("int32")
-
-        # mixed in a single column
-        df = DataFrame(dict(s=Series([1, "na", 3, 4])))
-        result = df._convert(datetime=True, numeric=True)
-        expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
-        tm.assert_frame_equal(result, expected)
-
-    def test_convert_objects_no_conversion(self):
-        mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]})
-        mixed2 = mixed1._convert(datetime=True)
-        tm.assert_frame_equal(mixed1, mixed2)
-
-    def test_infer_objects(self):
-        # GH 11221
-        df = DataFrame(
-            {
-                "a": ["a", 1, 2, 3],
-                "b": ["b", 2.0, 3.0, 4.1],
-                "c": [
-                    "c",
-                    datetime(2016, 1, 1),
-                    datetime(2016, 1, 2),
-                    datetime(2016, 1, 3),
-                ],
-                "d": [1, 2, 3, "d"],
-            },
-            columns=["a", "b", "c", "d"],
-        )
-        df = df.iloc[1:].infer_objects()
-
-        assert df["a"].dtype == "int64"
-        assert df["b"].dtype == "float64"
-        assert df["c"].dtype == "M8[ns]"
-        assert df["d"].dtype == "object"
-
-        expected = DataFrame(
-            {
-                "a": [1, 2, 3],
-                "b": [2.0, 3.0, 4.1],
-                "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)],
-                "d": [2, 3, "d"],
-            },
-            columns=["a", "b", "c", "d"],
-        )
-        # reconstruct frame to verify inference is same
-        tm.assert_frame_equal(df.reset_index(drop=True), expected)
-
     def test_stale_cached_series_bug_473(self):
 
         # this is chained, but ok
@@ -606,17 +437,17 @@ def test_strange_column_corruption_issue(self):
 
     def test_constructor_no_pandas_array(self):
         # Ensure that PandasArray isn't allowed inside Series
         # See https://github.com/pandas-dev/pandas/issues/23995 for more.
- arr = pd.Series([1, 2, 3]).array - result = pd.DataFrame({"A": arr}) - expected = pd.DataFrame({"A": [1, 2, 3]}) + arr = Series([1, 2, 3]).array + result = DataFrame({"A": arr}) + expected = DataFrame({"A": [1, 2, 3]}) tm.assert_frame_equal(result, expected) assert isinstance(result._mgr.blocks[0], IntBlock) def test_add_column_with_pandas_array(self): # GH 26390 - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) - df2 = pd.DataFrame( + df2 = DataFrame( { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], @@ -628,27 +459,9 @@ def test_add_column_with_pandas_array(self): tm.assert_frame_equal(df, df2) -def test_to_dict_of_blocks_item_cache(): - # Calling to_dict_of_blocks should not poison item_cache - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) - mgr = df._mgr - assert len(mgr.blocks) == 3 # i.e. not consolidated - - ser = df["b"] # populations item_cache["b"] - - df._to_dict_of_blocks() - - # Check that the to_dict_of_blocks didnt break link between ser and df - ser.values[0] = "foo" - assert df.loc[0, "b"] == "foo" - - assert df["b"] is ser - - def test_update_inplace_sets_valid_block_values(): # https://github.com/pandas-dev/pandas/issues/33457 - df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="category")}) + df = DataFrame({"a": Series([1, 2, None], dtype="category")}) # inplace update of a single column df["a"].fillna(1, inplace=True) @@ -664,9 +477,9 @@ def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe with object dtype columns - df = pd.DataFrame() - df["col1"] = pd.Series(["a"], dtype=object) - df["col2"] = pd.Series([0], dtype=object) + df = DataFrame() + df["col1"] = Series(["a"], dtype=object) + df["col2"] = Series([0], dtype=object) # access column (item cache) df["col1"] == "A" @@ -678,6 +491,6 @@ def test_nonconsolidated_item_cache_take(): # now setting value should update actual dataframe df.at[0, "col1"] = "A" - expected = pd.DataFrame({"col1": ["A"], "col2": [0]}, dtype=object) + expected = DataFrame({"col1": ["A"], "col2": [0]}, dtype=object) tm.assert_frame_equal(df, expected) assert df.at[0, "col1"] == "A" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8ec11d14cd606..479d7902aa111 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -154,17 +154,17 @@ def test_constructor_dtype_list_data(self): @pytest.mark.skipif(_np_version_under1p19, reason="NumPy change.") def test_constructor_list_of_2d_raises(self): # https://github.com/pandas-dev/pandas/issues/32289 - a = pd.DataFrame() + a = DataFrame() b = np.empty((0, 0)) with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): - pd.DataFrame([a]) + DataFrame([a]) with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): - pd.DataFrame([b]) + DataFrame([b]) - a = pd.DataFrame({"A": [1, 2]}) + a = DataFrame({"A": [1, 2]}) with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"): - pd.DataFrame([a, a]) + DataFrame([a, a]) def test_constructor_mixed_dtypes(self): def _make_mixed_dtypes_df(typ, ad=None): @@ -1101,10 +1101,10 @@ def test_constructor_list_of_lists(self): def test_constructor_list_like_data_nested_list_column(self): # GH 32173 arrays = 
[list("abcd"), list("cdef")] - result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) + result = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays) mi = MultiIndex.from_arrays(arrays) - expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi) + expected = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi) tm.assert_frame_equal(result, expected) @@ -1561,6 +1561,19 @@ def test_constructor_from_dict_tuples(self, data_dict, keys, orient): tm.assert_index_equal(result, expected) + def test_frame_dict_constructor_empty_series(self): + s1 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) + ) + s2 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) + ) + s3 = Series(dtype=object) + + # it works! + DataFrame({"foo": s1, "bar": s2, "baz": s3}) + DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) + def test_constructor_Series_named(self): a = Series([1, 2, 3], index=["a", "b", "c"], name="x") df = DataFrame(a) @@ -1648,17 +1661,17 @@ def test_constructor_Series_differently_indexed(self): def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out): # GH13475 indices = [ - pd.Index(["a", "b", "c"], name=name_in1), - pd.Index(["b", "c", "d"], name=name_in2), - pd.Index(["c", "d", "e"], name=name_in3), + Index(["a", "b", "c"], name=name_in1), + Index(["b", "c", "d"], name=name_in2), + Index(["c", "d", "e"], name=name_in3), ] series = { - c: pd.Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"]) + c: Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"]) } - result = pd.DataFrame(series) + result = DataFrame(series) - exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out) - expected = pd.DataFrame( + exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out) + expected = DataFrame( { "x": [0, 1, 2, np.nan, np.nan], "y": [np.nan, 0, 1, 2, np.nan], @@ -2342,7 +2355,7 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self): def test_check_dtype_empty_numeric_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. - data = pd.DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) + data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) assert data.b.dtype == dtype @@ -2352,7 +2365,7 @@ def test_check_dtype_empty_numeric_column(self, dtype): def test_check_dtype_empty_string_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. 
- data = pd.DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) + data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) assert data.b.dtype.name == "object" @@ -2566,7 +2579,7 @@ def test_from_records_series_categorical_index(self): index = CategoricalIndex( [pd.Interval(-20, -10), pd.Interval(-10, 0), pd.Interval(0, 10)] ) - series_of_dicts = pd.Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index) + series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index) frame = pd.DataFrame.from_records(series_of_dicts, index=index) expected = DataFrame( {"a": [1, 2, np.NaN], "b": [np.NaN, np.NaN, 3]}, index=index @@ -2668,7 +2681,7 @@ def test_from_datetime_subclass(self): class DatetimeSubclass(datetime): pass - data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) + data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) assert data.datetime.dtype == "datetime64[ns]" def test_with_mismatched_index_length_raises(self): @@ -2677,8 +2690,60 @@ def test_with_mismatched_index_length_raises(self): with pytest.raises(ValueError, match="Shape of passed values"): DataFrame(dti, index=range(4)) + def test_frame_ctor_datetime64_column(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + dates = np.asarray(rng) + + df = DataFrame({"A": np.random.randn(len(rng)), "B": dates}) + assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]")) + + def test_dataframe_constructor_infer_multiindex(self): + index_lists = [["a", "a", "b", "b"], ["x", "y", "x", "y"]] + + multi = DataFrame( + np.random.randn(4, 4), + index=[np.array(x) for x in index_lists], + ) + assert isinstance(multi.index, MultiIndex) + assert not isinstance(multi.columns, MultiIndex) + + multi = DataFrame(np.random.randn(4, 4), columns=index_lists) + assert isinstance(multi.columns, MultiIndex) + + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(date_range("1/1/2011", periods=2, freq="H"))), + (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) + def test_constructor_list_str(self, input_vals, string_dtype): + # GH#16605 + # Ensure that data elements are converted to strings when + # dtype is str, 'str', or 'U' + + result = DataFrame({"A": input_vals}, dtype=string_dtype) + expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) + tm.assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDatetimeTZ: + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): + # GH#25843 + tz = tz_aware_fixture + result = DataFrame({"d": [Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") + expected = DataFrame({"d": [Timestamp("2019")]}) + tm.assert_frame_equal(result, expected) + def test_from_dict(self): # 8260 @@ -2816,4 +2881,4 @@ def test_construction_from_set_raises(self): # https://github.com/pandas-dev/pandas/issues/32582 msg = "Set type is unordered" with pytest.raises(TypeError, match=msg): - pd.DataFrame({"a": {1, 2, 3}}) + DataFrame({"a": {1, 2, 3}}) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py deleted file mode 100644 index 5917520802519..0000000000000 --- a/pandas/tests/frame/test_dtypes.py +++ /dev/null @@ -1,324 +0,0 @@ -from datetime import timedelta - -import numpy 
as np -import pytest - -from pandas.core.dtypes.dtypes import DatetimeTZDtype - -import pandas as pd -from pandas import DataFrame, Series, Timestamp, date_range, option_context -import pandas._testing as tm - - -def _check_cast(df, v): - """ - Check if all dtypes of df are equal to v - """ - assert all(s.dtype.name == v for _, s in df.items()) - - -class TestDataFrameDataTypes: - def test_concat_empty_dataframe_dtypes(self): - df = DataFrame(columns=list("abc")) - df["a"] = df["a"].astype(np.bool_) - df["b"] = df["b"].astype(np.int32) - df["c"] = df["c"].astype(np.float64) - - result = pd.concat([df, df]) - assert result["a"].dtype == np.bool_ - assert result["b"].dtype == np.int32 - assert result["c"].dtype == np.float64 - - result = pd.concat([df, df.astype(np.float64)]) - assert result["a"].dtype == np.object_ - assert result["b"].dtype == np.float64 - assert result["c"].dtype == np.float64 - - def test_empty_frame_dtypes(self): - empty_df = pd.DataFrame() - tm.assert_series_equal(empty_df.dtypes, pd.Series(dtype=object)) - - nocols_df = pd.DataFrame(index=[1, 2, 3]) - tm.assert_series_equal(nocols_df.dtypes, pd.Series(dtype=object)) - - norows_df = pd.DataFrame(columns=list("abc")) - tm.assert_series_equal(norows_df.dtypes, pd.Series(object, index=list("abc"))) - - norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32) - tm.assert_series_equal( - norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc")) - ) - - df = pd.DataFrame(dict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3]) - ex_dtypes = pd.Series( - dict([("a", np.int64), ("b", np.bool_), ("c", np.float64)]) - ) - tm.assert_series_equal(df.dtypes, ex_dtypes) - - # same but for empty slice of df - tm.assert_series_equal(df[:0].dtypes, ex_dtypes) - - def test_datetime_with_tz_dtypes(self): - tzframe = DataFrame( - { - "A": date_range("20130101", periods=3), - "B": date_range("20130101", periods=3, tz="US/Eastern"), - "C": date_range("20130101", periods=3, tz="CET"), - } - ) - tzframe.iloc[1, 1] = pd.NaT - tzframe.iloc[1, 2] = pd.NaT - result = tzframe.dtypes.sort_index() - expected = Series( - [ - np.dtype("datetime64[ns]"), - DatetimeTZDtype("ns", "US/Eastern"), - DatetimeTZDtype("ns", "CET"), - ], - ["A", "B", "C"], - ) - - tm.assert_series_equal(result, expected) - - def test_dtypes_are_correct_after_column_slice(self): - # GH6525 - df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) - tm.assert_series_equal( - df.dtypes, - pd.Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), - ) - tm.assert_series_equal( - df.iloc[:, 2:].dtypes, pd.Series(dict([("c", np.float_)])) - ) - tm.assert_series_equal( - df.dtypes, - pd.Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), - ) - - def test_dtypes_gh8722(self, float_string_frame): - float_string_frame["bool"] = float_string_frame["A"] > 0 - result = float_string_frame.dtypes - expected = Series( - {k: v.dtype for k, v in float_string_frame.items()}, index=result.index - ) - tm.assert_series_equal(result, expected) - - # compat, GH 8722 - with option_context("use_inf_as_na", True): - df = DataFrame([[1]]) - result = df.dtypes - tm.assert_series_equal(result, Series({0: np.dtype("int64")})) - - def test_singlerow_slice_categoricaldtype_gives_series(self): - # GH29521 - df = pd.DataFrame({"x": pd.Categorical("a b c d e".split())}) - result = df.iloc[0] - raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) - expected = pd.Series(raw_cat, index=["x"], name=0, dtype="category") - - 
         tm.assert_series_equal(result, expected)
-
-    def test_timedeltas(self):
-        df = DataFrame(
-            dict(
-                A=Series(date_range("2012-1-1", periods=3, freq="D")),
-                B=Series([timedelta(days=i) for i in range(3)]),
-            )
-        )
-        result = df.dtypes
-        expected = Series(
-            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
-        )
-        tm.assert_series_equal(result, expected)
-
-        df["C"] = df["A"] + df["B"]
-        result = df.dtypes
-        expected = Series(
-            [
-                np.dtype("datetime64[ns]"),
-                np.dtype("timedelta64[ns]"),
-                np.dtype("datetime64[ns]"),
-            ],
-            index=list("ABC"),
-        )
-        tm.assert_series_equal(result, expected)
-
-        # mixed int types
-        df["D"] = 1
-        result = df.dtypes
-        expected = Series(
-            [
-                np.dtype("datetime64[ns]"),
-                np.dtype("timedelta64[ns]"),
-                np.dtype("datetime64[ns]"),
-                np.dtype("int64"),
-            ],
-            index=list("ABCD"),
-        )
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "input_vals",
-        [
-            ([1, 2]),
-            (["1", "2"]),
-            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
-            (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
-            ([pd.Interval(left=0, right=5)]),
-        ],
-    )
-    def test_constructor_list_str(self, input_vals, string_dtype):
-        # GH 16605
-        # Ensure that data elements are converted to strings when
-        # dtype is str, 'str', or 'U'
-
-        result = DataFrame({"A": input_vals}, dtype=string_dtype)
-        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
-        tm.assert_frame_equal(result, expected)
-
-    def test_constructor_list_str_na(self, string_dtype):
-
-        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
-        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
-        tm.assert_frame_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "data, expected",
-        [
-            # empty
-            (DataFrame(), True),
-            # multi-same
-            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
-            # multi-object
-            (
-                DataFrame(
-                    {
-                        "A": np.array([1, 2], dtype=object),
-                        "B": np.array(["a", "b"], dtype=object),
-                    }
-                ),
-                True,
-            ),
-            # multi-extension
-            (
-                DataFrame(
-                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])}
-                ),
-                True,
-            ),
-            # differ types
-            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
-            # differ sizes
-            (
-                DataFrame(
-                    {
-                        "A": np.array([1, 2], dtype=np.int32),
-                        "B": np.array([1, 2], dtype=np.int64),
-                    }
-                ),
-                False,
-            ),
-            # multi-extension differ
-            (
-                DataFrame(
-                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])}
-                ),
-                False,
-            ),
-        ],
-    )
-    def test_is_homogeneous_type(self, data, expected):
-        assert data._is_homogeneous_type is expected
-
-    def test_asarray_homogenous(self):
-        df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])})
-        result = np.asarray(df)
-        # may change from object in the future
-        expected = np.array([[1, 1], [2, 2]], dtype="object")
-        tm.assert_numpy_array_equal(result, expected)
-
-    def test_str_to_small_float_conversion_type(self):
-        # GH 20388
-        np.random.seed(13)
-        col_data = [str(np.random.random() * 1e-12) for _ in range(5)]
-        result = pd.DataFrame(col_data, columns=["A"])
-        expected = pd.DataFrame(col_data, columns=["A"], dtype=object)
-        tm.assert_frame_equal(result, expected)
-        # change the dtype of the elements from object to float one by one
-        result.loc[result.index, "A"] = [float(x) for x in col_data]
-        expected = pd.DataFrame(col_data, columns=["A"], dtype=float)
-        tm.assert_frame_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
-    )
-    def test_convert_dtypes(self, convert_integer, expected):
-        # Specific types are tested in tests/series/test_dtypes.py
-        # Just check that it works for DataFrame here
-        df = pd.DataFrame(
-            {
-                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
-                "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
-            }
-        )
-        result = df.convert_dtypes(True, True, convert_integer, False)
-        expected = pd.DataFrame(
-            {
-                "a": pd.Series([1, 2, 3], dtype=expected),
-                "b": pd.Series(["x", "y", "z"], dtype="string"),
-            }
-        )
-        tm.assert_frame_equal(result, expected)
-
-
-class TestDataFrameDatetimeWithTZ:
-    def test_interleave(self, timezone_frame):
-
-        # interleave with object
-        result = timezone_frame.assign(D="foo").values
-        expected = np.array(
-            [
-                [
-                    Timestamp("2013-01-01 00:00:00"),
-                    Timestamp("2013-01-02 00:00:00"),
-                    Timestamp("2013-01-03 00:00:00"),
-                ],
-                [
-                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
-                    pd.NaT,
-                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
-                ],
-                [
-                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
-                    pd.NaT,
-                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
-                ],
-                ["foo", "foo", "foo"],
-            ],
-            dtype=object,
-        ).T
-        tm.assert_numpy_array_equal(result, expected)
-
-        # interleave with only datetime64[ns]
-        result = timezone_frame.values
-        expected = np.array(
-            [
-                [
-                    Timestamp("2013-01-01 00:00:00"),
-                    Timestamp("2013-01-02 00:00:00"),
-                    Timestamp("2013-01-03 00:00:00"),
-                ],
-                [
-                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
-                    pd.NaT,
-                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
-                ],
-                [
-                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
-                    pd.NaT,
-                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
-                ],
-            ],
-            dtype=object,
-        ).T
-        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py
new file mode 100644
index 0000000000000..fb7a6e586c460
--- /dev/null
+++ b/pandas/tests/frame/test_logical_ops.py
@@ -0,0 +1,128 @@
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series
+import pandas._testing as tm
+
+
+class TestDataFrameLogicalOperators:
+    # &, |, ^
+
+    def test_logical_ops_empty_frame(self):
+        # GH#5808
+        # empty frames, non-mixed dtype
+        df = DataFrame(index=[1])
+
+        result = df & df
+        tm.assert_frame_equal(result, df)
+
+        result = df | df
+        tm.assert_frame_equal(result, df)
+
+        df2 = DataFrame(index=[1, 2])
+        result = df & df2
+        tm.assert_frame_equal(result, df2)
+
+        dfa = DataFrame(index=[1], columns=["A"])
+
+        result = dfa & dfa
+        expected = DataFrame(False, index=[1], columns=["A"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_logical_ops_bool_frame(self):
+        # GH#5808
+        df1a_bool = DataFrame(True, index=[1], columns=["A"])
+
+        result = df1a_bool & df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+        result = df1a_bool | df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+    def test_logical_ops_int_frame(self):
+        # GH#5808
+        df1a_int = DataFrame(1, index=[1], columns=["A"])
+        df1a_bool = DataFrame(True, index=[1], columns=["A"])
+
+        result = df1a_int | df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+        # Check that this matches Series behavior
+        res_ser = df1a_int["A"] | df1a_bool["A"]
+        tm.assert_series_equal(res_ser, df1a_bool["A"])
+
+    def test_logical_ops_invalid(self):
+        # GH#5808
+
+        df1 = DataFrame(1.0, index=[1], columns=["A"])
+        df2 = DataFrame(True, index=[1], columns=["A"])
+        msg = re.escape("unsupported operand type(s) for |: 'float' and 'bool'")
+        with pytest.raises(TypeError, match=msg):
+            df1 | df2
+
+        df1 = DataFrame("foo", index=[1], columns=["A"])
+        df2 = DataFrame(True, index=[1], columns=["A"])
+        msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
+        with pytest.raises(TypeError, match=msg):
+            df1 | df2
+
+    def test_logical_operators(self):
+        def _check_bin_op(op):
+            result = op(df1, df2)
+            expected = DataFrame(
+                op(df1.values, df2.values), index=df1.index, columns=df1.columns
+            )
+            assert result.values.dtype == np.bool_
+            tm.assert_frame_equal(result, expected)
+
+        def _check_unary_op(op):
+            result = op(df1)
+            expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns)
+            assert result.values.dtype == np.bool_
+            tm.assert_frame_equal(result, expected)
+
+        df1 = {
+            "a": {"a": True, "b": False, "c": False, "d": True, "e": True},
+            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
+            "c": {"a": False, "b": False, "c": True, "d": False, "e": False},
+            "d": {"a": True, "b": False, "c": False, "d": True, "e": True},
+            "e": {"a": True, "b": False, "c": False, "d": True, "e": True},
+        }
+
+        df2 = {
+            "a": {"a": True, "b": False, "c": True, "d": False, "e": False},
+            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
+            "c": {"a": True, "b": False, "c": True, "d": False, "e": False},
+            "d": {"a": False, "b": False, "c": False, "d": True, "e": False},
+            "e": {"a": False, "b": False, "c": False, "d": False, "e": True},
+        }
+
+        df1 = DataFrame(df1)
+        df2 = DataFrame(df2)
+
+        _check_bin_op(operator.and_)
+        _check_bin_op(operator.or_)
+        _check_bin_op(operator.xor)
+
+        _check_unary_op(operator.inv)  # TODO: belongs elsewhere
+
+    def test_logical_with_nas(self):
+        d = DataFrame({"a": [np.nan, False], "b": [True, True]})
+
+        # GH4947
+        # bool comparisons should return bool
+        result = d["a"] | d["b"]
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
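+        # (d["a"] is object dtype, and NaN | True evaluates to False under
+        # pandas' object-dtype logical ops, hence the first element above)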
+
+        # GH4604, automatic casting here
+        result = d["a"].fillna(False) | d["b"]
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = d["a"].fillna(False, downcast=False) | d["b"]
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
index a8b76f4d85f49..c6b1c69442dbc 100644
--- a/pandas/tests/frame/test_nonunique_indexes.py
+++ b/pandas/tests/frame/test_nonunique_indexes.py
@@ -238,7 +238,7 @@ def check(result, expected=None):
         )
         for index in [df.index, pd.Index(list("edcba"))]:
             this_df = df.copy()
-            expected_ser = pd.Series(index.values, index=this_df.index)
+            expected_ser = Series(index.values, index=this_df.index)
             expected_df = DataFrame(
                 {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
                 columns=["A", "B", "A"],
@@ -488,16 +488,6 @@ def test_columns_with_dups(self):
         xp.columns = ["A", "A", "B"]
         tm.assert_frame_equal(rs, xp)

-    def test_values_duplicates(self):
-        df = DataFrame(
-            [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
-        )
-
-        result = df.values
-        expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
-
-        tm.assert_numpy_array_equal(result, expected)
-
     def test_set_value_by_index(self):
         # See gh-12344
         df = DataFrame(np.arange(9).reshape(3, 3).T)
diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py
new file mode 100644
index 0000000000000..a3b4c659a4124
--- /dev/null
+++ b/pandas/tests/frame/test_npfuncs.py
@@ -0,0 +1,16 @@
+"""
+Tests for np.foo applied to DataFrame, not necessarily ufuncs.
+"""
+import numpy as np
+
+from pandas import Categorical, DataFrame
+import pandas._testing as tm
+
+
+class TestAsArray:
+    def test_asarray_homogenous(self):
+        df = DataFrame({"A": Categorical([1, 2]), "B": Categorical([1, 2])})
+        result = np.asarray(df)
+        # may change from object in the future
+        expected = np.array([[1, 1], [2, 2]], dtype="object")
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
deleted file mode 100644
index 8cf66e2737249..0000000000000
--- a/pandas/tests/frame/test_operators.py
+++ /dev/null
@@ -1,282 +0,0 @@
-from decimal import Decimal
-import operator
-import re
-
-import numpy as np
-import pytest
-
-import pandas as pd
-from pandas import DataFrame, Series
-import pandas._testing as tm
-
-
-class TestDataFrameUnaryOperators:
-    # __pos__, __neg__, __inv__
-
-    @pytest.mark.parametrize(
-        "df,expected",
-        [
-            (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})),
-            (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})),
-            (
-                pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
-                pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}),
-            ),
-        ],
-    )
-    def test_neg_numeric(self, df, expected):
-        tm.assert_frame_equal(-df, expected)
-        tm.assert_series_equal(-df["a"], expected["a"])
-
-    @pytest.mark.parametrize(
-        "df, expected",
-        [
-            (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
-            ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]),
-        ],
-    )
-    def test_neg_object(self, df, expected):
-        # GH#21380
-        df = pd.DataFrame({"a": df})
-        expected = pd.DataFrame({"a": expected})
-        tm.assert_frame_equal(-df, expected)
-        tm.assert_series_equal(-df["a"], expected["a"])
-
-    @pytest.mark.parametrize(
-        "df",
-        [
-            pd.DataFrame({"a": ["a", "b"]}),
-            pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}),
-        ],
-    )
-    def test_neg_raises(self, df):
-        msg = (
-            "bad operand type for unary -: 'str'|"
-            r"Unary negative expects numeric dtype, not datetime64\[ns\]"
-        )
-        with pytest.raises(TypeError, match=msg):
-            (-df)
-        with pytest.raises(TypeError, match=msg):
-            (-df["a"])
-
-    def test_invert(self, float_frame):
-        df = float_frame
-
-        tm.assert_frame_equal(-(df < 0), ~(df < 0))
-
-    def test_invert_mixed(self):
-        shape = (10, 5)
-        df = pd.concat(
-            [
-                pd.DataFrame(np.zeros(shape, dtype="bool")),
-                pd.DataFrame(np.zeros(shape, dtype=int)),
-            ],
-            axis=1,
-            ignore_index=True,
-        )
-        result = ~df
-        expected = pd.concat(
-            [
-                pd.DataFrame(np.ones(shape, dtype="bool")),
-                pd.DataFrame(-np.ones(shape, dtype=int)),
-            ],
-            axis=1,
-            ignore_index=True,
-        )
-        tm.assert_frame_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "df",
-        [
-            pd.DataFrame({"a": [-1, 1]}),
-            pd.DataFrame({"a": [False, True]}),
-            pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
-        ],
-    )
-    def test_pos_numeric(self, df):
-        # GH#16073
-        tm.assert_frame_equal(+df, df)
-        tm.assert_series_equal(+df["a"], df["a"])
-
-    @pytest.mark.parametrize(
-        "df",
-        [
-            # numpy changing behavior in the future
-            pytest.param(
-                pd.DataFrame({"a": ["a", "b"]}),
-                marks=[pytest.mark.filterwarnings("ignore")],
-            ),
-            pd.DataFrame({"a": np.array([-1, 2], dtype=object)}),
-            pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}),
-        ],
-    )
-    def test_pos_object(self, df):
-        # GH#21380
-        tm.assert_frame_equal(+df, df)
-        tm.assert_series_equal(+df["a"], df["a"])
-
-    @pytest.mark.parametrize(
-        "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})]
-    )
-    def test_pos_raises(self, df):
-        msg = "Unary plus expects .* dtype, not datetime64\\[ns\\]"
-        with pytest.raises(TypeError, match=msg):
-            (+df)
-        with pytest.raises(TypeError, match=msg):
-            (+df["a"])
-
-
-class TestDataFrameLogicalOperators:
-    # &, |, ^
-
-    def test_logical_ops_empty_frame(self):
-        # GH#5808
-        # empty frames, non-mixed dtype
-        df = DataFrame(index=[1])
-
-        result = df & df
-        tm.assert_frame_equal(result, df)
-
-        result = df | df
-        tm.assert_frame_equal(result, df)
-
-        df2 = DataFrame(index=[1, 2])
-        result = df & df2
-        tm.assert_frame_equal(result, df2)
-
-        dfa = DataFrame(index=[1], columns=["A"])
-
-        result = dfa & dfa
-        expected = DataFrame(False, index=[1], columns=["A"])
-        tm.assert_frame_equal(result, expected)
-
-    def test_logical_ops_bool_frame(self):
-        # GH#5808
-        df1a_bool = DataFrame(True, index=[1], columns=["A"])
-
-        result = df1a_bool & df1a_bool
-        tm.assert_frame_equal(result, df1a_bool)
-
-        result = df1a_bool | df1a_bool
-        tm.assert_frame_equal(result, df1a_bool)
-
-    def test_logical_ops_int_frame(self):
-        # GH#5808
-        df1a_int = DataFrame(1, index=[1], columns=["A"])
-        df1a_bool = DataFrame(True, index=[1], columns=["A"])
-
-        result = df1a_int | df1a_bool
-        tm.assert_frame_equal(result, df1a_bool)
-
-        # Check that this matches Series behavior
-        res_ser = df1a_int["A"] | df1a_bool["A"]
-        tm.assert_series_equal(res_ser, df1a_bool["A"])
-
-    def test_logical_ops_invalid(self):
-        # GH#5808
-
-        df1 = DataFrame(1.0, index=[1], columns=["A"])
-        df2 = DataFrame(True, index=[1], columns=["A"])
-        msg = re.escape("unsupported operand type(s) for |: 'float' and 'bool'")
-        with pytest.raises(TypeError, match=msg):
-            df1 | df2
-
-        df1 = DataFrame("foo", index=[1], columns=["A"])
-        df2 = DataFrame(True, index=[1], columns=["A"])
-        msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
-        with pytest.raises(TypeError, match=msg):
-            df1 | df2
-
-    def test_logical_operators(self):
-        def _check_bin_op(op):
-            result = op(df1, df2)
-            expected = DataFrame(
-                op(df1.values, df2.values), index=df1.index, columns=df1.columns
-            )
-            assert result.values.dtype == np.bool_
-            tm.assert_frame_equal(result, expected)
-
-        def _check_unary_op(op):
-            result = op(df1)
-            expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns)
-            assert result.values.dtype == np.bool_
-            tm.assert_frame_equal(result, expected)
-
-        df1 = {
-            "a": {"a": True, "b": False, "c": False, "d": True, "e": True},
-            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
-            "c": {"a": False, "b": False, "c": True, "d": False, "e": False},
-            "d": {"a": True, "b": False, "c": False, "d": True, "e": True},
-            "e": {"a": True, "b": False, "c": False, "d": True, "e": True},
-        }
-
-        df2 = {
-            "a": {"a": True, "b": False, "c": True, "d": False, "e": False},
-            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
-            "c": {"a": True, "b": False, "c": True, "d": False, "e": False},
-            "d": {"a": False, "b": False, "c": False, "d": True, "e": False},
-            "e": {"a": False, "b": False, "c": False, "d": False, "e": True},
-        }
-
-        df1 = DataFrame(df1)
-        df2 = DataFrame(df2)
-
-        _check_bin_op(operator.and_)
-        _check_bin_op(operator.or_)
-        _check_bin_op(operator.xor)
-
-        _check_unary_op(operator.inv)  # TODO: belongs elsewhere
-
-    def test_logical_with_nas(self):
-        d = DataFrame({"a": [np.nan, False], "b": [True, True]})
-
-        # GH4947
-        # bool comparisons should return bool
-        result = d["a"] | d["b"]
-        expected = Series([False, True])
-        tm.assert_series_equal(result, expected)
-
-        # GH4604, automatic casting here
-        result = d["a"].fillna(False) | d["b"]
-        expected = Series([True, True])
-        tm.assert_series_equal(result, expected)
-
-        result = d["a"].fillna(False, downcast=False) | d["b"]
-        expected = Series([True, True])
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "left, right, op, expected",
-        [
-            (
-                [True, False, np.nan],
-                [True, False, True],
-                operator.and_,
-                [True, False, False],
-            ),
-            (
-                [True, False, True],
-                [True, False, np.nan],
-                operator.and_,
-                [True, False, False],
-            ),
-            (
-                [True, False, np.nan],
-                [True, False, True],
-                operator.or_,
-                [True, False, False],
-            ),
-            (
-                [True, False, True],
-                [True, False, np.nan],
-                operator.or_,
-                [True, False, True],
-            ),
-        ],
-    )
-    def test_logical_operators_nans(self, left, right, op, expected):
-        # GH 13896
-        result = op(DataFrame(left), DataFrame(right))
-        expected = DataFrame(expected)
-
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py
deleted file mode 100644
index c378194b9e2b2..0000000000000
--- a/pandas/tests/frame/test_period.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-
-from pandas import DataFrame, Index, PeriodIndex, period_range
-import pandas._testing as tm
-
-
-class TestPeriodIndex:
-    def test_as_frame_columns(self):
-        rng = period_range("1/1/2000", periods=5)
-        df = DataFrame(np.random.randn(10, 5), columns=rng)
-
-        ts = df[rng[0]]
-        tm.assert_series_equal(ts, df.iloc[:, 0])
-
-        # GH # 1211
-        repr(df)
-
-        ts = df["1/1/2000"]
-        tm.assert_series_equal(ts, df.iloc[:, 0])
-
-    def test_frame_setitem(self):
-        rng = period_range("1/1/2000", periods=5, name="index")
-        df = DataFrame(np.random.randn(5, 3), index=rng)
-
-        df["Index"] = rng
-        rs = Index(df["Index"])
-        tm.assert_index_equal(rs, rng, check_names=False)
-        assert rs.name == "Index"
-        assert rng.name == "index"
-
-        rs = df.reset_index().set_index("index")
-        assert isinstance(rs.index, PeriodIndex)
-        tm.assert_index_equal(rs.index, rng)
-
-    def test_frame_index_to_string(self):
-        index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")
-        frame = DataFrame(np.random.randn(3, 4), index=index)
-
-        # it works!
-        frame.to_string()
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index 024403189409c..f3f2bbe1d160e 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -135,7 +135,7 @@ def test_dataframe_sub_numexpr_path(self):

     def test_query_non_str(self):
         # GH 11485
-        df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})
+        df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})

         msg = "expr must be a string to be evaluated"
         with pytest.raises(ValueError, match=msg):
@@ -146,7 +146,7 @@ def test_query_non_str(self):

     def test_query_empty_string(self):
         # GH 13139
-        df = pd.DataFrame({"A": [1, 2, 3]})
+        df = DataFrame({"A": [1, 2, 3]})

         msg = "expr cannot be an empty string"
         with pytest.raises(ValueError, match=msg):
@@ -162,9 +162,9 @@ def test_eval_resolvers_as_list(self):

     def test_eval_object_dtype_binop(self):
         # GH#24883
-        df = pd.DataFrame({"a1": ["Y", "N"]})
+        df = DataFrame({"a1": ["Y", "N"]})
         res = df.eval("c = ((a1 == 'Y') & True)")
-        expected = pd.DataFrame({"a1": ["Y", "N"], "c": [True, False]})
+        expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
         tm.assert_frame_equal(res, expected)

@@ -716,12 +716,12 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
         df_index = pd.date_range(
             start="2019-01-01", freq="1d", periods=10, tz=tz, name="time"
         )
-        expected = pd.DataFrame(index=df_index)
-        df = pd.DataFrame(index=df_index)
+        expected = DataFrame(index=df_index)
+        df = DataFrame(index=df_index)
         result = df.query('"2018-01-03 00:00:00+00" < time')
         tm.assert_frame_equal(result, expected)

-        expected = pd.DataFrame(df_index)
+        expected = DataFrame(df_index)
         result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
         tm.assert_frame_equal(result, expected)

@@ -1045,7 +1045,7 @@ def test_query_single_element_booleans(self, parser, engine):

     def test_query_string_scalar_variable(self, parser, engine):
         skip_if_no_pandas_parser(parser)
-        df = pd.DataFrame(
+        df = DataFrame(
             {
                 "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"],
                 "Price": [109.70, 109.72, 183.30, 183.35],
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 6d786d9580542..ef43319d11464 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -8,7 +8,11 @@
 from pandas import (
     Categorical,
     DataFrame,
+    MultiIndex,
+    NaT,
+    PeriodIndex,
     Series,
+    Timestamp,
     date_range,
     option_context,
     period_range,
@@ -19,6 +23,72 @@


 class TestDataFrameReprInfoEtc:
+    def test_assign_index_sequences(self):
+        # GH#2200
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index(
+            ["a", "b"]
+        )
+        index = list(df.index)
+        index[0] = ("faz", "boo")
+        df.index = index
+        repr(df)
+
+        # this travels an improper code path
+        index[0] = ["faz", "boo"]
+        df.index = index
+        repr(df)
+
+    def test_repr_with_mi_nat(self, float_string_frame):
+        df = DataFrame({"X": [1, 2]}, index=[[NaT, Timestamp("20130101")], ["a", "b"]])
+        result = repr(df)
+        expected = "              X\nNaT        a  1\n2013-01-01 b  2"
+        assert result == expected
+
+    def test_multiindex_na_repr(self):
+        # only an issue with long columns
+        df3 = DataFrame(
+            {
+                "A" * 30: {("A", "A0006000", "nuit"): "A0006000"},
+                "B" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "C" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "D" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "E" * 30: {("A", "A0006000", "nuit"): "A"},
+                "F" * 30: {("A", "A0006000", "nuit"): np.nan},
+            }
+        )
+
+        idf = df3.set_index(["A" * 30, "C" * 30])
+        repr(idf)
+
+    def test_repr_name_coincide(self):
+        index = MultiIndex.from_tuples(
+            [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"]
+        )
+
+        df = DataFrame({"value": [0, 1]}, index=index)
+
+        lines = repr(df).split("\n")
+        assert lines[2].startswith("a 0 foo")
+
+    def test_repr_to_string(
+        self,
+        multiindex_year_month_day_dataframe_random_data,
+        multiindex_dataframe_random_data,
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+        frame = multiindex_dataframe_random_data
+
+        repr(frame)
+        repr(ymd)
+        repr(frame.T)
+        repr(ymd.T)
+
+        buf = StringIO()
+        frame.to_string(buf=buf)
+        ymd.to_string(buf=buf)
+        frame.T.to_string(buf=buf)
+        ymd.T.to_string(buf=buf)
+
     def test_repr_empty(self):
         # empty
         repr(DataFrame())
@@ -218,3 +288,10 @@ def test_frame_datetime64_pre1900_repr(self):
         df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")})
         # it works!
         repr(df)
+
+    def test_frame_to_string_with_periodindex(self):
+        index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")
+        frame = DataFrame(np.random.randn(3, 4), index=index)
+
+        # it works!
+        frame.to_string()
diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py
deleted file mode 100644
index 40526ab27ac9a..0000000000000
--- a/pandas/tests/frame/test_sort_values_level_as_str.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas.errors import PerformanceWarning
-
-from pandas import DataFrame
-import pandas._testing as tm
-
-
-@pytest.fixture
-def df_none():
-    return DataFrame(
-        {
-            "outer": ["a", "a", "a", "b", "b", "b"],
-            "inner": [1, 2, 2, 2, 1, 1],
-            "A": np.arange(6, 0, -1),
-            ("B", 5): ["one", "one", "two", "two", "one", "one"],
-        }
-    )
-
-
-@pytest.fixture(params=[["outer"], ["outer", "inner"]])
-def df_idx(request, df_none):
-    levels = request.param
-    return df_none.set_index(levels)
-
-
-@pytest.fixture(
-    params=[
-        "inner",  # index level
-        ["outer"],  # list of index level
-        "A",  # column
-        [("B", 5)],  # list of column
-        ["inner", "outer"],  # two index levels
-        [("B", 5), "outer"],  # index level and column
-        ["A", ("B", 5)],  # Two columns
-        ["inner", "outer"],  # two index levels and column
-    ]
-)
-def sort_names(request):
-    return request.param
-
-
-@pytest.fixture(params=[True, False])
-def ascending(request):
-    return request.param
-
-
-def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending):
-
-    # GH 14353
-
-    # Get index levels from df_idx
-    levels = df_idx.index.names
-
-    # Compute expected by sorting on columns and the setting index
-    expected = df_none.sort_values(
-        by=sort_names, ascending=ascending, axis=0
-    ).set_index(levels)
-
-    # Compute result sorting on mix on columns and index levels
-    result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending):
-
-    # GH 14353
-
-    # Get levels from df_idx
-    levels = df_idx.index.names
-
-    # Compute expected by sorting on axis=0, setting index levels, and then
-    # transposing. For some cases this will result in a frame with
-    # multiple column levels
-    expected = (
-        df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
-        .set_index(levels)
-        .T
-    )
-
-    # Compute result by transposing and sorting on axis=1.
-    result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)
-
-    if len(levels) > 1:
-        # Accessing multi-level columns that are not lexsorted raises a
-        # performance warning
-        with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False):
-            tm.assert_frame_equal(result, expected)
-    else:
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_stack_unstack.py
similarity index 63%
rename from pandas/tests/frame/test_reshape.py
rename to pandas/tests/frame/test_stack_unstack.py
index b10fdbb707404..c34f5b35c1ab7 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+from io import StringIO
 import itertools

 import numpy as np
@@ -10,95 +11,6 @@


 class TestDataFrameReshape:
-    def test_pivot(self):
-        data = {
-            "index": ["A", "B", "C", "C", "B", "A"],
-            "columns": ["One", "One", "One", "Two", "Two", "Two"],
-            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
-        }
-
-        frame = DataFrame(data)
-        pivoted = frame.pivot(index="index", columns="columns", values="values")
-
-        expected = DataFrame(
-            {
-                "One": {"A": 1.0, "B": 2.0, "C": 3.0},
-                "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
-            }
-        )
-
-        expected.index.name, expected.columns.name = "index", "columns"
-        tm.assert_frame_equal(pivoted, expected)
-
-        # name tracking
-        assert pivoted.index.name == "index"
-        assert pivoted.columns.name == "columns"
-
-        # don't specify values
-        pivoted = frame.pivot(index="index", columns="columns")
-        assert pivoted.index.name == "index"
-        assert pivoted.columns.names == (None, "columns")
-
-    def test_pivot_duplicates(self):
-        data = DataFrame(
-            {
-                "a": ["bar", "bar", "foo", "foo", "foo"],
-                "b": ["one", "two", "one", "one", "two"],
-                "c": [1.0, 2.0, 3.0, 3.0, 4.0],
-            }
-        )
-        with pytest.raises(ValueError, match="duplicate entries"):
-            data.pivot("a", "b", "c")
-
-    def test_pivot_empty(self):
-        df = DataFrame(columns=["a", "b", "c"])
-        result = df.pivot("a", "b", "c")
-        expected = DataFrame()
-        tm.assert_frame_equal(result, expected, check_names=False)
-
-    def test_pivot_integer_bug(self):
-        df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
-
-        result = df.pivot(index=1, columns=0, values=2)
-        repr(result)
-        tm.assert_index_equal(result.columns, Index(["A", "B"], name=0))
-
-    def test_pivot_index_none(self):
-        # gh-3962
-        data = {
-            "index": ["A", "B", "C", "C", "B", "A"],
-            "columns": ["One", "One", "One", "Two", "Two", "Two"],
-            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
-        }
-
-        frame = DataFrame(data).set_index("index")
-        result = frame.pivot(columns="columns", values="values")
-        expected = DataFrame(
-            {
-                "One": {"A": 1.0, "B": 2.0, "C": 3.0},
-                "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
-            }
-        )
-
-        expected.index.name, expected.columns.name = "index", "columns"
-        tm.assert_frame_equal(result, expected)
-
-        # omit values
-        result = frame.pivot(columns="columns")
-
-        expected.columns = pd.MultiIndex.from_tuples(
-            [("values", "One"), ("values", "Two")], names=[None, "columns"]
-        )
-        expected.index.name = "index"
-        tm.assert_frame_equal(result, expected, check_names=False)
-        assert result.index.name == "index"
-        assert result.columns.names == (None, "columns")
-        expected.columns = expected.columns.droplevel(0)
-        result = frame.pivot(columns="columns", values="values")
-
-        expected.columns.name = "columns"
-        tm.assert_frame_equal(result, expected)
-
     def test_stack_unstack(self, float_frame):
         df = float_frame.copy()
         df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
@@ -142,7 +54,7 @@ def test_stack_mixed_level(self):

     def test_unstack_not_consolidated(self):
         # Gh#34708
-        df = pd.DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
+        df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
         df2 = df[["x"]]
         df2["y"] = df["y"]
         assert len(df2._mgr.blocks) == 2
@@ -182,7 +94,7 @@ def test_unstack_fill(self):
         unstacked = df.unstack(["x", "y"], fill_value=0)
         key = ("w", "b", "j")
         expected = unstacked[key]
-        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
+        result = Series([0, 0, 2], index=unstacked.index, name=key)
         tm.assert_series_equal(result, expected)

         stacked = unstacked.stack(["x", "y"])
@@ -315,7 +227,7 @@ def test_unstack_fill_frame_period(self):

     def test_unstack_fill_frame_categorical(self):
         # Test unstacking with categorical
-        data = pd.Series(["a", "b", "c", "a"], dtype="category")
+        data = Series(["a", "b", "c", "a"], dtype="category")
         data.index = pd.MultiIndex.from_tuples(
             [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
         )
@@ -352,10 +264,10 @@ def test_unstack_tuplename_in_multiindex(self):
         idx = pd.MultiIndex.from_product(
             [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]
         )
-        df = pd.DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
+        df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
         result = df.unstack(("A", "a"))

-        expected = pd.DataFrame(
+        expected = DataFrame(
             [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
             columns=pd.MultiIndex.from_tuples(
                 [
@@ -368,7 +280,7 @@ def test_unstack_tuplename_in_multiindex(self):
                 ],
                 names=[None, ("A", "a")],
             ),
-            index=pd.Index([1, 2, 3], name=("B", "b")),
+            index=Index([1, 2, 3], name=("B", "b")),
         )
         tm.assert_frame_equal(result, expected)

@@ -389,7 +301,7 @@ def test_unstack_tuplename_in_multiindex(self):
             (
                 (("A", "a"), "B"),
                 [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]],
-                pd.Index([3, 4], name="C"),
+                Index([3, 4], name="C"),
                 pd.MultiIndex.from_tuples(
                     [
                         ("d", "a", 1),
@@ -413,29 +325,29 @@ def test_unstack_mixed_type_name_in_multiindex(
         idx = pd.MultiIndex.from_product(
             [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]
         )
-        df = pd.DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
+        df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
         result = df.unstack(unstack_idx)

-        expected = pd.DataFrame(
+        expected = DataFrame(
             expected_values, columns=expected_columns, index=expected_index
         )
         tm.assert_frame_equal(result, expected)

     def test_unstack_preserve_dtypes(self):
         # Checks fix for #11847
-        df = pd.DataFrame(
+        df = DataFrame(
             dict(
                 state=["IL", "MI", "NC"],
                 index=["a", "b", "c"],
-                some_categories=pd.Series(["a", "b", "c"]).astype("category"),
+                some_categories=Series(["a", "b", "c"]).astype("category"),
                 A=np.random.rand(3),
                 B=1,
                 C="foo",
                 D=pd.Timestamp("20010102"),
-                E=pd.Series([1.0, 50.0, 100.0]).astype("float32"),
-                F=pd.Series([3.0, 4.0, 5.0]).astype("float64"),
+                E=Series([1.0, 50.0, 100.0]).astype("float32"),
+                F=Series([3.0, 4.0, 5.0]).astype("float64"),
                 G=False,
-                H=pd.Series([1, 200, 923442], dtype="int8"),
+                H=Series([1, 200, 923442], dtype="int8"),
             )
         )

@@ -586,7 +498,7 @@ def test_unstack_level_binding(self):
             codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
             names=["first", "second", "third"],
         )
-        s = pd.Series(0, index=mi)
+        s = Series(0, index=mi)
         result = s.unstack([1, 2]).stack(0)

         expected_mi = pd.MultiIndex(
@@ -595,12 +507,12 @@ def test_unstack_level_binding(self):
             names=["first", "second"],
         )

-        expected = pd.DataFrame(
+        expected = DataFrame(
             np.array(
                 [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
             ),
             index=expected_mi,
-            columns=pd.Index(["a", "b"], name="third"),
+            columns=Index(["a", "b"], name="third"),
         )
         tm.assert_frame_equal(result, expected)

@@ -717,11 +629,11 @@ def test_unstack_non_unique_index_names(self):

     def test_unstack_unused_levels(self):
         # GH 17845: unused codes in index make unstack() cast int to float
         idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1]
-        df = pd.DataFrame([[1, 0]] * 3, index=idx)
+        df = DataFrame([[1, 0]] * 3, index=idx)

         result = df.unstack()
         exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]])
-        expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
+        expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
         tm.assert_frame_equal(result, expected)
         assert (result.columns.levels[1] == idx.levels[1]).all()

@@ -730,9 +642,9 @@ def test_unstack_unused_levels(self):
         codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
         idx = pd.MultiIndex(levels, codes)
         block = np.arange(4).reshape(2, 2)
-        df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
+        df = DataFrame(np.concatenate([block, block + 4]), index=idx)
         result = df.unstack()
-        expected = pd.DataFrame(
+        expected = DataFrame(
             np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx
         )
         tm.assert_frame_equal(result, expected)
@@ -743,7 +655,7 @@ def test_unstack_unused_levels(self):
         codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
         idx = pd.MultiIndex(levels, codes)
         data = np.arange(8)
-        df = pd.DataFrame(data.reshape(4, 2), index=idx)
+        df = DataFrame(data.reshape(4, 2), index=idx)

         cases = (
             (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]),
@@ -754,17 +666,13 @@ def test_unstack_unused_levels(self):
             exp_data = np.zeros(18) * np.nan
             exp_data[idces] = data
             cols = pd.MultiIndex.from_product([[0, 1], col_level])
-            expected = pd.DataFrame(
-                exp_data.reshape(3, 6), index=idx_level, columns=cols
-            )
+            expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols)
             tm.assert_frame_equal(result, expected)

     @pytest.mark.parametrize("cols", [["A", "C"], slice(None)])
     def test_unstack_unused_level(self, cols):
         # GH 18562 : unused codes on the unstacked level
-        df = pd.DataFrame(
-            [[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"]
-        )
+        df = DataFrame([[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"])

         ind = df.set_index(["A", "B", "C"], drop=False)
         selection = ind.loc[(slice(None), slice(None), "I"), cols]
@@ -780,7 +688,7 @@ def test_unstack_unused_level(self, cols):
     def test_unstack_long_index(self):
         # PH 32624: Error when using a lot of indices to unstack.
         # The error occurred only, if a lot of indices are used.
-        df = pd.DataFrame(
+        df = DataFrame(
             [[1]],
             columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]),
             index=pd.MultiIndex.from_tuples(
@@ -789,19 +697,19 @@ def test_unstack_long_index(self):
             ),
         )
         result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
-        expected = pd.DataFrame(
+        expected = DataFrame(
             [[1]],
             columns=pd.MultiIndex.from_tuples(
                 [[0, 0, 1, 0, 0, 0, 1]],
                 names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
             ),
-            index=pd.Index([0], name="i1"),
+            index=Index([0], name="i1"),
         )
         tm.assert_frame_equal(result, expected)

     def test_unstack_multi_level_cols(self):
         # PH 24729: Unstack a df with multi level columns
-        df = pd.DataFrame(
+        df = DataFrame(
             [[0.0, 0.0], [0.0, 0.0]],
             columns=pd.MultiIndex.from_tuples(
                 [["B", "C"], ["B", "D"]], names=["c1", "c2"]
@@ -814,7 +722,7 @@ def test_unstack_multi_level_cols(self):

     def test_unstack_multi_level_rows_and_cols(self):
         # PH 28306: Unstack df with multi level cols and rows
-        df = pd.DataFrame(
+        df = DataFrame(
             [[1, 2], [3, 4], [-1, -2], [-3, -4]],
             columns=pd.MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
             index=pd.MultiIndex.from_tuples(
@@ -918,7 +826,7 @@ def verify(df):
             verify(udf[col])

         # GH7403
-        df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
+        df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
         df.iloc[3, 1] = np.NaN
         left = df.set_index(["A", "B"]).unstack(0)

@@ -947,9 +855,7 @@ def verify(df):
         right = DataFrame(vals, columns=cols, index=idx)
         tm.assert_frame_equal(left, right)

-        df = pd.DataFrame(
-            {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}
-        )
+        df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
         df.iloc[3, 1] = np.NaN
         left = df.set_index(["A", "B"]).unstack(0)

@@ -962,7 +868,7 @@ def verify(df):
         tm.assert_frame_equal(left, right)

         # GH7401
-        df = pd.DataFrame(
+        df = DataFrame(
             {
                 "A": list("aaaaabbbbb"),
                 "B": (date_range("2012-01-01", periods=5).tolist() * 2),
@@ -1141,10 +1047,10 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):
     def test_stack_preserve_categorical_dtype_values(self):
         # GH-23077
         cat = pd.Categorical(["a", "a", "b", "c"])
-        df = pd.DataFrame({"A": cat, "B": cat})
+        df = DataFrame({"A": cat, "B": cat})
         result = df.stack()
         index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]])
-        expected = pd.Series(
+        expected = Series(
             pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index
         )
         tm.assert_series_equal(result, expected)
@@ -1159,10 +1065,10 @@ def test_stack_preserve_categorical_dtype_values(self):
     )
     def test_stack_multi_columns_non_unique_index(self, index, columns):
         # GH-28301
-        df = pd.DataFrame(index=index, columns=columns).fillna(1)
+        df = DataFrame(index=index, columns=columns).fillna(1)
         stacked = df.stack()
         new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy())
-        expected = pd.DataFrame(
+        expected = DataFrame(
             stacked.to_numpy(), index=new_index, columns=stacked.columns
         )
         tm.assert_frame_equal(stacked, expected)
@@ -1175,7 +1081,7 @@ def test_unstack_mixed_extension_types(self, level):
         index = pd.MultiIndex.from_tuples(
             [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"]
         )
-        df = pd.DataFrame(
+        df = DataFrame(
             {
                 "A": pd.core.arrays.integer_array([0, 1, None]),
                 "B": pd.Categorical(["a", "a", "b"]),
@@ -1186,7 +1092,7 @@ def test_unstack_mixed_extension_types(self, level):
         result = df.unstack(level=level)
         expected = df.astype(object).unstack(level=level)

-        expected_dtypes = pd.Series(
+        expected_dtypes = Series(
             [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
         )
         tm.assert_series_equal(result.dtypes, expected_dtypes)
@@ -1196,10 +1102,10 @@ def test_unstack_mixed_extension_types(self, level):
     def test_unstack_swaplevel_sortlevel(self, level):
         # GH 20994
         mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"])
-        df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
+        df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
         df.columns.name = "foo"

-        expected = pd.DataFrame(
+        expected = DataFrame(
             [[3, 1, 2, 0]],
             columns=pd.MultiIndex.from_tuples(
                 [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]
@@ -1213,21 +1119,21 @@ def test_unstack_swaplevel_sortlevel(self, level):

 def test_unstack_fill_frame_object():
     # GH12815 Test unstacking with object.
-    data = pd.Series(["a", "b", "c", "a"], dtype="object")
+    data = Series(["a", "b", "c", "a"], dtype="object")
     data.index = pd.MultiIndex.from_tuples(
         [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
     )

     # By default missing values will be NaN
     result = data.unstack()
-    expected = pd.DataFrame(
+    expected = DataFrame(
         {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz")
     )
     tm.assert_frame_equal(result, expected)

     # Fill with any value replaces missing values as expected
     result = data.unstack(fill_value="d")
-    expected = pd.DataFrame(
+    expected = DataFrame(
         {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz")
     )
     tm.assert_frame_equal(result, expected)

 def test_unstack_timezone_aware_values():
     # GH 18338
-    df = pd.DataFrame(
+    df = DataFrame(
         {
             "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")],
             "a": ["a"],
@@ -1245,9 +1151,9 @@ def test_unstack_timezone_aware_values():
         columns=["timestamp", "a", "b", "c"],
     )
     result = df.set_index(["a", "b"]).unstack()
-    expected = pd.DataFrame(
+    expected = DataFrame(
         [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]],
-        index=pd.Index(["a"], name="a"),
+        index=Index(["a"], name="a"),
         columns=pd.MultiIndex(
             levels=[["timestamp", "c"], ["b"]],
             codes=[[0, 1], [0, 0]],
@@ -1262,9 +1168,9 @@ def test_stack_timezone_aware_values():
     ts = pd.date_range(
         freq="D", start="20180101", end="20180103", tz="America/New_York"
     )
-    df = pd.DataFrame({"A": ts}, index=["a", "b", "c"])
+    df = DataFrame({"A": ts}, index=["a", "b", "c"])
     result = df.stack()
-    expected = pd.Series(
+    expected = Series(
         ts,
         index=pd.MultiIndex(
             levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]
@@ -1307,11 +1213,648 @@ def test_unstacking_multi_index_df():

 def test_stack_positional_level_duplicate_column_names():
     # https://github.com/pandas-dev/pandas/issues/36353
     columns = pd.MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"])
-    df = pd.DataFrame([[1, 1, 1, 1]], columns=columns)
+    df = DataFrame([[1, 1, 1, 1]], columns=columns)
     result = df.stack(0)

-    new_columns = pd.Index(["y", "z"], name="a")
+    new_columns = Index(["y", "z"], name="a")
     new_index = pd.MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"])
-    expected = pd.DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)
+    expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)

     tm.assert_frame_equal(result, expected)
+
+
+class TestStackUnstackMultiLevel:
+    def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
+        # just check that it works for now
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack()
+        unstacked.unstack()
+
+        # test that ints work
+        ymd.astype(int).unstack()
+
+        # test that int32 works
+        ymd.astype(np.int32).unstack()
+
+    @pytest.mark.parametrize(
+        "result_rows,result_columns,index_product,expected_row",
+        [
+            (
+                [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
+                ["ix1", "ix2", "col1", "col2", "col3", "col4"],
+                2,
+                [None, None, 30.0, None],
+            ),
+            (
+                [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
+                ["ix1", "ix2", "col1", "col2", "col3"],
+                2,
+                [None, None, 30.0],
+            ),
+            (
+                [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
+                ["ix1", "ix2", "col1", "col2", "col3"],
+                None,
+                [None, None, 30.0],
+            ),
+        ],
+    )
+    def test_unstack_partial(
+        self, result_rows, result_columns, index_product, expected_row
+    ):
+        # check for regressions on this issue:
+        # https://github.com/pandas-dev/pandas/issues/19351
+        # make sure DataFrame.unstack() works when it's run on a subset of the
+        # DataFrame and the Index levels contain values that are not present
+        # in the subset
+        result = DataFrame(result_rows, columns=result_columns).set_index(
+            ["ix1", "ix2"]
+        )
+        result = result.iloc[1:2].unstack("ix2")
+        expected = DataFrame(
+            [expected_row],
+            columns=pd.MultiIndex.from_product(
+                [result_columns[2:], [index_product]], names=[None, "ix2"]
+            ),
+            index=Index([2], name="ix1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_multiple_no_empty_columns(self):
+        index = MultiIndex.from_tuples(
+            [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
+        )
+
+        s = Series(np.random.randn(4), index=index)
+
+        unstacked = s.unstack([1, 2])
+        expected = unstacked.dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected)
+
+    def test_stack(self, multiindex_year_month_day_dataframe_random_data):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        # regular roundtrip
+        unstacked = ymd.unstack()
+        restacked = unstacked.stack()
+        tm.assert_frame_equal(restacked, ymd)
+
+        unlexsorted = ymd.sort_index(level=2)
+
+        unstacked = unlexsorted.unstack(2)
+        restacked = unstacked.stack()
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        unlexsorted = unlexsorted[::-1]
+        unstacked = unlexsorted.unstack(1)
+        restacked = unstacked.stack().swaplevel(1, 2)
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        unlexsorted = unlexsorted.swaplevel(0, 1)
+        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
+        restacked = unstacked.stack(0).swaplevel(1, 2)
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        # columns unsorted
+        unstacked = ymd.unstack()
+        unstacked = unstacked.sort_index(axis=1, ascending=False)
+        restacked = unstacked.stack()
+        tm.assert_frame_equal(restacked, ymd)
+
+        # more than 2 levels in the columns
+        unstacked = ymd.unstack(1).unstack(1)
+
+        result = unstacked.stack(1)
+        expected = ymd.unstack()
+        tm.assert_frame_equal(result, expected)
+
+        result = unstacked.stack(2)
+        expected = ymd.unstack(1)
+        tm.assert_frame_equal(result, expected)
+
+        result = unstacked.stack(0)
+        expected = ymd.stack().unstack(1).unstack(1)
+        tm.assert_frame_equal(result, expected)
+
+        # not all levels present in each echelon
+        unstacked = ymd.unstack(2).loc[:, ::3]
+        stacked = unstacked.stack().stack()
+        ymd_stacked = ymd.stack()
+        tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
+
+        # stack with negative number
+        result = ymd.unstack(0).stack(-2)
+        expected = ymd.unstack(0).stack(0)
+
+        # GH10417
+        def check(left, right):
+            tm.assert_series_equal(left, right)
+            assert left.index.is_unique is False
+            li, ri = left.index, right.index
+            tm.assert_index_equal(li, ri)
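+        # each df.stack() below yields a Series with duplicate index entries,
+        # which check() asserts explicitly via left.index.is_unique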
+
+        df = DataFrame(
+            np.arange(12).reshape(4, 3),
+            index=list("abab"),
+            columns=["1st", "2nd", "3rd"],
+        )
+
+        mi = MultiIndex(
+            levels=[["a", "b"], ["1st", "2nd", "3rd"]],
+            codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)],
+        )
+
+        left, right = df.stack(), Series(np.arange(12), index=mi)
+        check(left, right)
+
+        df.columns = ["1st", "2nd", "1st"]
+        mi = MultiIndex(
+            levels=[["a", "b"], ["1st", "2nd"]],
+            codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
+        )
+
+        left, right = df.stack(), Series(np.arange(12), index=mi)
+        check(left, right)
+
+        tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2)
+        df.index = MultiIndex.from_tuples(tpls)
+        mi = MultiIndex(
+            levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
+            codes=[
+                np.tile(np.arange(2).repeat(3), 2),
+                np.repeat([1, 0, 1], [3, 6, 3]),
+                np.tile([0, 1, 0], 4),
+            ],
+        )
+
+        left, right = df.stack(), Series(np.arange(12), index=mi)
+        check(left, right)
+
+    def test_unstack_odd_failure(self):
+        data = """day,time,smoker,sum,len
+Fri,Dinner,No,8.25,3.
+Fri,Dinner,Yes,27.03,9
+Fri,Lunch,No,3.0,1
+Fri,Lunch,Yes,13.68,6
+Sat,Dinner,No,139.63,45
+Sat,Dinner,Yes,120.77,42
+Sun,Dinner,No,180.57,57
+Sun,Dinner,Yes,66.82,19
+Thur,Dinner,No,3.0,1
+Thur,Lunch,No,117.32,44
+Thur,Lunch,Yes,51.51,17"""
+
+        df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])
+
+        # it works, #2100
+        result = df.unstack(2)
+
+        recons = result.stack()
+        tm.assert_frame_equal(recons, df)
+
+    def test_stack_mixed_dtype(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        df = frame.T
+        df["foo", "four"] = "foo"
+        df = df.sort_index(level=1, axis=1)
+
+        stacked = df.stack()
+        result = df["foo"].stack().sort_index()
+        tm.assert_series_equal(stacked["foo"], result, check_names=False)
+        assert result.name is None
+        assert stacked["bar"].dtype == np.float_
+
+    def test_unstack_bug(self):
+        df = DataFrame(
+            {
+                "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
+                "exp": ["a", "b", "b", "b", "a", "a"],
+                "barcode": [1, 2, 3, 4, 1, 3],
+                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
+                "extra": np.arange(6.0),
+            }
+        )
+
+        result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
+
+        unstacked = result.unstack()
+        restacked = unstacked.stack()
+        tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float))
+
+    def test_stack_unstack_preserve_names(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        unstacked = frame.unstack()
+        assert unstacked.index.name == "first"
+        assert unstacked.columns.names == ["exp", "second"]
+
+        restacked = unstacked.stack()
+        assert restacked.index.names == frame.index.names
+
+    @pytest.mark.parametrize("method", ["stack", "unstack"])
+    def test_stack_unstack_wrong_level_name(
+        self, method, multiindex_dataframe_random_data
+    ):
+        # GH 18303 - wrong level name should raise
+        frame = multiindex_dataframe_random_data
+
+        # A DataFrame with flat axes:
+        df = frame.loc["foo"]
+
+        with pytest.raises(KeyError, match="does not match index name"):
+            getattr(df, method)("mistake")
+
+        if method == "unstack":
+            # Same on a Series:
+            s = df.iloc[:, 0]
+            with pytest.raises(KeyError, match="does not match index name"):
+                getattr(s, method)("mistake")
+
+    def test_unstack_level_name(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        result = frame.unstack("second")
+        expected = frame.unstack(level=1)
+        tm.assert_frame_equal(result, expected)
+
+    def test_stack_level_name(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        unstacked = frame.unstack("second")
+        result = unstacked.stack("exp")
+        expected = frame.unstack().stack(0)
+        tm.assert_frame_equal(result, expected)
+
+        result = frame.stack("exp")
+        expected = frame.stack()
+        tm.assert_series_equal(result, expected)
+
+    def test_stack_unstack_multiple(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+        expected = ymd.unstack("year").unstack("month")
+        tm.assert_frame_equal(unstacked, expected)
+        assert unstacked.columns.names == expected.columns.names
+
+        # series
+        s = ymd["A"]
+        s_unstacked = s.unstack(["year", "month"])
+        tm.assert_frame_equal(s_unstacked, expected["A"])
+
+        restacked = unstacked.stack(["year", "month"])
+        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
+        restacked = restacked.sort_index(level=0)
+
+        tm.assert_frame_equal(restacked, ymd)
+        assert restacked.index.names == ymd.index.names
+
+        # GH #451
+        unstacked = ymd.unstack([1, 2])
+        expected = ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected)
+
+        unstacked = ymd.unstack([2, 1])
+        expected = ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
+
+    def test_stack_names_and_numbers(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+
+        # Can't use mixture of names and numbers to stack
+        with pytest.raises(ValueError, match="level should contain"):
+            unstacked.stack([0, "month"])
+
+    def test_stack_multiple_out_of_bounds(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        # nlevels == 3
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+
+        with pytest.raises(IndexError, match="Too many levels"):
+            unstacked.stack([2, 3])
+        with pytest.raises(IndexError, match="not a valid level number"):
+            unstacked.stack([-4, -3])
+
+    def test_unstack_period_series(self):
+        # GH4342
+        idx1 = pd.PeriodIndex(
+            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
+            freq="M",
+            name="period",
+        )
+        idx2 = Index(["A", "B"] * 3, name="str")
+        value = [1, 2, 3, 4, 5, 6]
+
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        s = Series(value, index=idx)
+
+        result1 = s.unstack()
+        result2 = s.unstack(level=1)
+        result3 = s.unstack(level=0)
+
+        e_idx = pd.PeriodIndex(
+            ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
+        )
+        expected = DataFrame(
+            {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
+        )
+        expected.columns.name = "str"
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+        tm.assert_frame_equal(result3, expected.T)
+
+        idx1 = pd.PeriodIndex(
+            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
+            freq="M",
+            name="period1",
+        )
+
+        idx2 = pd.PeriodIndex(
+            ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
+            freq="M",
+            name="period2",
+        )
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        s = Series(value, index=idx)
+
+        result1 = s.unstack()
+        result2 = s.unstack(level=1)
+        result3 = s.unstack(level=0)
+
+        e_idx = pd.PeriodIndex(
+            ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
+        )
+        e_cols = pd.PeriodIndex(
+            ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
+            freq="M",
+            name="period2",
+        )
+        expected = DataFrame(
+            [
+                [np.nan, np.nan, np.nan, np.nan, 2, 1],
+                [np.nan, np.nan, 4, 3, np.nan, np.nan],
+                [6, 5, np.nan, np.nan, np.nan, np.nan],
+            ],
+            index=e_idx,
+            columns=e_cols,
+        )
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+        tm.assert_frame_equal(result3, expected.T)
+
+    def test_unstack_period_frame(self):
+        # GH4342
+        idx1 = pd.PeriodIndex(
+            ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
+            freq="M",
+            name="period1",
+        )
+        idx2 = pd.PeriodIndex(
+            ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
+            freq="M",
+            name="period2",
+        )
+        value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        df = DataFrame(value, index=idx)
+
+        result1 = df.unstack()
+        result2 = df.unstack(level=1)
+        result3 = df.unstack(level=0)
+
+        e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
+        e_2 = pd.PeriodIndex(
+            ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
+            freq="M",
+            name="period2",
+        )
+        e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
+        expected = DataFrame(
+            [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
+        )
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        e_1 = pd.PeriodIndex(
+            ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
+        )
+        e_2 = pd.PeriodIndex(
+            ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
+        )
+        e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
+        expected = DataFrame(
+            [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
+        )
+
+        tm.assert_frame_equal(result3, expected)
+
+    def test_stack_multiple_bug(self):
+        # bug when some uniques are not present in the data GH#3170
+        id_col = ([1] * 3) + ([2] * 3)
+        name = (["a"] * 3) + (["b"] * 3)
+        date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
+        var1 = np.random.randint(0, 100, 6)
+        df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1))
+
+        multi = df.set_index(["DATE", "ID"])
+        multi.columns.name = "Params"
+        unst = multi.unstack("ID")
+        down = unst.resample("W-THU").mean()
+
+        rs = down.stack("ID")
+        xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID")
+        xp.columns.name = "Params"
+        tm.assert_frame_equal(rs, xp)
+
+    def test_stack_dropna(self):
+        # GH#3997
+        df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
+        df = df.set_index(["A", "B"])
+
+        stacked = df.unstack().stack(dropna=False)
+        assert len(stacked) > len(stacked.dropna())
+
+        stacked = df.unstack().stack(dropna=True)
+        tm.assert_frame_equal(stacked, stacked.dropna())
+
+    def test_unstack_multiple_hierarchical(self):
+        df = DataFrame(
+            index=[
+                [0, 0, 0, 0, 1, 1, 1, 1],
+                [0, 0, 1, 1, 0, 0, 1, 1],
+                [0, 1, 0, 1, 0, 1, 0, 1],
+            ],
+            columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
+        )
+
+        df.index.names = ["a", "b", "c"]
+        df.columns.names = ["d", "e"]
+
+        # it works!
+        df.unstack(["b", "c"])
+
+    def test_unstack_sparse_keyspace(self):
+        # memory problems with naive impl GH#2278
+        # Generate Long File & Test Pivot
+        NUM_ROWS = 1000
+
+        df = DataFrame(
+            {
+                "A": np.random.randint(100, size=NUM_ROWS),
+                "B": np.random.randint(300, size=NUM_ROWS),
+                "C": np.random.randint(-7, 7, size=NUM_ROWS),
+                "D": np.random.randint(-19, 19, size=NUM_ROWS),
+                "E": np.random.randint(3000, size=NUM_ROWS),
+                "F": np.random.randn(NUM_ROWS),
+            }
+        )
+
+        idf = df.set_index(["A", "B", "C", "D", "E"])
+
+        # it works! is sufficient
+        idf.unstack("E")
+
+    def test_unstack_unobserved_keys(self):
+        # related to GH#2278 refactoring
+        levels = [[0, 1], [0, 1, 2, 3]]
+        codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
+
+        index = MultiIndex(levels, codes)
+
+        df = DataFrame(np.random.randn(4, 2), index=index)
+
+        result = df.unstack()
+        assert len(result.columns) == 4
+
+        recons = result.stack()
+        tm.assert_frame_equal(recons, df)
+
+    @pytest.mark.slow
+    def test_unstack_number_of_levels_larger_than_int32(self):
+        # GH#20601
+        df = DataFrame(
+            np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]
+        )
+        with pytest.raises(ValueError, match="int32 overflow"):
+            df.unstack()
+
+    def test_stack_order_with_unsorted_levels(self):
+        # GH#16323
+
+        def manual_compare_stacked(df, df_stacked, lev0, lev1):
+            assert all(
+                df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]]
+                for row in df.index
+                for col in df.columns
+            )
+
+        # deep check for 1-row case
+        for width in [2, 3]:
+            levels_poss = itertools.product(
+                itertools.permutations([0, 1, 2], width), repeat=2
+            )
+
+            for levels in levels_poss:
+                columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+                df = DataFrame(columns=columns, data=[range(4)])
+                for stack_lev in range(2):
+                    df_stacked = df.stack(stack_lev)
+                    manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev)
+
+        # check multi-row case
+        mi = MultiIndex(
+            levels=[["A", "C", "B"], ["B", "A", "C"]],
+            codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
+        )
+        df = DataFrame(
+            columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
+        )
+        manual_compare_stacked(df, df.stack(0), 0, 1)
+
+    def test_stack_unstack_unordered_multiindex(self):
+        # GH# 18265
+        values = np.arange(5)
+        data = np.vstack(
+            [
+                [f"b{x}" for x in values],  # b0, b1, ..
+                [f"a{x}" for x in values],  # a0, a1, ..
+            ]
+        )
+        df = DataFrame(data.T, columns=["b", "a"])
+        df.columns.name = "first"
+        second_level_dict = {"x": df}
+        multi_level_df = pd.concat(second_level_dict, axis=1)
+        multi_level_df.columns.names = ["second", "first"]
+        df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1)
+        result = df.stack(["first", "second"]).unstack(["first", "second"])
+        expected = DataFrame(
+            [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]],
+            index=[0, 1, 2, 3, 4],
+            columns=MultiIndex.from_tuples(
+                [("a", "x"), ("b", "x")], names=["first", "second"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_preserve_types(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        # GH#403
+        ymd = multiindex_year_month_day_dataframe_random_data
+        ymd["E"] = "foo"
+        ymd["F"] = 2
+
+        unstacked = ymd.unstack("month")
+        assert unstacked["A", 1].dtype == np.float64
+        assert unstacked["E", 1].dtype == np.object_
+        assert unstacked["F", 1].dtype == np.float64
+
+    def test_unstack_group_index_overflow(self):
+        codes = np.tile(np.arange(500), 2)
+        level = np.arange(500)
+
+        index = MultiIndex(
+            levels=[level] * 8 + [[0, 1]],
+            codes=[codes] * 8 + [np.arange(2).repeat(500)],
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack()
+        assert result.shape == (500, 2)
+
+        # test roundtrip
+        stacked = result.stack()
+        tm.assert_series_equal(s, stacked.reindex(s.index))
+
+        # put it at beginning
+        index = MultiIndex(
+            levels=[[0, 1]] + [level] * 8,
+            codes=[np.arange(2).repeat(500)] + [codes] * 8,
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack(0)
+        assert result.shape == (500, 2)
+
+        # put it in middle
+        index = MultiIndex(
+            levels=[level] * 4 + [[0, 1]] + [level] * 4,
+            codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack(4)
+        assert result.shape == (500, 2)
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index 63361789b8e50..22ffb30324366 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -1,65 +1,57 @@
 import numpy as np
+import pytest

 import pandas as pd
-from pandas import DataFrame, date_range, to_datetime
+from pandas import DataFrame, to_datetime
 import pandas._testing as tm


 class TestDataFrameTimeSeriesMethods:
-    def test_frame_ctor_datetime64_column(self):
-        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
-        dates = np.asarray(rng)
-
-        df = DataFrame({"A": np.random.randn(len(rng)), "B": dates})
-        assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
-
-    def test_frame_append_datetime64_col_other_units(self):
+    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
+    def test_frame_append_datetime64_col_other_units(self, unit):
         n = 100

-        units = ["h", "m", "s", "ms", "D", "M", "Y"]
-        ns_dtype = np.dtype("M8[ns]")
-        for unit in units:
-            dtype = np.dtype(f"M8[{unit}]")
-            vals = np.arange(n, dtype=np.int64).view(dtype)
+        ns_dtype = np.dtype("M8[ns]")
+        dtype = np.dtype(f"M8[{unit}]")
+        vals = np.arange(n, dtype=np.int64).view(dtype)
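+        # (np.ndarray.view reinterprets the int64 range as datetime64 values
+        # in the parametrized unit, without copying the underlying buffer)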

-            df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
-            df[unit] = vals
+        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
+        df[unit] = vals

-            ex_vals = to_datetime(vals.astype("O")).values
+        ex_vals = to_datetime(vals.astype("O")).values

-            assert df[unit].dtype == ns_dtype
-            assert (df[unit].values == ex_vals).all()
+        assert df[unit].dtype == ns_dtype
+        assert (df[unit].values == ex_vals).all()

+    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
+    def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
         # Test insertion into existing datetime64 column
+        n = 100
+        ns_dtype = np.dtype("M8[ns]")
+
+        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
         df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype)

-        for unit in units:
-            dtype = np.dtype(f"M8[{unit}]")
-            vals = np.arange(n, dtype=np.int64).view(dtype)
+        dtype = np.dtype(f"M8[{unit}]")
+        vals = np.arange(n, dtype=np.int64).view(dtype)

-            tmp = df.copy()
+        tmp = df.copy()

-            tmp["dates"] = vals
-            ex_vals = to_datetime(vals.astype("O")).values
+        tmp["dates"] = vals
+        ex_vals = to_datetime(vals.astype("O")).values

-            assert (tmp["dates"].values == ex_vals).all()
+        assert (tmp["dates"].values == ex_vals).all()

     def test_datetime_assignment_with_NaT_and_diff_time_units(self):
         # GH 7492
         data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
         result = pd.Series(data_ns).to_frame()
         result["new"] = data_ns
-        expected = pd.DataFrame(
-            {0: [1, None], "new": [1, None]}, dtype="datetime64[ns]"
-        )
+        expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
         tm.assert_frame_equal(result, expected)

         # OutOfBoundsDatetime error shouldn't occur
         data_s = np.array([1, "nat"], dtype="datetime64[s]")
         result["new"] = data_s
-        expected = pd.DataFrame(
-            {0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]"
-        )
+        expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py
index dfd4fb1855383..3d814f22ce262 100644
--- a/pandas/tests/frame/test_timezones.py
+++ b/pandas/tests/frame/test_timezones.py
@@ -3,101 +3,15 @@
 """
 import numpy as np
 import pytest
-import pytz

 from pandas.core.dtypes.dtypes import DatetimeTZDtype

-import pandas as pd
 from pandas import DataFrame, Series
 import pandas._testing as tm
 from pandas.core.indexes.datetimes import date_range


 class TestDataFrameTimezones:
-    def test_frame_values_with_tz(self):
-        tz = "US/Central"
-        df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
-        result = df.values
-        expected = np.array(
-            [
-                [pd.Timestamp("2000-01-01", tz=tz)],
-                [pd.Timestamp("2000-01-02", tz=tz)],
-                [pd.Timestamp("2000-01-03", tz=tz)],
-                [pd.Timestamp("2000-01-04", tz=tz)],
-            ]
-        )
-        tm.assert_numpy_array_equal(result, expected)
-
-        # two columns, homogenous
-
-        df = df.assign(B=df.A)
-        result = df.values
-        expected = np.concatenate([expected, expected], axis=1)
-        tm.assert_numpy_array_equal(result, expected)
-
-        # three columns, heterogeneous
-        est = "US/Eastern"
-        df = df.assign(C=df.A.dt.tz_convert(est))
-
-        new = np.array(
-            [
-                [pd.Timestamp("2000-01-01T01:00:00", tz=est)],
-                [pd.Timestamp("2000-01-02T01:00:00", tz=est)],
-                [pd.Timestamp("2000-01-03T01:00:00", tz=est)],
-                [pd.Timestamp("2000-01-04T01:00:00", tz=est)],
-            ]
-        )
-        expected = np.concatenate([expected, new], axis=1)
-        result = df.values
-        tm.assert_numpy_array_equal(result, expected)
-
-    def test_frame_join_tzaware(self):
-        test1 = DataFrame(
-            np.zeros((6, 3)),
-            index=date_range(
-                "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central"
-            ),
-        )
-        test2 = DataFrame(
-            np.zeros((3, 3)),
-            index=date_range(
-                "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central"
-            ),
-            columns=range(3, 6),
-        )
-
-        result = test1.join(test2, how="outer")
-        ex_index = test1.index.union(test2.index)
-
-        tm.assert_index_equal(result.index, ex_index)
-        assert result.index.tz.zone == "US/Central"
-
-    def test_frame_align_aware(self):
-        idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
-        idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
-        df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
-        df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
-        new1, new2 = df1.align(df2)
-        assert df1.index.tz == new1.index.tz
-        assert df2.index.tz == new2.index.tz
-
-        # different timezones convert to UTC
-
-        # frame with frame
-        df1_central = df1.tz_convert("US/Central")
-        new1, new2 = df1.align(df1_central)
-        assert new1.index.tz == pytz.UTC
-        assert new2.index.tz == pytz.UTC
-
-        # frame with Series
-        new1, new2 = df1.align(df1_central[0], axis=0)
-        assert new1.index.tz == pytz.UTC
-        assert new2.index.tz == pytz.UTC
-
-        df1[0].align(df1_central, axis=0)
-        assert new1.index.tz == pytz.UTC
-        assert new2.index.tz == pytz.UTC
-
     @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
     def test_frame_no_datetime64_dtype(self, tz):
         # after GH#7822
@@ -159,10 +73,3 @@ def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
             np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz)
         )
         tm.assert_frame_equal(result, expected)
-
-    def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture):
-        # GH 25843
-        tz = tz_aware_fixture
-        result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]")
-        expected = DataFrame({"d": [pd.Timestamp("2019")]})
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py
new file mode 100644
index 0000000000000..ea6243e2eae4a
--- /dev/null
+++ b/pandas/tests/frame/test_unary.py
@@ -0,0 +1,123 @@
+from decimal import Decimal
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestDataFrameUnaryOperators:
+    # __pos__, __neg__, __inv__
+
+    @pytest.mark.parametrize(
+        "df,expected",
+        [
+            (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})),
+            (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})),
+            (
+                pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
+                pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}),
+            ),
+        ],
+    )
+    def test_neg_numeric(self, df, expected):
+        tm.assert_frame_equal(-df, expected)
+        tm.assert_series_equal(-df["a"], expected["a"])
+
+    @pytest.mark.parametrize(
+        "df, expected",
+        [
+            (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
+            ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]),
+        ],
+    )
+    def test_neg_object(self, df, expected):
+        # GH#21380
+        df = pd.DataFrame({"a": df})
+        expected = pd.DataFrame({"a": expected})
+        tm.assert_frame_equal(-df, expected)
+        tm.assert_series_equal(-df["a"], expected["a"])
+
+    @pytest.mark.parametrize(
+        "df",
+        [
+            pd.DataFrame({"a": ["a", "b"]}),
+            pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}),
+        ],
+    )
+    def test_neg_raises(self, df):
+        msg = (
+            "bad operand type for unary -: 'str'|"
+            r"Unary negative expects numeric dtype, not datetime64\[ns\]"
+        )
+        with pytest.raises(TypeError, match=msg):
+            (-df)
+        with pytest.raises(TypeError, match=msg):
+            (-df["a"])
+
+    def test_invert(self, float_frame):
+        df = float_frame
+
+        tm.assert_frame_equal(-(df < 0), ~(df < 0))
+
+    def test_invert_mixed(self):
+        shape = (10, 5)
+        df = pd.concat(
+            [
+                pd.DataFrame(np.zeros(shape, dtype="bool")),
+                pd.DataFrame(np.zeros(shape, dtype=int)),
+            ],
+            axis=1,
+            ignore_index=True,
+        )
+        result
= ~df + expected = pd.concat( + [ + pd.DataFrame(np.ones(shape, dtype="bool")), + pd.DataFrame(-np.ones(shape, dtype=int)), + ], + axis=1, + ignore_index=True, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [-1, 1]}), + pd.DataFrame({"a": [False, True]}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + ], + ) + def test_pos_numeric(self, df): + # GH#16073 + tm.assert_frame_equal(+df, df) + tm.assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", + [ + # numpy changing behavior in the future + pytest.param( + pd.DataFrame({"a": ["a", "b"]}), + marks=[pytest.mark.filterwarnings("ignore")], + ), + pd.DataFrame({"a": np.array([-1, 2], dtype=object)}), + pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}), + ], + ) + def test_pos_object(self, df): + # GH#21380 + tm.assert_frame_equal(+df, df) + tm.assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] + ) + def test_pos_raises(self, df): + msg = "Unary plus expects .* dtype, not datetime64\\[ns\\]" + with pytest.raises(TypeError, match=msg): + (+df) + with pytest.raises(TypeError, match=msg): + (+df["a"]) diff --git a/pandas/tests/generic/methods/test_pipe.py b/pandas/tests/generic/methods/test_pipe.py new file mode 100644 index 0000000000000..59e5edc4b8bb5 --- /dev/null +++ b/pandas/tests/generic/methods/test_pipe.py @@ -0,0 +1,38 @@ +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestPipe: + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_pipe(self, klass): + obj = DataFrame({"A": [1, 2, 3]}) + expected = DataFrame({"A": [1, 4, 9]}) + if klass is Series: + obj = obj["A"] + expected = expected["A"] + + f = lambda x, y: x ** y + result = obj.pipe(f, 2) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_pipe_tuple(self, klass): + obj = DataFrame({"A": [1, 2, 3]}) + if klass is Series: + obj = obj["A"] + + f = lambda x, y: y + result = obj.pipe((f, "y"), 0) + tm.assert_equal(result, obj) + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_pipe_tuple_error(self, klass): + obj = DataFrame({"A": [1, 2, 3]}) + if klass is Series: + obj = obj["A"] + + f = lambda x, y: y + with pytest.raises(ValueError): + obj.pipe((f, "y"), x=1, y=0) diff --git a/pandas/tests/generic/methods/test_sample.py b/pandas/tests/generic/methods/test_sample.py new file mode 100644 index 0000000000000..7303dad9170ed --- /dev/null +++ b/pandas/tests/generic/methods/test_sample.py @@ -0,0 +1,309 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_under1p17 + +from pandas import DataFrame, Series +import pandas._testing as tm +import pandas.core.common as com + + +class TestSample: + @pytest.fixture(params=[Series, DataFrame]) + def obj(self, request): + klass = request.param + if klass is Series: + arr = np.random.randn(10) + else: + arr = np.random.randn(10, 10) + return klass(arr, dtype=None) + + @pytest.mark.parametrize("test", list(range(10))) + def test_sample(self, test, obj): + # Fixes issue: 2419 + # Check behavior of random_state argument + # Check for stability when receives seed or random state -- run 10 + # times. 
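# Illustrative sketch — not part of the patch; toy data assumed. What the
# stability checks below pin down: passing the same integer seed, or an
# equivalently seeded np.random.RandomState, to `random_state` makes
# DataFrame.sample / Series.sample fully reproducible.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": range(10)})
first = df.sample(n=4, random_state=42)
second = df.sample(n=4, random_state=42)
assert first.equals(second)  # same seed, same rows

# An integer seed behaves like a freshly constructed RandomState.
third = df.sample(n=4, random_state=np.random.RandomState(42))
assert first.equals(third)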
+ + seed = np.random.randint(0, 100) + tm.assert_equal( + obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed) + ) + + tm.assert_equal( + obj.sample(frac=0.7, random_state=seed), + obj.sample(frac=0.7, random_state=seed), + ) + + tm.assert_equal( + obj.sample(n=4, random_state=np.random.RandomState(test)), + obj.sample(n=4, random_state=np.random.RandomState(test)), + ) + + tm.assert_equal( + obj.sample(frac=0.7, random_state=np.random.RandomState(test)), + obj.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) + + tm.assert_equal( + obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)), + obj.sample(frac=2, replace=True, random_state=np.random.RandomState(test)), + ) + + os1, os2 = [], [] + for _ in range(2): + np.random.seed(test) + os1.append(obj.sample(n=4)) + os2.append(obj.sample(frac=0.7)) + tm.assert_equal(*os1) + tm.assert_equal(*os2) + + def test_sample_lengths(self, obj): + # Check lengths are right + assert len(obj.sample(n=4) == 4) + assert len(obj.sample(frac=0.34) == 3) + assert len(obj.sample(frac=0.36) == 4) + + def test_sample_invalid_random_state(self, obj): + # Check for error when random_state argument invalid. + with pytest.raises(ValueError): + obj.sample(random_state="astring!") + + def test_sample_wont_accept_n_and_frac(self, obj): + # Giving both frac and N throws error + with pytest.raises(ValueError): + obj.sample(n=3, frac=0.3) + + def test_sample_requires_positive_n_frac(self, obj): + with pytest.raises(ValueError): + obj.sample(n=-3) + with pytest.raises(ValueError): + obj.sample(frac=-0.3) + + def test_sample_requires_integer_n(self, obj): + # Make sure float values of `n` give error + with pytest.raises(ValueError): + obj.sample(n=3.2) + + def test_sample_invalid_weight_lengths(self, obj): + # Weight length must be right + with pytest.raises(ValueError): + obj.sample(n=3, weights=[0, 1]) + + with pytest.raises(ValueError): + bad_weights = [0.5] * 11 + obj.sample(n=3, weights=bad_weights) + + with pytest.raises(ValueError): + bad_weight_series = Series([0, 0, 0.2]) + obj.sample(n=4, weights=bad_weight_series) + + def test_sample_negative_weights(self, obj): + # Check won't accept negative weights + with pytest.raises(ValueError): + bad_weights = [-0.1] * 10 + obj.sample(n=3, weights=bad_weights) + + def test_sample_inf_weights(self, obj): + # Check inf and -inf throw errors: + + with pytest.raises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + obj.sample(n=3, weights=weights_with_inf) + + with pytest.raises(ValueError): + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + obj.sample(n=3, weights=weights_with_ninf) + + def test_sample_zero_weights(self, obj): + # All zeros raises errors + + zero_weights = [0] * 10 + with pytest.raises(ValueError): + obj.sample(n=3, weights=zero_weights) + + def test_sample_missing_weights(self, obj): + # All missing weights + + nan_weights = [np.nan] * 10 + with pytest.raises(ValueError): + obj.sample(n=3, weights=nan_weights) + + def test_sample_none_weights(self, obj): + # Check None are also replaced by zeros. 
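# Illustrative sketch — not part of the patch. The check below relies on
# missing weights (None / np.nan) being treated as zero weight, so a single
# draw can only come from the one position with a nonzero weight.
import pandas as pd

ser = pd.Series(range(10))
weights = [None] * 10
weights[5] = 0.5
assert ser.sample(n=1, weights=weights).index[0] == 5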
+ weights_with_None = [None] * 10 + weights_with_None[5] = 0.5 + tm.assert_equal( + obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6] + ) + + @pytest.mark.parametrize( + "func_str,arg", + [ + ("np.array", [2, 3, 1, 0]), + pytest.param( + "np.random.MT19937", + 3, + marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), + ), + pytest.param( + "np.random.PCG64", + 11, + marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), + ), + ], + ) + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_sample_random_state(self, func_str, arg, klass): + # GH#32503 + obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) + if klass is Series: + obj = obj["col1"] + result = obj.sample(n=3, random_state=eval(func_str)(arg)) + expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_sample_upsampling_without_replacement(self, klass): + # GH#27451 + + obj = DataFrame({"A": list("abc")}) + if klass is Series: + obj = obj["A"] + + msg = ( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + with pytest.raises(ValueError, match=msg): + obj.sample(frac=2, replace=False) + + +class TestSampleDataFrame: + # Tests which are relevant only for DataFrame, so these are + # as fully parametrized as they can get. + + def test_sample(self): + # GH#2419 + # additional specific object based tests + + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + df = DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) + sample1 = df.sample(n=1, weights="easyweights") + tm.assert_frame_equal(sample1, df.iloc[5:6]) + + # Ensure proper error if string given as weight for Series or + # DataFrame with axis = 1. + ser = Series(range(10)) + with pytest.raises(ValueError): + ser.sample(n=3, weights="weight_column") + + with pytest.raises(ValueError): + df.sample(n=1, weights="weight_column", axis=1) + + # Check weighting key error + with pytest.raises( + KeyError, match="'String passed to weights not a valid column'" + ): + df.sample(n=3, weights="not_a_real_column_name") + + # Check that re-normalizes weights that don't sum to one. 
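# Illustrative sketch — not part of the patch. `weights` need not sum to 1:
# sample() re-normalizes internally, so only relative magnitudes matter.
import pandas as pd

df = pd.DataFrame({"x": range(4)})
# [0.5, 0, 0, 0] sums to 0.5; after renormalization row 0 has probability 1,
# so a single draw deterministically returns row 0.
assert df.sample(n=1, weights=[0.5, 0, 0, 0]).index[0] == 0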
+ weights_less_than_1 = [0] * 10 + weights_less_than_1[0] = 0.5 + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + + ### + # Test axis argument + ### + + # Test axis argument + df = DataFrame({"col1": range(10), "col2": ["a"] * 10}) + second_column_weight = [0, 1] + tm.assert_frame_equal( + df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] + ) + + # Different axis arg types + tm.assert_frame_equal( + df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] + ) + + weight = [0] * 10 + weight[5] = 0.5 + tm.assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) + tm.assert_frame_equal( + df.sample(n=1, axis="index", weights=weight), df.iloc[5:6] + ) + + # Check out of range axis values + with pytest.raises(ValueError): + df.sample(n=1, axis=2) + + with pytest.raises(ValueError): + df.sample(n=1, axis="not_a_name") + + with pytest.raises(ValueError): + ser = Series(range(10)) + ser.sample(n=1, axis=1) + + # Test weight length compared to correct axis + with pytest.raises(ValueError): + df.sample(n=1, axis=1, weights=[0.5] * 10) + + def test_sample_axis1(self): + # Check weights with axis = 1 + easy_weight_list = [0] * 3 + easy_weight_list[2] = 1 + + df = DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) + sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) + tm.assert_frame_equal(sample1, df[["colString"]]) + + # Test default axes + tm.assert_frame_equal( + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) + ) + + def test_sample_aligns_weights_with_frame(self): + + # Test that function aligns weights with frame + df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) + ser = Series([1, 0, 0], index=[3, 5, 9]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + ser2 = Series([0.001, 0, 10000], index=[3, 5, 10]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser2)) + + # Weights have empty values to be filed with zeros + ser3 = Series([0.01, 0], index=[3, 5]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser3)) + + # No overlap in weight and sampled DataFrame indices + ser4 = Series([1, 0], index=[1, 2]) + with pytest.raises(ValueError): + df.sample(1, weights=ser4) + + def test_sample_is_copy(self): + # GH#27357, GH#30784: ensure the result of sample is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + df2 = df.sample(3) + + with tm.assert_produces_warning(None): + df2["d"] = 1 diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index 97468e1f10a8b..42745d2a69375 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -275,7 +275,8 @@ def test_set_flags_with_duplicates(self, cls, axes): result = cls(**axes) assert result.flags.allows_duplicate_labels is True - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." 
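# Illustrative sketch — not part of the patch. Adding `match=` pins the error
# *message* as well as its type, so an unrelated DuplicateLabelError could no
# longer make these tests pass silently.
import pandas as pd
import pytest

ser = pd.Series([0, 1], index=["a", "a"])
with pytest.raises(pd.errors.DuplicateLabelError, match="Index has duplicates."):
    ser.set_flags(allows_duplicate_labels=False)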
+ with pytest.raises(pd.errors.DuplicateLabelError, match=msg): cls(**axes).set_flags(allows_duplicate_labels=False) @pytest.mark.parametrize( @@ -287,7 +288,8 @@ def test_set_flags_with_duplicates(self, cls, axes): ], ) def test_setting_allows_duplicate_labels_raises(self, data): - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): data.flags.allows_duplicate_labels = False assert data.flags.allows_duplicate_labels is True @@ -297,7 +299,8 @@ def test_setting_allows_duplicate_labels_raises(self, data): ) def test_series_raises(self, func): s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): func(s) @pytest.mark.parametrize( @@ -332,7 +335,8 @@ def test_getitem_raises(self, getter, target): else: target = df - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): getter(target) @pytest.mark.parametrize( @@ -352,7 +356,8 @@ def test_getitem_raises(self, getter, target): ], ) def test_concat_raises(self, objs, kwargs): - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.concat(objs, **kwargs) @not_implemented @@ -361,7 +366,8 @@ def test_merge_raises(self): allows_duplicate_labels=False ) b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.merge(a, b, left_index=True, right_index=True) @@ -381,13 +387,14 @@ def test_merge_raises(self): ids=lambda x: type(x).__name__, ) def test_raises_basic(idx): - with pytest.raises(pd.errors.DuplicateLabelError): + msg = "Index has duplicates." 
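# Illustrative sketch — not part of the patch; frame contents are assumed toy
# values. The flag also guards operations whose *result* would contain
# duplicates, e.g. selecting the same column twice.
import pandas as pd
import pytest

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).set_flags(
    allows_duplicate_labels=False
)
with pytest.raises(pd.errors.DuplicateLabelError):
    df[["A", "A"]]  # result would carry duplicate column labels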
+ with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) - with pytest.raises(pd.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) - with pytest.raises(pd.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) @@ -412,7 +419,8 @@ def test_format_duplicate_labels_message_multi(): def test_dataframe_insert_raises(): df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) - with pytest.raises(ValueError, match="Cannot specify"): + msg = "Cannot specify" + with pytest.raises(ValueError, match=msg): df.insert(0, "A", [3, 4], allow_duplicates=True) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 25c926b1de4c6..d7aadda990f53 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -115,10 +115,7 @@ (pd.DataFrame, frame_data, operator.methodcaller("notnull")), (pd.DataFrame, frame_data, operator.methodcaller("dropna")), (pd.DataFrame, frame_data, operator.methodcaller("drop_duplicates")), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("duplicated")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("duplicated")), (pd.DataFrame, frame_data, operator.methodcaller("sort_values", by="A")), (pd.DataFrame, frame_data, operator.methodcaller("sort_index")), (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")), @@ -157,10 +154,7 @@ ), marks=not_implemented_mark, ), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")), pytest.param( ( pd.DataFrame, @@ -169,18 +163,12 @@ ), marks=not_implemented_mark, ), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("stack")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("stack")), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("explode", "A")), marks=not_implemented_mark, ), - pytest.param( - (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")), pytest.param( ( pd.DataFrame, @@ -690,7 +678,9 @@ def test_datetime_method(method): "microsecond", "nanosecond", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "is_month_start", "is_month_end", @@ -762,13 +752,35 @@ def test_categorical_accessor(method): [ operator.methodcaller("sum"), lambda x: x.agg("sum"), + ], +) +def test_groupby_finalize(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ lambda x: x.agg(["sum", "count"]), lambda x: x.transform(lambda y: y), lambda x: x.apply(lambda y: y), ], ) @not_implemented_mark -def test_groupby(obj, method): +def test_groupby_finalize_not_implemented(obj, method): obj.attrs = {"a": 1} result = method(obj.groupby([0, 0])) assert result.attrs == {"a": 1} + + +def test_finalize_frame_series_name(): + # 
https://github.com/pandas-dev/pandas/pull/37186/files#r506978889 + # ensure we don't copy the column `name` to the Series. + df = pd.DataFrame({"name": [1, 2]}) + result = pd.Series([1, 2]).__finalize__(df) + assert result.name is None diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index ad0d1face53cf..da02a82890adc 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -15,16 +15,9 @@ class TestDataFrame(Generic): _typ = DataFrame _comparator = lambda self, x, y: tm.assert_frame_equal(x, y) - def test_rename_mi(self): - df = DataFrame( - [11, 21, 31], - index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), - ) - df.rename(str.lower) - @pytest.mark.parametrize("func", ["_set_axis_name", "rename_axis"]) def test_set_axis_name(self, func): - df = pd.DataFrame([[1, 2], [3, 4]]) + df = DataFrame([[1, 2], [3, 4]]) result = methodcaller(func, "foo")(df) assert df.index.name is None @@ -76,8 +69,7 @@ def test_get_numeric_data_preserve_dtype(self): expected = DataFrame(index=[0, 1, 2], dtype=object) self._compare(result, expected) - def test_metadata_propagation_indiv(self): - + def test_metadata_propagation_indiv_groupby(self): # groupby df = DataFrame( { @@ -90,6 +82,7 @@ def test_metadata_propagation_indiv(self): result = df.groupby("A").sum() self.check_metadata(df, result) + def test_metadata_propagation_indiv_resample(self): # resample df = DataFrame( np.random.randn(1000, 2), @@ -98,6 +91,7 @@ def test_metadata_propagation_indiv(self): result = df.resample("1T") self.check_metadata(df, result) + def test_metadata_propagation_indiv(self): # merging with override # GH 6923 _metadata = DataFrame._metadata diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 2c2584e8dee01..cccae1babe82f 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -3,14 +3,11 @@ import numpy as np import pytest -from pandas.compat.numpy import np_version_under1p17 - from pandas.core.dtypes.common import is_scalar import pandas as pd from pandas import DataFrame, Series, date_range import pandas._testing as tm -import pandas.core.common as com # ---------------------------------------------------------------------- # Generic types test cases @@ -276,154 +273,6 @@ def test_head_tail(self, index): self._compare(o.head(-3), o.head(len(index) - 3)) self._compare(o.tail(-3), o.tail(len(index) - 3)) - def test_sample(self): - # Fixes issue: 2419 - - o = self._construct(shape=10) - - ### - # Check behavior of random_state argument - ### - - # Check for stability when receives seed or random state -- run 10 - # times. 
- for test in range(10): - seed = np.random.randint(0, 100) - self._compare( - o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) - ) - - self._compare( - o.sample(frac=0.7, random_state=seed), - o.sample(frac=0.7, random_state=seed), - ) - - self._compare( - o.sample(n=4, random_state=np.random.RandomState(test)), - o.sample(n=4, random_state=np.random.RandomState(test)), - ) - - self._compare( - o.sample(frac=0.7, random_state=np.random.RandomState(test)), - o.sample(frac=0.7, random_state=np.random.RandomState(test)), - ) - - self._compare( - o.sample( - frac=2, replace=True, random_state=np.random.RandomState(test) - ), - o.sample( - frac=2, replace=True, random_state=np.random.RandomState(test) - ), - ) - - os1, os2 = [], [] - for _ in range(2): - np.random.seed(test) - os1.append(o.sample(n=4)) - os2.append(o.sample(frac=0.7)) - self._compare(*os1) - self._compare(*os2) - - # Check for error when random_state argument invalid. - with pytest.raises(ValueError): - o.sample(random_state="astring!") - - ### - # Check behavior of `frac` and `N` - ### - - # Giving both frac and N throws error - with pytest.raises(ValueError): - o.sample(n=3, frac=0.3) - - # Check that raises right error for negative lengths - with pytest.raises(ValueError): - o.sample(n=-3) - with pytest.raises(ValueError): - o.sample(frac=-0.3) - - # Make sure float values of `n` give error - with pytest.raises(ValueError): - o.sample(n=3.2) - - # Check lengths are right - assert len(o.sample(n=4) == 4) - assert len(o.sample(frac=0.34) == 3) - assert len(o.sample(frac=0.36) == 4) - - ### - # Check weights - ### - - # Weight length must be right - with pytest.raises(ValueError): - o.sample(n=3, weights=[0, 1]) - - with pytest.raises(ValueError): - bad_weights = [0.5] * 11 - o.sample(n=3, weights=bad_weights) - - with pytest.raises(ValueError): - bad_weight_series = Series([0, 0, 0.2]) - o.sample(n=4, weights=bad_weight_series) - - # Check won't accept negative weights - with pytest.raises(ValueError): - bad_weights = [-0.1] * 10 - o.sample(n=3, weights=bad_weights) - - # Check inf and -inf throw errors: - with pytest.raises(ValueError): - weights_with_inf = [0.1] * 10 - weights_with_inf[0] = np.inf - o.sample(n=3, weights=weights_with_inf) - - with pytest.raises(ValueError): - weights_with_ninf = [0.1] * 10 - weights_with_ninf[0] = -np.inf - o.sample(n=3, weights=weights_with_ninf) - - # All zeros raises errors - zero_weights = [0] * 10 - with pytest.raises(ValueError): - o.sample(n=3, weights=zero_weights) - - # All missing weights - nan_weights = [np.nan] * 10 - with pytest.raises(ValueError): - o.sample(n=3, weights=nan_weights) - - # Check np.nan are replaced by zeros. - weights_with_nan = [np.nan] * 10 - weights_with_nan[5] = 0.5 - self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) - - # Check None are also replaced by zeros. - weights_with_None = [None] * 10 - weights_with_None[5] = 0.5 - self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) - - def test_sample_upsampling_without_replacement(self): - # GH27451 - - df = pd.DataFrame({"A": list("abc")}) - msg = ( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." 
- ) - with pytest.raises(ValueError, match=msg): - df.sample(frac=2, replace=False) - - def test_sample_is_copy(self): - # GH-27357, GH-30784: ensure the result of sample is an actual copy and - # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings - df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) - df2 = df.sample(3) - - with tm.assert_produces_warning(None): - df2["d"] = 1 - def test_size_compat(self): # GH8846 # size property should be defined @@ -534,140 +383,6 @@ def test_pct_change(self, periods, fill_method, limit, exp): class TestNDFrame: # tests that don't fit elsewhere - def test_sample(sel): - # Fixes issue: 2419 - # additional specific object based tests - - # A few dataframe test with degenerate weights. - easy_weight_list = [0] * 10 - easy_weight_list[5] = 1 - - df = pd.DataFrame( - { - "col1": range(10, 20), - "col2": range(20, 30), - "colString": ["a"] * 10, - "easyweights": easy_weight_list, - } - ) - sample1 = df.sample(n=1, weights="easyweights") - tm.assert_frame_equal(sample1, df.iloc[5:6]) - - # Ensure proper error if string given as weight for Series or - # DataFrame with axis = 1. - s = Series(range(10)) - with pytest.raises(ValueError): - s.sample(n=3, weights="weight_column") - - with pytest.raises(ValueError): - df.sample(n=1, weights="weight_column", axis=1) - - # Check weighting key error - with pytest.raises( - KeyError, match="'String passed to weights not a valid column'" - ): - df.sample(n=3, weights="not_a_real_column_name") - - # Check that re-normalizes weights that don't sum to one. - weights_less_than_1 = [0] * 10 - weights_less_than_1[0] = 0.5 - tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) - - ### - # Test axis argument - ### - - # Test axis argument - df = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) - second_column_weight = [0, 1] - tm.assert_frame_equal( - df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] - ) - - # Different axis arg types - tm.assert_frame_equal( - df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] - ) - - weight = [0] * 10 - weight[5] = 0.5 - tm.assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) - tm.assert_frame_equal( - df.sample(n=1, axis="index", weights=weight), df.iloc[5:6] - ) - - # Check out of range axis values - with pytest.raises(ValueError): - df.sample(n=1, axis=2) - - with pytest.raises(ValueError): - df.sample(n=1, axis="not_a_name") - - with pytest.raises(ValueError): - s = pd.Series(range(10)) - s.sample(n=1, axis=1) - - # Test weight length compared to correct axis - with pytest.raises(ValueError): - df.sample(n=1, axis=1, weights=[0.5] * 10) - - # Check weights with axis = 1 - easy_weight_list = [0] * 3 - easy_weight_list[2] = 1 - - df = pd.DataFrame( - {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} - ) - sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) - tm.assert_frame_equal(sample1, df[["colString"]]) - - # Test default axes - tm.assert_frame_equal( - df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) - ) - - # Test that function aligns weights with frame - df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) - s = Series([1, 0, 0], index=[3, 5, 9]) - tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) - - # Weights have index values to be dropped because not in - # sampled DataFrame - s2 = Series([0.001, 0, 10000], index=[3, 5, 10]) - 
tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2)) - - # Weights have empty values to be filed with zeros - s3 = Series([0.01, 0], index=[3, 5]) - tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3)) - - # No overlap in weight and sampled DataFrame indices - s4 = Series([1, 0], index=[1, 2]) - with pytest.raises(ValueError): - df.sample(1, weights=s4) - - @pytest.mark.parametrize( - "func_str,arg", - [ - ("np.array", [2, 3, 1, 0]), - pytest.param( - "np.random.MT19937", - 3, - marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), - ), - pytest.param( - "np.random.PCG64", - 11, - marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), - ), - ], - ) - def test_sample_random_state(self, func_str, arg): - # GH32503 - df = pd.DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) - result = df.sample(n=3, random_state=eval(func_str)(arg)) - expected = df.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) - tm.assert_frame_equal(result, expected) - def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: @@ -837,34 +552,6 @@ def test_equals(self): df2 = df1.set_index(["floats"], append=True) assert df3.equals(df2) - def test_pipe(self): - df = DataFrame({"A": [1, 2, 3]}) - f = lambda x, y: x ** y - result = df.pipe(f, 2) - expected = DataFrame({"A": [1, 4, 9]}) - tm.assert_frame_equal(result, expected) - - result = df.A.pipe(f, 2) - tm.assert_series_equal(result, expected.A) - - def test_pipe_tuple(self): - df = DataFrame({"A": [1, 2, 3]}) - f = lambda x, y: y - result = df.pipe((f, "y"), 0) - tm.assert_frame_equal(result, df) - - result = df.A.pipe((f, "y"), 0) - tm.assert_series_equal(result, df.A) - - def test_pipe_tuple_error(self): - df = DataFrame({"A": [1, 2, 3]}) - f = lambda x, y: y - with pytest.raises(ValueError): - df.pipe((f, "y"), x=1, y=0) - - with pytest.raises(ValueError): - df.A.pipe((f, "y"), x=1, y=0) - @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) def test_axis_classmethods(self, box): obj = box(dtype=object) @@ -890,7 +577,7 @@ def test_axis_numbers_deprecated(self, box): @pytest.mark.parametrize("as_frame", [True, False]) def test_flags_identity(self, as_frame): - s = pd.Series([1, 2]) + s = Series([1, 2]) if as_frame: s = s.to_frame() diff --git a/pandas/tests/generic/test_logical_ops.py b/pandas/tests/generic/test_logical_ops.py new file mode 100644 index 0000000000000..58185cbd9a40f --- /dev/null +++ b/pandas/tests/generic/test_logical_ops.py @@ -0,0 +1,49 @@ +""" +Shareable tests for &, |, ^ +""" +import operator + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestLogicalOps: + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_logical_operators_nans(self, left, right, op, expected, klass): + # GH#13896 + result = op(klass(left), klass(right)) + expected = klass(expected) + + tm.assert_equal(result, expected) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 
07c02330d85ce..0a05a42f0fc39 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -14,13 +14,6 @@ class TestSeries(Generic): _typ = Series _comparator = lambda self, x, y: tm.assert_series_equal(x, y) - def test_rename_mi(self): - s = Series( - [11, 21, 31], - index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), - ) - s.rename(str.lower) - @pytest.mark.parametrize("func", ["rename_axis", "_set_axis_name"]) def test_set_axis_name_mi(self, func): s = Series( @@ -37,7 +30,7 @@ def test_set_axis_name_mi(self, func): assert result.index.names, ["L1", "L2"] def test_set_axis_name_raises(self): - s = pd.Series([1]) + s = Series([1]) msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): s._set_axis_name(name="a", axis=1) @@ -104,17 +97,7 @@ def test_nonzero_single_element(self): with pytest.raises(ValueError, match=msg): s.bool() - def test_metadata_propagation_indiv(self): - # check that the metadata matches up on the resulting ops - - o = Series(range(3), range(3)) - o.name = "foo" - o2 = Series(range(3), range(3)) - o2.name = "bar" - - result = o.T - self.check_metadata(o, result) - + def test_metadata_propagation_indiv_resample(self): # resample ts = Series( np.random.rand(1000), @@ -130,6 +113,17 @@ def test_metadata_propagation_indiv(self): result = ts.resample("1T").apply(lambda x: x.sum()) self.check_metadata(ts, result) + def test_metadata_propagation_indiv(self): + # check that the metadata matches up on the resulting ops + + o = Series(range(3), range(3)) + o.name = "foo" + o2 = Series(range(3), range(3)) + o2.name = "bar" + + result = o.T + self.check_metadata(o, result) + _metadata = Series._metadata _finalize = Series.__finalize__ Series._metadata = ["name", "filename"] @@ -157,25 +151,3 @@ def finalize(self, other, method=None, **kwargs): # reset Series._metadata = _metadata Series.__finalize__ = _finalize # FIXME: use monkeypatch - - -class TestSeries2: - # Separating off because it doesnt rely on parent class - @pytest.mark.parametrize( - "s", - [ - Series([np.arange(5)]), - pd.date_range("1/1/2011", periods=24, freq="H"), - pd.Series(range(5), index=pd.date_range("2017", periods=5)), - ], - ) - @pytest.mark.parametrize("shift_size", [0, 1, 2]) - def test_shift_always_copy(self, s, shift_size): - # GH22397 - assert s.shift(shift_size) is not s - - @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) - def test_datetime_shift_always_copy(self, move_by_freq): - # GH22397 - s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) - assert s.shift(freq=move_by_freq) is not s diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4a0ea5f520873..dba039b66d22d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -20,7 +20,7 @@ def test_groupby_agg_no_extra_calls(): # GH#31760 - df = pd.DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]}) + df = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]}) gb = df.groupby("key")["value"] def dummy_func(x): @@ -115,13 +115,13 @@ def test_groupby_aggregation_multi_level_column(): [True, True, np.nan, False], [True, True, np.nan, False], ] - df = pd.DataFrame( + df = DataFrame( data=lst, columns=pd.MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), ) result = df.groupby(level=1, axis=1).sum() - expected = pd.DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + 
expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) tm.assert_frame_equal(result, expected) @@ -132,7 +132,7 @@ def test_agg_apply_corner(ts, tsframe): assert ts.dtype == np.float64 # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) + exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64)) tm.assert_series_equal(grouped.sum(), exp) tm.assert_series_equal(grouped.agg(np.sum), exp) tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) @@ -140,7 +140,7 @@ def test_agg_apply_corner(ts, tsframe): # DataFrame grouped = tsframe.groupby(tsframe["A"] * np.nan) exp_df = DataFrame( - columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64) + columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64) ) tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) @@ -213,14 +213,10 @@ def test_aggregate_item_by_item(df): # GH5782 # odd comparisons can result here, so cast to make easy - exp = pd.Series( - np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo" - ) + exp = Series(np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo") tm.assert_series_equal(result.xs("foo"), exp) - exp = pd.Series( - np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar" - ) + exp = Series(np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar") tm.assert_almost_equal(result.xs("bar"), exp) def aggfun(ser): @@ -257,7 +253,7 @@ def test_agg_multiple_functions_maintain_order(df): def test_agg_multiple_functions_same_name(): # GH 30880 - df = pd.DataFrame( + df = DataFrame( np.random.randn(1000, 3), index=pd.date_range("1/1/2012", freq="S", periods=1000), columns=["A", "B", "C"], @@ -270,7 +266,7 @@ def test_agg_multiple_functions_same_name(): expected_values = np.array( [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T - expected = pd.DataFrame( + expected = DataFrame( expected_values, columns=expected_columns, index=expected_index ) tm.assert_frame_equal(result, expected) @@ -279,7 +275,7 @@ def test_agg_multiple_functions_same_name(): def test_agg_multiple_functions_same_name_with_ohlc_present(): # GH 30880 # ohlc expands dimensions, so different test to the above is required. 
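# Illustrative sketch — not part of the patch. Unlike a scalar reducer, ohlc
# emits four columns (open/high/low/close) per group, which is why the ohlc
# variant below needs its own expected-column bookkeeping.
import numpy as np
import pandas as pd

ser = pd.Series(
    np.arange(6.0), index=pd.date_range("2012-01-01", freq="T", periods=6)
)
out = ser.resample("3T").ohlc()
assert list(out.columns) == ["open", "high", "low", "close"]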
- df = pd.DataFrame( + df = DataFrame( np.random.randn(1000, 3), index=pd.date_range("1/1/2012", freq="S", periods=1000), columns=["A", "B", "C"], @@ -302,7 +298,7 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values]) - expected = pd.DataFrame( + expected = DataFrame( expected_values, columns=expected_columns, index=expected_index ) # PerformanceWarning is thrown by `assert col in right` in assert_frame_equal @@ -386,7 +382,7 @@ def test_multi_function_flexible_mix(df): def test_groupby_agg_coercing_bools(): # issue 14873 - dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) + dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) gp = dat.groupby("a") index = Index([1, 2], name="a") @@ -414,7 +410,7 @@ def test_groupby_agg_coercing_bools(): def test_bool_agg_dtype(op): # GH 7001 # Bool sum aggregations result in int - df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) + df = DataFrame({"a": [1, 1], "b": [False, True]}) s = df.set_index("a")["b"] result = op(df.groupby("a"))["b"].dtype @@ -426,12 +422,12 @@ def test_bool_agg_dtype(op): def test_order_aggregate_multiple_funcs(): # GH 25692 - df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) + df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) result = res.columns.levels[1] - expected = pd.Index(["sum", "max", "mean", "ohlc", "min"]) + expected = Index(["sum", "max", "mean", "ohlc", "min"]) tm.assert_index_equal(result, expected) @@ -440,7 +436,7 @@ def test_order_aggregate_multiple_funcs(): @pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) def test_uint64_type_handling(dtype, how): # GH 26310 - df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]}) + df = DataFrame({"x": 6903052872240755750, "y": [1, 2]}) expected = df.groupby("y").agg({"x": how}) df.x = df.x.astype(dtype) result = df.groupby("y").agg({"x": how}) @@ -451,7 +447,7 @@ def test_uint64_type_handling(dtype, how): def test_func_duplicates_raises(): # GH28426 msg = "Function names" - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) with pytest.raises(SpecificationError, match=msg): df.groupby("A").agg(["min", "min"]) @@ -475,7 +471,7 @@ def test_agg_index_has_complex_internals(index): def test_agg_split_block(): # https://github.com/pandas-dev/pandas/issues/31522 - df = pd.DataFrame( + df = DataFrame( { "key1": ["a", "a", "b", "b", "a"], "key2": ["one", "two", "one", "two", "one"], @@ -483,16 +479,16 @@ def test_agg_split_block(): } ) result = df.groupby("key1").min() - expected = pd.DataFrame( + expected = DataFrame( {"key2": ["one", "one"], "key3": ["six", "six"]}, - index=pd.Index(["a", "b"], name="key1"), + index=Index(["a", "b"], name="key1"), ) tm.assert_frame_equal(result, expected) def test_agg_split_object_part_datetime(): # https://github.com/pandas-dev/pandas/pull/31616 - df = pd.DataFrame( + df = DataFrame( { "A": pd.date_range("2000", periods=4), "B": ["a", "b", "c", "d"], @@ -503,7 +499,7 @@ def test_agg_split_object_part_datetime(): } ).astype(object) result = df.groupby([0, 0, 0, 0]).min() - expected = pd.DataFrame( + expected = DataFrame( { "A": [pd.Timestamp("2000")], "B": ["a"], @@ -518,10 +514,10 @@ def 
test_agg_split_object_part_datetime(): class TestNamedAggregationSeries: def test_series_named_agg(self): - df = pd.Series([1, 2, 3, 4]) + df = Series([1, 2, 3, 4]) gr = df.groupby([0, 0, 1, 1]) result = gr.agg(a="sum", b="min") - expected = pd.DataFrame( + expected = DataFrame( {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1] ) tm.assert_frame_equal(result, expected) @@ -531,26 +527,26 @@ def test_series_named_agg(self): tm.assert_frame_equal(result, expected) def test_no_args_raises(self): - gr = pd.Series([1, 2]).groupby([0, 1]) + gr = Series([1, 2]).groupby([0, 1]) with pytest.raises(TypeError, match="Must provide"): gr.agg() # but we do allow this result = gr.agg([]) - expected = pd.DataFrame() + expected = DataFrame() tm.assert_frame_equal(result, expected) def test_series_named_agg_duplicates_no_raises(self): # GH28426 - gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + gr = Series([1, 2, 3]).groupby([0, 0, 1]) grouped = gr.agg(a="sum", b="sum") - expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]}) + expected = DataFrame({"a": [3, 3], "b": [3, 3]}) tm.assert_frame_equal(expected, grouped) def test_mangled(self): - gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + gr = Series([1, 2, 3]).groupby([0, 0, 1]) result = gr.agg(a=lambda x: 0, b=lambda x: 1) - expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]}) + expected = DataFrame({"a": [0, 0], "b": [1, 1]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -563,7 +559,7 @@ def test_mangled(self): ) def test_named_agg_nametuple(self, inp): # GH34422 - s = pd.Series([1, 1, 2, 2, 3, 3, 4, 5]) + s = Series([1, 1, 2, 2, 3, 3, 4, 5]) msg = f"func is expected but received {type(inp).__name__}" with pytest.raises(TypeError, match=msg): s.groupby(s.values).agg(a=inp) @@ -571,13 +567,13 @@ def test_named_agg_nametuple(self, inp): class TestNamedAggregationDataFrame: def test_agg_relabel(self): - df = pd.DataFrame( + df = DataFrame( {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} ) result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max")) - expected = pd.DataFrame( + expected = DataFrame( {"a_max": [1, 3], "b_max": [6, 8]}, - index=pd.Index(["a", "b"], name="group"), + index=Index(["a", "b"], name="group"), columns=["a_max", "b_max"], ) tm.assert_frame_equal(result, expected) @@ -592,7 +588,7 @@ def test_agg_relabel(self): b_max=("B", "max"), a_98=("A", p98), ) - expected = pd.DataFrame( + expected = DataFrame( { "b_min": [5, 7], "a_min": [0, 2], @@ -601,31 +597,27 @@ def test_agg_relabel(self): "b_max": [6, 8], "a_98": [0.98, 2.98], }, - index=pd.Index(["a", "b"], name="group"), + index=Index(["a", "b"], name="group"), columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): - df = pd.DataFrame( + df = DataFrame( {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} ) result = df.groupby("group").agg(**{"my col": ("A", "max")}) - expected = pd.DataFrame( - {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group") - ) + expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group")) tm.assert_frame_equal(result, expected) def test_duplicate_no_raises(self): # GH 28426, if use same input function on same column, # no error should raise - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) - expected = pd.DataFrame( - {"a": [1, 3], "b": [1, 
3]}, index=pd.Index([0, 1], name="A") - ) + expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A")) tm.assert_frame_equal(grouped, expected) quant50 = functools.partial(np.percentile, q=50) @@ -633,34 +625,32 @@ def test_duplicate_no_raises(self): quant50.__name__ = "quant50" quant70.__name__ = "quant70" - test = pd.DataFrame( - {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} - ) + test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}) grouped = test.groupby("col1").agg( quantile_50=("col2", quant50), quantile_70=("col2", quant70) ) - expected = pd.DataFrame( + expected = DataFrame( {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, - index=pd.Index(["a", "b"], name="col1"), + index=Index(["a", "b"], name="col1"), ) tm.assert_frame_equal(grouped, expected) def test_agg_relabel_with_level(self): - df = pd.DataFrame( + df = DataFrame( {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]), ) result = df.groupby(level=0).agg( aa=("A", "max"), bb=("A", "min"), cc=("B", "mean") ) - expected = pd.DataFrame( + expected = DataFrame( {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"] ) tm.assert_frame_equal(result, expected) def test_agg_relabel_other_raises(self): - df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) grouped = df.groupby("A") match = "Must provide" with pytest.raises(TypeError, match=match): @@ -673,12 +663,12 @@ def test_agg_relabel_other_raises(self): grouped.agg(a=("B", "max"), b=(1, 2, 3)) def test_missing_raises(self): - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + df = DataFrame({"A": [0, 1], "B": [1, 2]}) with pytest.raises(KeyError, match="Column 'C' does not exist"): df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + df = DataFrame({"A": [0, 1], "B": [1, 2]}) result = df.groupby("A").agg( b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count") ) @@ -686,11 +676,9 @@ def test_agg_namedtuple(self): tm.assert_frame_equal(result, expected) def test_mangled(self): - df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) + df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1)) - expected = pd.DataFrame( - {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A") - ) + expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A")) tm.assert_frame_equal(result, expected) @@ -731,7 +719,7 @@ def test_agg_relabel_multiindex_column( {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - idx = pd.Index(["a", "b"], name=("x", "group")) + idx = Index(["a", "b"], name=("x", "group")) result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")) expected = DataFrame({"a_max": [1, 3]}, index=idx) @@ -769,7 +757,7 @@ def test_agg_relabel_multiindex_duplicates(): result = df.groupby(("x", "group")).agg( a=(("y", "A"), "min"), b=(("y", "A"), "min") ) - idx = pd.Index(["a", "b"], name=("x", "group")) + idx = Index(["a", "b"], name=("x", "group")) expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) tm.assert_frame_equal(result, expected) @@ -777,11 +765,11 @@ def test_agg_relabel_multiindex_duplicates(): @pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}]) def test_groupby_aggregate_empty_key(kwargs): # GH: 
32580 - df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) result = df.groupby("a").agg(kwargs) - expected = pd.DataFrame( + expected = DataFrame( [1, 4], - index=pd.Index([1, 2], dtype="int64", name="a"), + index=Index([1, 2], dtype="int64", name="a"), columns=pd.MultiIndex.from_tuples([["c", "min"]]), ) tm.assert_frame_equal(result, expected) @@ -789,9 +777,9 @@ def test_groupby_aggregate_empty_key(kwargs): def test_groupby_aggregate_empty_key_empty_return(): # GH: 32580 Check if everything works, when return is empty - df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) result = df.groupby("a").agg({"b": []}) - expected = pd.DataFrame(columns=pd.MultiIndex(levels=[["b"], []], codes=[[], []])) + expected = DataFrame(columns=pd.MultiIndex(levels=[["b"], []], codes=[[], []])) tm.assert_frame_equal(result, expected) @@ -799,13 +787,13 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel(): # GH 32240: When the aggregate function relabels column names and # as_index=False is specified, the results are dropped. - df = pd.DataFrame( + df = DataFrame( {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} ) grouped = df.groupby("key", as_index=False) result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) + expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) tm.assert_frame_equal(result, expected) @@ -814,7 +802,7 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): # as_index=False is specified, the results are dropped. 
Check if # multiindex is returned in the right order - df = pd.DataFrame( + df = DataFrame( { "key": ["x", "y", "x", "y", "x", "x"], "key1": ["a", "b", "c", "b", "a", "c"], @@ -824,7 +812,7 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): grouped = df.groupby(["key", "key1"], as_index=False) result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - expected = pd.DataFrame( + expected = DataFrame( {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} ) tm.assert_frame_equal(result, expected) @@ -836,10 +824,10 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): def test_multiindex_custom_func(func): # GH 31777 data = [[1, 4, 2], [5, 7, 1]] - df = pd.DataFrame(data, columns=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) + df = DataFrame(data, columns=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) result = df.groupby(np.array([0, 1])).agg(func) expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} - expected = pd.DataFrame(expected_dict) + expected = DataFrame(expected_dict) tm.assert_frame_equal(result, expected) @@ -872,13 +860,13 @@ def test_lambda_named_agg(func): def test_aggregate_mixed_types(): # GH 16916 - df = pd.DataFrame( + df = DataFrame( data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") ) df["grouping"] = ["group 1", "group 1", 2] result = df.groupby("grouping").aggregate(lambda x: x.tolist()) expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] - expected = pd.DataFrame( + expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), columns=Index(["X", "Y", "Z"], dtype="object"), @@ -901,9 +889,9 @@ def aggfunc(x): else: return pd.NA - df = pd.DataFrame({"A": pd.array([1, 2, 3])}) + df = DataFrame({"A": pd.array([1, 2, 3])}) result = df.groupby([1, 1, 2]).agg(aggfunc) - expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) + expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) tm.assert_frame_equal(result, expected) @@ -912,11 +900,11 @@ def test_groupby_aggregate_period_column(func): # GH 31471 groups = [1, 2] periods = pd.period_range("2020", periods=2, freq="Y") - df = pd.DataFrame({"a": groups, "b": periods}) + df = DataFrame({"a": groups, "b": periods}) result = getattr(df.groupby("a")["b"], func)() idx = pd.Int64Index([1, 2], name="a") - expected = pd.Series(periods, index=idx, name="b") + expected = Series(periods, index=idx, name="b") tm.assert_series_equal(result, expected) @@ -926,47 +914,47 @@ def test_groupby_aggregate_period_frame(func): # GH 31471 groups = [1, 2] periods = pd.period_range("2020", periods=2, freq="Y") - df = pd.DataFrame({"a": groups, "b": periods}) + df = DataFrame({"a": groups, "b": periods}) result = getattr(df.groupby("a"), func)() idx = pd.Int64Index([1, 2], name="a") - expected = pd.DataFrame({"b": periods}, index=idx) + expected = DataFrame({"b": periods}, index=idx) tm.assert_frame_equal(result, expected) class TestLambdaMangling: def test_basic(self): - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) - expected = pd.DataFrame( + expected = DataFrame( {("B", ""): [0, 0], ("B", ""): [1, 1]}, - index=pd.Index([0, 1], name="A"), + index=Index([0, 1], name="A"), ) tm.assert_frame_equal(result, expected) def test_mangle_series_groupby(self): - gr = pd.Series([1, 
2, 3, 4]).groupby([0, 0, 1, 1]) + gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) result = gr.agg([lambda x: 0, lambda x: 1]) - expected = pd.DataFrame({"": [0, 0], "": [1, 1]}) + expected = DataFrame({"": [0, 0], "": [1, 1]}) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.") def test_with_kwargs(self): f1 = lambda x, y, b=1: x.sum() + y + b f2 = lambda x, y, b=2: x.sum() + y * b - result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) - expected = pd.DataFrame({"": [4], "": [6]}) + result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) + expected = DataFrame({"": [4], "": [6]}) tm.assert_frame_equal(result, expected) - result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) - expected = pd.DataFrame({"": [13], "": [30]}) + result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) + expected = DataFrame({"": [13], "": [30]}) tm.assert_frame_equal(result, expected) def test_agg_with_one_lambda(self): # GH 25719, write tests for DataFrameGroupby.agg with only one lambda - df = pd.DataFrame( + df = DataFrame( { "kind": ["cat", "dog", "cat", "dog"], "height": [9.1, 6.0, 9.5, 34.0], @@ -975,13 +963,13 @@ def test_agg_with_one_lambda(self): ) columns = ["height_sqr_min", "height_max", "weight_max"] - expected = pd.DataFrame( + expected = DataFrame( { "height_sqr_min": [82.81, 36.00], "height_max": [9.5, 34.0], "weight_max": [9.9, 198.0], }, - index=pd.Index(["cat", "dog"], name="kind"), + index=Index(["cat", "dog"], name="kind"), columns=columns, ) @@ -1006,7 +994,7 @@ def test_agg_with_one_lambda(self): def test_agg_multiple_lambda(self): # GH25719, test for DataFrameGroupby.agg with multiple lambdas # with mixed aggfunc - df = pd.DataFrame( + df = DataFrame( { "kind": ["cat", "dog", "cat", "dog"], "height": [9.1, 6.0, 9.5, 34.0], @@ -1020,7 +1008,7 @@ def test_agg_multiple_lambda(self): "height_max_2", "weight_min", ] - expected = pd.DataFrame( + expected = DataFrame( { "height_sqr_min": [82.81, 36.00], "height_max": [9.5, 34.0], @@ -1028,7 +1016,7 @@ def test_agg_multiple_lambda(self): "height_max_2": [9.5, 34.0], "weight_min": [7.9, 7.5], }, - index=pd.Index(["cat", "dog"], name="kind"), + index=Index(["cat", "dog"], name="kind"), columns=columns, ) @@ -1057,9 +1045,9 @@ def test_agg_multiple_lambda(self): def test_groupby_get_by_index(): # GH 33439 - df = pd.DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]}) + df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]}) res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) - expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") + expected = DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) @@ -1075,7 +1063,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): # test single aggregations on ordered categorical cols GHGH27800 # create the result dataframe - input_df = pd.DataFrame( + input_df = DataFrame( { "nr": [1, 2, 3, 4, 5, 6, 7, 8], "cat_ord": list("aabbccdd"), @@ -1092,7 +1080,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" ) - expected_df = pd.DataFrame(data=exp_data, index=cat_index) + expected_df = DataFrame(data=exp_data, index=cat_index) tm.assert_frame_equal(result_df, expected_df) @@ -1109,7 +1097,7 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): # test combined aggregations on ordered categorical cols GH27800 # create the result dataframe - 
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 87ebd8b5a27fb..c907391917ca8 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -176,7 +176,7 @@ def test__cython_agg_general(op, targop): ], ) def test_cython_agg_empty_buckets(op, targop, observed): - df = pd.DataFrame([11, 12, 13]) + df = DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API
@@ -192,14 +192,14 @@ def test_cython_agg_empty_buckets(op, targop, observed): def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these - df = pd.DataFrame([11, 12, 13], columns=["a"]) + df = DataFrame([11, 12, 13], columns=["a"]) grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "add" ) intervals = pd.interval_range(0, 20, freq=5) - expected = pd.DataFrame( + expected = DataFrame( {"a": [0, 0, 36, 0]}, index=pd.CategoricalIndex(intervals, name="a", ordered=True), )
@@ -212,7 +212,7 @@ def test_cython_agg_empty_buckets_nanops(observed): result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "prod" ) - expected = pd.DataFrame( + expected = DataFrame( {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name="a", ordered=True), )
@@ -277,3 +277,38 @@ def test_read_only_buffer_source_agg(agg): expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + "median", + "prod", + "min", + "max", + ], +) +def test_cython_agg_nullable_int(op_name): + # ensure that the cython-based aggregations don't fail for nullable dtype + # (eg https://github.com/pandas-dev/pandas/issues/37415) + df =
DataFrame( + { + "A": ["A", "B"] * 5, + "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), + } + ) + result = getattr(df.groupby("A")["B"], op_name)() + df2 = df.assign(B=df["B"].astype("float64")) + expected = getattr(df2.groupby("A")["B"], op_name)() + + if op_name != "count": + # the result is not yet consistently using Int64/Float64 dtype, + # so for now just checking the values by casting to float + result = result.astype("float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index e8cd6017a117c..15803d4b0ef94 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -130,11 +130,11 @@ def test_agg_dict_parameter_cast_result_dtypes(): tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) # count - exp = pd.Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") + exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.agg(len), exp) tm.assert_series_equal(grouped.time.size(), exp) - exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") + exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.count(), exp) @@ -143,7 +143,7 @@ def test_agg_cast_results_dtypes(): # xref #11444 u = [dt.datetime(2015, x + 1, 1) for x in range(12)] v = list("aaabbbbbbccd") - df = pd.DataFrame({"X": v, "Y": u}) + df = DataFrame({"X": v, "Y": u}) result = df.groupby("X")["Y"].agg(len) expected = df.groupby("X")["Y"].count() @@ -216,7 +216,7 @@ def test_aggregate_api_consistency(): def test_agg_dict_renaming_deprecation(): # 15931 - df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): @@ -414,7 +414,7 @@ def __call__(self, x): def test_agg_over_numpy_arrays(): # GH 3788 - df = pd.DataFrame( + df = DataFrame( [ [1, np.array([10, 20, 30])], [1, np.array([40, 50, 60])], @@ -427,9 +427,7 @@ def test_agg_over_numpy_arrays(): expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] expected_index = pd.Index([1, 2], name="category") expected_column = ["arraydata"] - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_column - ) + expected = DataFrame(expected_data, index=expected_index, columns=expected_column) tm.assert_frame_equal(result, expected) @@ -438,23 +436,23 @@ def test_agg_tzaware_non_datetime_result(): # discussed in GH#29589, fixed in GH#29641, operating on tzaware values # with function that is not dtype-preserving dti = pd.date_range("2012-01-01", periods=4, tz="UTC") - df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti}) + df = DataFrame({"a": [0, 0, 1, 1], "b": dti}) gb = df.groupby("a") # Case that _does_ preserve the dtype result = gb["b"].agg(lambda x: x.iloc[0]) - expected = pd.Series(dti[::2], name="b") + expected = Series(dti[::2], name="b") expected.index.name = "a" tm.assert_series_equal(result, expected) # Cases that do _not_ preserve the dtype result = gb["b"].agg(lambda x: x.iloc[0].year) - expected = pd.Series([2012, 2012], name="b") + expected = Series([2012, 2012], name="b") expected.index.name = "a" tm.assert_series_equal(result, expected) result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) - expected = 
pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") expected.index.name = "a" tm.assert_series_equal(result, expected) @@ -462,9 +460,7 @@ def test_agg_tzaware_non_datetime_result(): def test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") - df = pd.DataFrame( - {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]} - ) + df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}) result1 = df.groupby("a")["b"].agg(np.min).iloc[0] result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] @@ -477,7 +473,7 @@ def test_agg_timezone_round_trip(): dates = [ pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) ] - df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) + df = DataFrame({"A": ["a", "b"] * 2, "B": dates}) grouped = df.groupby("A") ts = df["B"].iloc[0] @@ -498,13 +494,13 @@ def test_agg_timezone_round_trip(): def test_sum_uint64_overflow(): # see gh-14758 # Convert to uint64 and don't overflow - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 index = pd.Index( [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 ) - expected = pd.DataFrame( + expected = DataFrame( {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, index=index, ) @@ -517,20 +513,20 @@ def test_sum_uint64_overflow(): @pytest.mark.parametrize( "structure, expected", [ - (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), - (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), ( lambda x: tuple(x), - pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), + DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), ), ( lambda x: list(x), - pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), + DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), ), ], ) def test_agg_structs_dataframe(structure, expected): - df = pd.DataFrame( + df = DataFrame( {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} ) @@ -542,15 +538,15 @@ def test_agg_structs_dataframe(structure, expected): @pytest.mark.parametrize( "structure, expected", [ - (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), - (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), - (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), - (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), ], ) def test_agg_structs_series(structure, expected): # Issue #18079 - df = pd.DataFrame( + df = DataFrame( {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} ) @@ -561,11 +557,11 @@ def test_agg_structs_series(structure, expected): def test_agg_category_nansum(observed): categories = ["a", "b", "c"] - df = pd.DataFrame( + df = DataFrame( {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} ) result = df.groupby("A", 
observed=observed).B.agg(np.nansum) - expected = pd.Series( + expected = Series( [3, 3, 0], index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), name="B", @@ -577,12 +573,10 @@ def test_agg_category_nansum(observed): def test_agg_list_like_func(): # GH 18473 - df = pd.DataFrame( - {"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]} - ) + df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]}) grouped = df.groupby("A", as_index=False, sort=False) result = grouped.agg({"B": lambda x: list(x)}) - expected = pd.DataFrame( + expected = DataFrame( {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} ) tm.assert_frame_equal(result, expected) @@ -590,7 +584,7 @@ def test_agg_list_like_func(): def test_agg_lambda_with_timezone(): # GH 23683 - df = pd.DataFrame( + df = DataFrame( { "tag": [1, 1], "date": [ @@ -600,7 +594,7 @@ def test_agg_lambda_with_timezone(): } ) result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) - expected = pd.DataFrame( + expected = DataFrame( [pd.Timestamp("2018-01-01", tz="UTC")], index=pd.Index([1], name="tag"), columns=["date"], @@ -629,11 +623,11 @@ def test_groupby_agg_err_catching(err_cls): from pandas.tests.extension.decimal.array import DecimalArray, make_data, to_decimal data = make_data()[:5] - df = pd.DataFrame( + df = DataFrame( {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} ) - expected = pd.Series(to_decimal([data[0], data[3]])) + expected = Series(to_decimal([data[0], data[3]])) def weird_func(x): # weird function that raise something other than TypeError or IndexError diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 176efdb6204da..f5078930a7b4b 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -40,7 +40,7 @@ def test_apply_issues(): # GH 5789 # don't auto coerce dates df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) - exp_idx = pd.Index( + exp_idx = Index( ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) @@ -51,7 +51,7 @@ def test_apply_issues(): def test_apply_trivial(): # GH 20066 # trivial apply: ignore input and return a constant dataframe. 
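The "trivial apply" wording is easier to see standalone; a simpler axis-0 sketch of the same idea (hypothetical data; the test below exercises the axis=1 variant):

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "data": [1.0, 2.0, 3.0]})
    const = pd.DataFrame({"data": [99.0]})
    # The callable ignores its input and returns the same constant frame,
    # so the group labels are prepended as an outer index level.
    out = df.groupby("key").apply(lambda grp: const)
    print(out.index.tolist())  # [('a', 0), ('b', 0)]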
- df = pd.DataFrame( + df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) @@ -65,7 +65,7 @@ def test_apply_trivial(): def test_apply_trivial_fail(): # GH 20066 - df = pd.DataFrame( + df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) @@ -189,7 +189,7 @@ def test_group_apply_once_per_group2(capsys): expected = 2 # Number of times `apply` should call a function for the current test - df = pd.DataFrame( + df = DataFrame( { "group_by_column": [0, 0, 0, 0, 1, 1, 1, 1], "test_column": ["0", "2", "4", "6", "8", "10", "12", "14"], @@ -241,7 +241,7 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # have an impact on the index structure of the result since this is not # transparent to the user - df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) result = df.groupby("g").apply(func) tm.assert_frame_equal(result, df) @@ -538,7 +538,7 @@ def filt2(x): @pytest.mark.parametrize("test_series", [True, False]) def test_apply_with_duplicated_non_sorted_axis(test_series): # GH 30667 - df = pd.DataFrame( + df = DataFrame( [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2] ) if test_series: @@ -565,10 +565,8 @@ def test_apply_reindex_values(): # solved in #30679 values = [1, 2, 3, 4] indices = [1, 1, 2, 2] - df = pd.DataFrame( - {"group": ["Group1", "Group2"] * 2, "value": values}, index=indices - ) - expected = pd.Series(values, index=indices, name="value") + df = DataFrame({"group": ["Group1", "Group2"] * 2, "value": values}, index=indices) + expected = Series(values, index=indices, name="value") def reindex_helper(x): return x.reindex(np.arange(x.index.min(), x.index.max() + 1)) @@ -608,7 +606,7 @@ def test_apply_numeric_coercion_when_datetime(): # for which are here. 
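The coercion pitfall reads clearer standalone; a sketch mirroring the GH 15670 reproduction just below:

    import pandas as pd

    df = pd.DataFrame(
        {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
    )
    # "inf" looks numeric, but the object dtype of "Str" must survive the
    # apply rather than being coerced to float.
    out = df.groupby("Number").apply(lambda x: x.iloc[0])
    print(out["Str"].dtype)  # object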
# GH 15670 - df = pd.DataFrame( + df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) @@ -617,7 +615,7 @@ def test_apply_numeric_coercion_when_datetime(): tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 - df = pd.DataFrame( + df = DataFrame( {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} ) @@ -631,7 +629,7 @@ def get_B(g): # GH 14423 def predictions(tool): - out = pd.Series(index=["p1", "p2", "useTime"], dtype=object) + out = Series(index=["p1", "p2", "useTime"], dtype=object) if "step1" in list(tool.State): out["p1"] = str(tool[tool.State == "step1"].Machine.values[0]) if "step2" in list(tool.State): @@ -639,7 +637,7 @@ def predictions(tool): out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0]) return out - df1 = pd.DataFrame( + df1 = DataFrame( { "Key": ["B", "B", "A", "A"], "State": ["step1", "step2", "step1", "step2"], @@ -658,7 +656,7 @@ def test_apply_aggregating_timedelta_and_datetime(): # Regression test for GH 15562 # The following groupby caused ValueErrors and IndexErrors pre 0.20.0 - df = pd.DataFrame( + df = DataFrame( { "clientid": ["A", "B", "C"], "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3, @@ -666,11 +664,11 @@ def test_apply_aggregating_timedelta_and_datetime(): ) df["time_delta_zero"] = df.datetime - df.datetime result = df.groupby("clientid").apply( - lambda ddf: pd.Series( + lambda ddf: Series( dict(clientid_age=ddf.time_delta_zero.min(), date=ddf.datetime.min()) ) ) - expected = pd.DataFrame( + expected = DataFrame( { "clientid": ["A", "B", "C"], "clientid_age": [np.timedelta64(0, "D")] * 3, @@ -686,13 +684,13 @@ def test_apply_groupby_datetimeindex(): # groupby apply failed on dataframe with DatetimeIndex data = [["A", 10], ["B", 20], ["B", 30], ["C", 40], ["C", 50]] - df = pd.DataFrame( + df = DataFrame( data, columns=["Name", "Value"], index=pd.date_range("2020-09-01", "2020-09-05") ) result = df.groupby("Name").sum() - expected = pd.DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]}) + expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]}) expected.set_index("Name", inplace=True) tm.assert_frame_equal(result, expected) @@ -704,22 +702,20 @@ def test_time_field_bug(): # that were not returned by the apply function, an exception would be # raised. 
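A standalone sketch of the fixed behaviour that comment describes (mirroring test_time_field_bug just below):

    import pandas as pd
    from datetime import datetime

    df = pd.DataFrame({"a": 1, "b": [datetime.now() for _ in range(3)]})
    # The returned Series omits the time-based column "b"; since the fix,
    # this yields a result without "b" instead of raising.
    out = df.groupby("a").apply(lambda batch: pd.Series({"c": 2}))
    print(out.columns.tolist())  # ['c']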
- df = pd.DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]}) + df = DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]}) def func_with_no_date(batch): - return pd.Series({"c": 2}) + return Series({"c": 2}) def func_with_date(batch): - return pd.Series({"b": datetime(2015, 1, 1), "c": 2}) + return Series({"b": datetime(2015, 1, 1), "c": 2}) dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) - dfg_no_conversion_expected = pd.DataFrame({"c": 2}, index=[1]) + dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) - dfg_conversion_expected = pd.DataFrame( - {"b": datetime(2015, 1, 1), "c": 2}, index=[1] - ) + dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1]) dfg_conversion_expected.index.name = "a" tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) @@ -788,10 +784,10 @@ def test_func(x): def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups - df = pd.DataFrame(dict(value=[0, 1], group=["filled", "empty"])) + df = DataFrame(dict(value=[0, 1], group=["filled", "empty"])) groups = df.groupby("group") result = groups.apply(lambda group: group[group.value != 1]["value"]) - expected = pd.Series( + expected = Series( [0], name="value", index=MultiIndex.from_product( @@ -803,11 +799,11 @@ def test_groupby_apply_return_empty_chunk(): def test_apply_with_mixed_types(): # gh-20949 - df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) + df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) g = df.groupby("A") result = g.transform(lambda x: x / x.sum()) - expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) + expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) result = g.apply(lambda x: x / x.sum()) @@ -835,10 +831,10 @@ def test_apply_datetime_issue(group_column_dtlike): # is a datetime object and the column labels are different from # standard int values in range(len(num_columns)) - df = pd.DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: pd.Series(["spam"], index=[42])) + df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = pd.DataFrame( + expected = DataFrame( ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] ) tm.assert_frame_equal(result, expected) @@ -876,7 +872,7 @@ def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.iteritems()}) result = tdf.groupby("day").apply(most_common_values)["userId"] - expected = pd.Series( + expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) tm.assert_series_equal(result, expected) @@ -890,12 +886,12 @@ def test_apply_multi_level_name(category): b = pd.Categorical(b, categories=[1, 2, 3]) expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") else: - expected_index = pd.Index([1, 2], name="B") - df = pd.DataFrame( + expected_index = Index([1, 2], name="B") + df = DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) + expected = DataFrame({"C": [20, 25], "D": [20, 25]}, 
index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] @@ -953,9 +949,9 @@ def test_apply_index_has_complex_internals(index): ) def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 - df = pd.DataFrame(["A", "A", "B", "B"], columns=["groups"]) + df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) result = df.groupby("groups").apply(function) - expected = pd.Series(expected_values, index=pd.Index(["A", "B"], name="groups")) + expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -964,11 +960,11 @@ def test_apply_function_returns_numpy_array(): def fct(group): return group["B"].values.flatten() - df = pd.DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) + df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) result = df.groupby("A").apply(fct) - expected = pd.Series( - [[1.0, 2.0], [3.0], [np.nan]], index=pd.Index(["a", "b", "none"], name="A") + expected = Series( + [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) tm.assert_series_equal(result, expected) @@ -976,27 +972,25 @@ def fct(group): @pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1]) def test_apply_function_index_return(function): # GH: 22541 - df = pd.DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) + df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) result = df.groupby("id").apply(function) - expected = pd.Series( - [pd.Index([0, 4, 7, 9]), pd.Index([1, 2, 3, 5]), pd.Index([6, 8])], - index=pd.Index([1, 2, 3], name="id"), + expected = Series( + [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], + index=Index([1, 2, 3], name="id"), ) tm.assert_series_equal(result, expected) def test_apply_function_with_indexing(): # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) + df = DataFrame({"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}) def fn(x): x.col2[x.index[-1]] = 0 return x.col2 result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( + expected = Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] @@ -1026,8 +1020,8 @@ def test_apply_with_timezones_aware(): dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 index_no_tz = pd.DatetimeIndex(dates) index_tz = pd.DatetimeIndex(dates, tz="UTC") - df1 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) - df2 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) + df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) + df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) @@ -1046,9 +1040,9 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): } ) - expected = pd.DataFrame( + expected = DataFrame( {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, - index=pd.Index([88, 99], name="a"), + index=Index([88, 99], name="a"), ) # Check output when no other methods are called before .apply() @@ -1067,7 +1061,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): # GH 29617 - 
df = pd.DataFrame( + df = DataFrame( { "A": ["a", "a", "a", "b"], "B": [ @@ -1078,7 +1072,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ], "C": [1, 2, 3, 4], }, - index=pd.Index([100, 101, 102, 103], name="idx"), + index=Index([100, 101, 102, 103], name="idx"), ) grp = df.groupby(["A", "B"]) @@ -1100,7 +1094,7 @@ def test_apply_by_cols_equals_apply_by_rows_transposed(): # should give the same result. There was previously a bug where the # by_rows operation would work fine, but by_cols would throw a ValueError - df = pd.DataFrame( + df = DataFrame( np.random.random([6, 4]), columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]), ) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index f20eed4575e91..aff9911961b25 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -1,12 +1,10 @@ import numpy as np import pytest -from pandas._libs import groupby, lib, reduction as libreduction - -from pandas.core.dtypes.common import ensure_int64 +from pandas._libs import lib, reduction as libreduction import pandas as pd -from pandas import Series, isna +from pandas import Series import pandas._testing as tm @@ -103,36 +101,5 @@ def test_generate_bins(binner, closed, expected): tm.assert_numpy_array_equal(result, expected) -def test_group_ohlc(): - def _check(dtype): - obj = np.array(np.random.randn(20), dtype=dtype) - - bins = np.array([6, 12, 20]) - out = np.zeros((3, 4), dtype) - counts = np.zeros(len(out), dtype=np.int64) - labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - - func = getattr(groupby, f"group_ohlc_{dtype}") - func(out, counts, obj[:, None], labels) - - def _ohlc(group): - if isna(group).all(): - return np.repeat(np.nan, 4) - return [group[0], group.max(), group.min(), group[-1]] - - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) - - tm.assert_almost_equal(out, expected) - tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) - - obj[:6] = np.nan - func(out, counts, obj[:, None], labels) - expected[0] = np.nan - tm.assert_almost_equal(out, expected) - - _check("float32") - _check("float64") - - class TestMoments: pass diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 711daf7fe415d..c29c9e15ada79 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -470,13 +470,13 @@ def test_observed_groups_with_nan(observed): def test_observed_nth(): # GH 26385 cat = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"]) - ser = pd.Series([1, 2, 3]) - df = pd.DataFrame({"cat": cat, "ser": ser}) + ser = Series([1, 2, 3]) + df = DataFrame({"cat": cat, "ser": ser}) result = df.groupby("cat", observed=False)["ser"].nth(0) index = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) - expected = pd.Series([1, np.nan, np.nan], index=index, name="ser") + expected = Series([1, np.nan, np.nan], index=index, name="ser") expected.index.name = "cat" tm.assert_series_equal(result, expected) @@ -768,11 +768,11 @@ def test_preserve_on_ordered_ops(func, values): # gh-18502 # preserve the categoricals on ops c = pd.Categorical(["first", "second", "third", "fourth"], ordered=True) - df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) + df = DataFrame({"payload": [-1, -2, -1, -2], "col": c}) g = df.groupby("payload") result = getattr(g, func)() - expected = pd.DataFrame( - {"payload": [-2, -1], 
"col": pd.Series(values, dtype=c.dtype)} + expected = DataFrame( + {"payload": [-2, -1], "col": Series(values, dtype=c.dtype)} ).set_index("payload") tm.assert_frame_equal(result, expected) @@ -818,13 +818,11 @@ def test_groupby_empty_with_category(): # GH-9614 # test fix for when group by on None resulted in # coercion of dtype categorical -> float - df = pd.DataFrame( - {"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])} - ) + df = DataFrame({"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])}) result = df.groupby("A").first()["B"] - expected = pd.Series( + expected = Series( pd.Categorical([], categories=["test", "train"]), - index=pd.Series([], dtype="object", name="A"), + index=Series([], dtype="object", name="A"), name="B", ) tm.assert_series_equal(result, expected) @@ -1280,10 +1278,10 @@ def test_groupby_cat_preserves_structure(observed, ordered): def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe - df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) + df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): df.groupby("var").apply( - lambda rows: pd.DataFrame( + lambda rows: DataFrame( {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) @@ -1300,7 +1298,7 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, r ) request.node.add_marker(mark) - df = pd.DataFrame( + df = DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), @@ -1333,7 +1331,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - df = pd.DataFrame( + df = DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), @@ -1369,7 +1367,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - df = pd.DataFrame( + df = DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), "cat_2": pd.Categorical(list("1111"), categories=list("12")), @@ -1399,7 +1397,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - df = pd.DataFrame( + df = DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), "cat_2": pd.Categorical(list("1111"), categories=list("12")), @@ -1424,7 +1422,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} - df = pd.DataFrame(d) + df = DataFrame(d) cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) df["range"] = cat groups = df.groupby(["range", "baz"], as_index=True, sort=True) @@ -1439,7 +1437,7 @@ def test_series_groupby_categorical_aggregation_getitem(): ) def test_groupby_agg_categorical_columns(func, expected_values): # 31256 - df = pd.DataFrame( + df = DataFrame( { "id": [0, 1, 2, 3, 4], "groups": [0, 1, 1, 2, 2], @@ -1448,17 +1446,15 @@ def test_groupby_agg_categorical_columns(func, expected_values): ).set_index("id") result = df.groupby("groups").agg(func) - expected = pd.DataFrame( - {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups") + expected = 
DataFrame( + {"value": expected_values}, index=Index([0, 1, 2], name="groups") ) tm.assert_frame_equal(result, expected) def test_groupby_agg_non_numeric(): - df = pd.DataFrame( - {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])} - ) - expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2]) + df = DataFrame({"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])}) + expected = DataFrame({"A": [2, 1]}, index=[1, 2]) result = df.groupby([1, 2, 1]).agg(pd.Series.nunique) tm.assert_frame_equal(result, expected) @@ -1471,12 +1467,10 @@ def test_groupby_agg_non_numeric(): def test_groupy_first_returned_categorical_instead_of_dataframe(func): # GH 28641: groupby drops index, when grouping over categorical column with # first/last. Renamed Categorical instead of DataFrame previously. - df = pd.DataFrame( - {"A": [1997], "B": pd.Series(["b"], dtype="category").cat.as_ordered()} - ) + df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()}) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = pd.Series(["b"], index=pd.Index([1997], name="A"), name="B") + expected = Series(["b"], index=Index([1997], name="A"), name="B") tm.assert_series_equal(result, expected) @@ -1494,7 +1488,7 @@ def test_read_only_category_no_sort(): def test_sorted_missing_category_values(): # GH 28597 - df = pd.DataFrame( + df = DataFrame( { "foo": [ "small", @@ -1515,7 +1509,7 @@ def test_sorted_missing_category_values(): .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True) ) - expected = pd.DataFrame( + expected = DataFrame( { "tiny": {"A": 0, "C": 0}, "small": {"A": 0, "C": 1}, @@ -1539,13 +1533,11 @@ def test_sorted_missing_category_values(): def test_agg_cython_category_not_implemented_fallback(): # https://github.com/pandas-dev/pandas/issues/31450 - df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) + df = DataFrame({"col_num": [1, 1, 2, 3]}) df["col_cat"] = df["col_num"].astype("category") result = df.groupby("col_num").col_cat.first() - expected = pd.Series( - [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" - ) + expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat") tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"}) @@ -1556,16 +1548,16 @@ def test_agg_cython_category_not_implemented_fallback(): @pytest.mark.parametrize("func", ["min", "max"]) def test_aggregate_categorical_lost_index(func: str): # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = pd.Series(["b"], dtype="category").cat.as_ordered() - df = pd.DataFrame({"A": [1997], "B": ds}) + ds = Series(["b"], dtype="category").cat.as_ordered() + df = DataFrame({"A": [1997], "B": ds}) result = df.groupby("A").agg({"B": func}) - expected = pd.DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) tm.assert_frame_equal(result, expected) def test_aggregate_categorical_with_isnan(): # GH 29837 - df = pd.DataFrame( + df = DataFrame( { "A": [1, 1, 1, 1], "B": [1, 2, 1, 2], @@ -1579,7 +1571,7 @@ def test_aggregate_categorical_with_isnan(): result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum()) index = pd.MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) - expected = pd.DataFrame( + expected = DataFrame( data={ "numerical_col": [1.0, 0.0], "object_col": [0, 0], @@ -1592,7 +1584,7 @@ def test_aggregate_categorical_with_isnan(): def 
test_categorical_transform(): # GH 29037 - df = pd.DataFrame( + df = DataFrame( { "package_id": [1, 1, 1, 2, 2, 3], "status": [ @@ -1613,7 +1605,7 @@ def test_categorical_transform(): df["last_status"] = df.groupby("package_id")["status"].transform(max) result = df.copy() - expected = pd.DataFrame( + expected = DataFrame( { "package_id": [1, 1, 1, 2, 2, 3], "status": [ @@ -1647,13 +1639,13 @@ def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( # GH 34951 cat = pd.Categorical([0, 0, 1, 1]) val = [0, 1, 1, 0] - df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + df = DataFrame({"a": cat, "b": cat, "c": val}) idx = pd.Categorical([0, 1]) idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) expected_dict = { - "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), } expected = expected_dict[func] @@ -1672,13 +1664,13 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( # GH 34951 cat = pd.Categorical([0, 0, 1, 1]) val = [0, 1, 1, 0] - df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + df = DataFrame({"a": cat, "b": cat, "c": val}) idx = pd.Categorical([0, 1]) idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) expected_dict = { - "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), } expected = expected_dict[func].to_frame() diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 997d9b006c802..04b73b16ae2c7 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -241,6 +241,20 @@ def test_count_groupby_column_with_nan_in_groupby_column(self): ) tm.assert_frame_equal(expected, res) + def test_groupby_count_dateparseerror(self): + dr = date_range(start="1/1/2012", freq="5min", periods=10) + + # BAD Example, datetimes first + ser = Series(np.arange(10), index=[dr, np.arange(10)]) + grouped = ser.groupby(lambda x: x[1] % 2 == 0) + result = grouped.count() + + ser = Series(np.arange(10), index=[np.arange(10), dr]) + grouped = ser.groupby(lambda x: x[0] % 2 == 0) + expected = grouped.count() + + tm.assert_series_equal(result, expected) + def test_groupby_timedelta_cython_count(): df = DataFrame( @@ -283,7 +297,7 @@ def test_count(): def test_count_non_nulls(): # GH#5610 # count counts non-nulls - df = pd.DataFrame( + df = DataFrame( [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], columns=["A", "B", "C"], ) @@ -301,14 +315,14 @@ def test_count_non_nulls(): def test_count_object(): - df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() - expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") + expected = Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) - df = pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() - expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") + expected = Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") 
tm.assert_series_equal(result, expected) @@ -318,7 +332,7 @@ def test_count_cross_type(): (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) ) - df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) + df = DataFrame(vals, columns=["a", "b", "c", "d"]) df[df == 2] = np.nan expected = df.groupby(["c", "d"]).count() diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index c16ad812eb634..448e6c6e6f64a 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -7,9 +7,9 @@ def test_filter_series(): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + s = Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = Series([20, 22, 24], index=[2, 4, 5]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) @@ -26,9 +26,9 @@ def test_filter_series(): def test_filter_single_column_df(): - df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + df = DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = DataFrame([20, 22, 24], index=[2, 4, 5]) grouper = df[0].apply(lambda x: x % 2) grouped = df.groupby(grouper) tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) @@ -45,41 +45,41 @@ def test_filter_single_column_df(): def test_filter_multi_column_df(): - df = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) + df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) + expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) tm.assert_frame_equal( grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected ) def test_filter_mixed_df(): - df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) + expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) def test_filter_out_all_groups(): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + s = Series([1, 3, 20, 5, 22, 24, 7]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) def test_filter_out_no_groups(): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + s = Series([1, 3, 20, 5, 22, 24, 7]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) - df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) 
grouped = df.groupby(grouper) filtered = grouped.filter(lambda x: x["A"].mean() > 0) @@ -88,16 +88,16 @@ def test_filter_out_no_groups(): def test_filter_out_all_groups_in_df(): # GH12768 - df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) - expected = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) + expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) - df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) - expected = pd.DataFrame({"a": [], "b": []}, dtype="int64") + expected = DataFrame({"a": [], "b": []}, dtype="int64") tm.assert_frame_equal(expected, res) @@ -108,7 +108,7 @@ def raise_if_sum_is_zero(x): else: return x.sum() > 0 - s = pd.Series([-1, 0, 1, 2]) + s = Series([-1, 0, 1, 2]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) msg = "the filter must return a boolean result" @@ -119,7 +119,7 @@ def raise_if_sum_is_zero(x): def test_filter_with_axis_in_groupby(): # issue 11041 index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") + data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10) expected = data.iloc[:, 12:20] tm.assert_frame_equal(result, expected) @@ -551,7 +551,7 @@ def test_filter_has_access_to_grouped_cols(): def test_filter_enforces_scalarness(): - df = pd.DataFrame( + df = DataFrame( [ ["best", "a", "x"], ["worst", "b", "y"], @@ -568,7 +568,7 @@ def test_filter_enforces_scalarness(): def test_filter_non_bool_raises(): - df = pd.DataFrame( + df = DataFrame( [ ["best", "a", 1], ["worst", "b", 1], @@ -586,12 +586,12 @@ def test_filter_non_bool_raises(): def test_filter_dropna_with_empty_groups(): # GH 10780 - data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) + data = Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) groupped = data.groupby(level=0) result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) - expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) + expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) tm.assert_series_equal(result_false, expected_false) result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) - expected_true = pd.Series(index=pd.Index([], dtype=int), dtype=np.float64) + expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64) tm.assert_series_equal(result_true, expected_true) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ab736b55b5743..05a959ea6f22b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -88,13 +88,13 @@ def test_max_min_non_numeric(): def test_min_date_with_nans(): # GH26321 dates = pd.to_datetime( - pd.Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" ).dt.date - df = pd.DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) result = df.groupby("b", as_index=False)["c"].min()["c"] expected = pd.to_datetime( - pd.Series(["2019-05-09", "2019-05-09"], 
name="c"), format="%Y-%m-%d" + Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" ).dt.date tm.assert_series_equal(result, expected) @@ -122,7 +122,7 @@ def test_intercept_builtin_sum(): @pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key def test_builtins_apply(keys, f): # see gh-8155 - df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) + df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) df["jolie"] = np.random.randn(1000) fname = f.__name__ @@ -151,13 +151,13 @@ def test_arg_passthru(): # GH3668 # GH5724 - df = pd.DataFrame( + df = DataFrame( { "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], "string": list("abc"), - "category_string": pd.Series(list("abc")).astype("category"), + "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": pd.date_range("20130101", periods=3), "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), @@ -179,7 +179,7 @@ def test_arg_passthru(): expected_columns_numeric = Index(["int", "float", "category_int"]) # mean / median - expected = pd.DataFrame( + expected = DataFrame( { "category_int": [7.5, 9], "float": [4.5, 6.0], @@ -303,12 +303,12 @@ def test_non_cython_api(): tm.assert_frame_equal(result, expected) # describe - expected_index = pd.Index([1, 3], name="A") + expected_index = Index([1, 3], name="A") expected_col = pd.MultiIndex( levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], codes=[[0] * 8, list(range(8))], ) - expected = pd.DataFrame( + expected = DataFrame( [ [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -325,7 +325,7 @@ def test_non_cython_api(): df[df.A == 3].describe().unstack().to_frame().T, ] ) - expected.index = pd.Index([0, 1]) + expected.index = Index([0, 1]) result = gni.describe() tm.assert_frame_equal(result, expected) @@ -385,7 +385,7 @@ def test_cython_median(): def test_median_empty_bins(observed): - df = pd.DataFrame(np.random.randint(0, 44, 500)) + df = DataFrame(np.random.randint(0, 44, 500)) grps = range(0, 55, 5) bins = pd.cut(df[0], grps) @@ -411,7 +411,7 @@ def test_median_empty_bins(observed): ) def test_groupby_non_arithmetic_agg_types(dtype, method, data): # GH9311, GH6620 - df = pd.DataFrame( + df = DataFrame( [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] ) @@ -426,7 +426,7 @@ def test_groupby_non_arithmetic_agg_types(dtype, method, data): out_type = dtype exp = data["df"] - df_out = pd.DataFrame(exp) + df_out = DataFrame(exp) df_out["b"] = df_out.b.astype(out_type) df_out.set_index("a", inplace=True) @@ -448,7 +448,7 @@ def test_groupby_non_arithmetic_agg_types(dtype, method, data): ) def test_groupby_non_arithmetic_agg_int_like_precision(i): # see gh-6620, gh-9311 - df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) + df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) grp_exp = { "first": {"expected": i[0]}, @@ -478,7 +478,7 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): ) def test_idxmin_idxmax_returns_int_types(func, values): # GH 25444 - df = pd.DataFrame( + df = DataFrame( { "name": ["A", "A", "B", "B"], "c_int": [1, 2, 3, 4], @@ -490,21 +490,21 @@ def test_idxmin_idxmax_returns_int_types(func, values): result = getattr(df.groupby("name"), func)() - expected = pd.DataFrame(values, index=Index(["A", "B"], name="name")) + expected = DataFrame(values, index=Index(["A", "B"], name="name")) 
tm.assert_frame_equal(result, expected) def test_groupby_cumprod(): # GH 4095 - df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) + df = DataFrame({"key": ["b"] * 10, "value": 2}) actual = df.groupby("key")["value"].cumprod() expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) - df = pd.DataFrame({"key": ["b"] * 100, "value": 2}) + df = DataFrame({"key": ["b"] * 100, "value": 2}) actual = df.groupby("key")["value"].cumprod() # if overflows, groupby product casts to float # while numpy passes back invalid values @@ -648,7 +648,7 @@ def test_nsmallest(): @pytest.mark.parametrize("func", ["cumprod", "cumsum"]) def test_numpy_compat(func): # see gh-12811 - df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) g = df.groupby("A") msg = "numpy operations are not valid with groupby" @@ -664,14 +664,12 @@ def test_cummin(numpy_dtypes_for_minmax): min_val = numpy_dtypes_for_minmax[1] # GH 15048 - base_df = pd.DataFrame( - {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} - ) + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] df = base_df.astype(dtype) - expected = pd.DataFrame({"B": expected_mins}).astype(dtype) + expected = DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() @@ -687,30 +685,30 @@ def test_cummin(numpy_dtypes_for_minmax): # Test nan in some values base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) result = base_df.groupby("A").cummin() tm.assert_frame_equal(result, expected) expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) - expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") + df = DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") result = df.groupby("a")["b"].cummin() tm.assert_series_equal(expected, result) # GH 15635 - df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) + df = DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) result = df.groupby("a").b.cummin() - expected = pd.Series([1, 2, 1], name="b") + expected = Series([1, 2, 1], name="b") tm.assert_series_equal(result, expected) def test_cummin_all_nan_column(): - base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - expected = pd.DataFrame({"B": [np.nan] * 8}) + expected = DataFrame({"B": [np.nan] * 8}) result = base_df.groupby("A").cummin() tm.assert_frame_equal(expected, result) result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() @@ -722,14 +720,12 @@ def test_cummax(numpy_dtypes_for_minmax): max_val = numpy_dtypes_for_minmax[2] # GH 15048 - base_df = pd.DataFrame( - {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} - ) + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] df = base_df.astype(dtype) - expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) + expected = DataFrame({"B": expected_maxs}).astype(dtype) result = 
df.groupby("A").cummax() tm.assert_frame_equal(result, expected) result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() @@ -745,30 +741,30 @@ def test_cummax(numpy_dtypes_for_minmax): # Test nan in some values base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) result = base_df.groupby("A").cummax() tm.assert_frame_equal(result, expected) expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) - expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") + df = DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") result = df.groupby("a")["b"].cummax() tm.assert_series_equal(expected, result) # GH 15635 - df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) + df = DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) result = df.groupby("a").b.cummax() - expected = pd.Series([2, 1, 2], name="b") + expected = Series([2, 1, 2], name="b") tm.assert_series_equal(result, expected) def test_cummax_all_nan_column(): - base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - expected = pd.DataFrame({"B": [np.nan] * 8}) + expected = DataFrame({"B": [np.nan] * 8}) result = base_df.groupby("A").cummax() tm.assert_frame_equal(expected, result) result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() @@ -800,10 +796,10 @@ def test_is_monotonic_increasing(in_vals, out_vals): "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], "C": in_vals, } - df = pd.DataFrame(source_dict) + df = DataFrame(source_dict) result = df.groupby("B").C.is_monotonic_increasing index = Index(list("abcd"), name="B") - expected = pd.Series(index=index, data=out_vals, name="C") + expected = Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. 
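That manual equivalence is quick to sketch standalone (hypothetical data; is_monotonic_increasing is available on SeriesGroupBy in recent pandas):

    import pandas as pd

    df = pd.DataFrame({"B": ["a", "a", "b", "b"], "C": [1, 2, 2, 1]})
    # The grouped property matches applying the plain Series property
    # group by group.
    result = df.groupby("B").C.is_monotonic_increasing
    manual = df.groupby("B").C.apply(lambda s: s.is_monotonic_increasing)
    print(result.equals(manual))  # True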
@@ -837,10 +833,10 @@ def test_is_monotonic_decreasing(in_vals, out_vals): "C": in_vals, } - df = pd.DataFrame(source_dict) + df = DataFrame(source_dict) result = df.groupby("B").C.is_monotonic_decreasing index = Index(list("abcd"), name="B") - expected = pd.Series(index=index, data=out_vals, name="C") + expected = Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) @@ -887,7 +883,7 @@ def test_frame_describe_multikey(tsframe): levels=[[col], group.columns], codes=[[0] * len(group.columns), range(len(group.columns))], ) - group = pd.DataFrame(group.values, columns=group_col, index=group.index) + group = DataFrame(group.values, columns=group_col, index=group.index) desc_groups.append(group) expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) @@ -929,15 +925,15 @@ def test_frame_describe_unstacked_format(): pd.Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, pd.Timestamp("2011-01-06 12:54:09", tz=None): 100000000, } - df = pd.DataFrame({"PRICE": prices, "VOLUME": volumes}) + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) result = df.groupby("PRICE").VOLUME.describe() data = [ df[df.PRICE == 24990].VOLUME.describe().values.tolist(), df[df.PRICE == 25499].VOLUME.describe().values.tolist(), ] - expected = pd.DataFrame( + expected = DataFrame( data, - index=pd.Index([24990, 25499], name="PRICE"), + index=Index([24990, 25499], name="PRICE"), columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) @@ -951,7 +947,7 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("as_index", [True, False]) def test_describe_with_duplicate_output_column_names(as_index): # GH 35314 - df = pd.DataFrame( + df = DataFrame( { "a": [99, 99, 99, 88, 88, 88], "b": [1, 2, 3, 4, 5, 6], @@ -993,7 +989,7 @@ def test_describe_with_duplicate_output_column_names(as_index): .T ) expected.columns.names = [None, None] - expected.index = pd.Index([88, 99], name="a") + expected.index = Index([88, 99], name="a") if as_index: expected = expected.drop(columns=["a"], level=0) @@ -1007,7 +1003,7 @@ def test_describe_with_duplicate_output_column_names(as_index): def test_groupby_mean_no_overflow(): # Regression test for (#22487) - df = pd.DataFrame( + df = DataFrame( { "user": ["A", "A", "A", "A", "A"], "connections": [4970, 4749, 4719, 4704, 18446744073699999744], @@ -1031,10 +1027,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) - idx = pd.Index([1, 2, 3], dtype=object, name="a") - expected = pd.DataFrame({"b": arr}, index=idx) + idx = Index([1, 2, 3], dtype=object, name="a") + expected = DataFrame({"b": arr}, index=idx) - groups = pd.DataFrame(values, dtype="Int64").groupby("a") + groups = DataFrame(values, dtype="Int64").groupby("a") result = getattr(groups, function)() tm.assert_frame_equal(result, expected) @@ -1049,16 +1045,14 @@ def test_apply_to_nullable_integer_returns_float(values, function): def test_groupby_sum_below_mincount_nullable_integer(): # https://github.com/pandas-dev/pandas/issues/32861 - df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") grouped = df.groupby("a") - idx = pd.Index([0, 1, 2], dtype=object, name="a") + idx = Index([0, 1, 2], dtype=object, name="a") result = 
grouped["b"].sum(min_count=2) - expected = pd.Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") tm.assert_series_equal(result, expected) result = grouped.sum(min_count=2) - expected = pd.DataFrame( - {"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx - ) + expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 087b4f64307e6..b57fa2540add9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -248,7 +248,7 @@ def test_len(): assert len(grouped) == expected # issue 11016 - df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) + df = DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) assert len(df.groupby("a")) == 0 assert len(df.groupby("b")) == 3 assert len(df.groupby(["a", "b"])) == 3 @@ -594,13 +594,13 @@ def test_groupby_multiple_columns(df, op): def test_as_index_select_column(): # GH 5764 - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) result = df.groupby("A", as_index=False)["B"].get_group(1) - expected = pd.Series([2, 4], name="B") + expected = Series([2, 4], name="B") tm.assert_series_equal(result, expected) result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) - expected = pd.Series( + expected = Series( [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) ) tm.assert_series_equal(result, expected) @@ -1186,9 +1186,9 @@ def test_groupby_dtype_inference_empty(): def test_groupby_unit64_float_conversion(): #  GH: 30859 groupby converts unit64 to floats sometimes - df = pd.DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]}) + df = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]}) result = df.groupby(["first", "second"])["value"].max() - expected = pd.Series( + expected = Series( [16148277970000000000], pd.MultiIndex.from_product([[1], [1]], names=["first", "second"]), name="value", @@ -1217,7 +1217,7 @@ def test_groupby_keys_same_size_as_index(): index = pd.date_range( start=pd.Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq ) - df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) + df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean() expected = df.set_index([df.index, "metric"]) @@ -1227,17 +1227,17 @@ def test_groupby_keys_same_size_as_index(): def test_groupby_one_row(): # GH 11741 msg = r"^'Z'$" - df1 = pd.DataFrame(np.random.randn(1, 4), columns=list("ABCD")) + df1 = DataFrame(np.random.randn(1, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): df1.groupby("Z") - df2 = pd.DataFrame(np.random.randn(2, 4), columns=list("ABCD")) + df2 = DataFrame(np.random.randn(2, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): df2.groupby("Z") def test_groupby_nat_exclude(): # GH 6992 - df = pd.DataFrame( + df = DataFrame( { "values": np.random.randn(8), "dt": [ @@ -1255,7 +1255,7 @@ def test_groupby_nat_exclude(): ) grouped = df.groupby("dt") - expected = [pd.Index([1, 7]), pd.Index([3, 5])] + expected = [Index([1, 7]), Index([3, 5])] keys = sorted(grouped.groups.keys()) assert len(keys) == 2 for k, e in zip(keys, expected): @@ -1268,8 +1268,8 @@ def 
test_groupby_nat_exclude(): assert grouped.ngroups == 2 expected = { - Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.int64), - Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.int64), + Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp), + Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp), } for k in grouped.indices: @@ -1454,7 +1454,7 @@ def foo(x): def test_group_name_available_in_inference_pass(): # gh-15062 - df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) + df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) names = [] @@ -1733,7 +1733,7 @@ def test_group_shift_lose_timezone(): def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 - df = pd.DataFrame( + df = DataFrame( { "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(), "thename": range(0, 20), @@ -1762,7 +1762,7 @@ def test_empty_dataframe_groupby(): def test_tuple_as_grouping(): # https://github.com/pandas-dev/pandas/issues/18314 - df = pd.DataFrame( + df = DataFrame( { ("a", "b"): [1, 1, 1, 1], "a": [2, 2, 2, 2], @@ -1775,13 +1775,13 @@ def test_tuple_as_grouping(): df[["a", "b", "c"]].groupby(("a", "b")) result = df.groupby(("a", "b"))["c"].sum() - expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b"))) + expected = Series([4], name="c", index=Index([1], name=("a", "b"))) tm.assert_series_equal(result, expected) def test_tuple_correct_keyerror(): # https://github.com/pandas-dev/pandas/issues/18798 - df = pd.DataFrame( + df = DataFrame( 1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) ) with pytest.raises(KeyError, match=r"^\(7, 8\)$"): @@ -1790,13 +1790,13 @@ def test_tuple_correct_keyerror(): def test_groupby_agg_ohlc_non_first(): # GH 21716 - df = pd.DataFrame( + df = DataFrame( [[1], [1]], columns=["foo"], index=pd.date_range("2018-01-01", periods=2, freq="D"), ) - expected = pd.DataFrame( + expected = DataFrame( [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], columns=pd.MultiIndex.from_tuples( ( @@ -1824,10 +1824,10 @@ def test_groupby_multiindex_nat(): (datetime(2012, 1, 3), "a"), ] mi = pd.MultiIndex.from_tuples(values, names=["date", None]) - ser = pd.Series([3, 2, 2.5, 4], index=mi) + ser = Series([3, 2, 2.5, 4], index=mi) result = ser.groupby(level=1).mean() - expected = pd.Series([3.0, 2.5], index=["a", "b"]) + expected = Series([3.0, 2.5], index=["a", "b"]) tm.assert_series_equal(result, expected) @@ -1845,13 +1845,13 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): index_array = [["x", "x"], ["a", "b"], ["k", "k"]] index_names = ["first", "second", "third"] ri = pd.MultiIndex.from_arrays(index_array, names=index_names) - s = pd.Series(data=[1, 2], index=ri) + s = Series(data=[1, 2], index=ri) result = s.groupby(["first", "third"]).sum() index_array = [["x"], ["k"]] index_names = ["first", "third"] ei = pd.MultiIndex.from_arrays(index_array, names=index_names) - expected = pd.Series([3], index=ei) + expected = Series([3], index=ei) tm.assert_series_equal(result, expected) @@ -1860,7 +1860,7 @@ def test_groupby_groups_in_BaseGrouper(): # GH 26326 # Test if DataFrame grouped with a pandas.Grouper has correct groups mi = pd.MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"]) - df = pd.DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi) + df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi) result = df.groupby([pd.Grouper(level="alpha"), "beta"]) 
expected = df.groupby(["alpha", "beta"]) assert result.groups == expected.groups @@ -1873,7 +1873,7 @@ def test_groupby_groups_in_BaseGrouper(): @pytest.mark.parametrize("group_name", ["x", ["x"]]) def test_groupby_axis_1(group_name): # GH 27614 - df = pd.DataFrame( + df = DataFrame( np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20] ) df.index.name = "y" @@ -1886,7 +1886,7 @@ def test_groupby_axis_1(group_name): # test on MI column iterables = [["bar", "baz", "foo"], ["one", "two"]] mi = pd.MultiIndex.from_product(iterables=iterables, names=["x", "x1"]) - df = pd.DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi) + df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi) results = df.groupby(group_name, axis=1).sum() expected = df.T.groupby(group_name).sum().T tm.assert_frame_equal(results, expected) @@ -1961,27 +1961,27 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): def test_groupby_only_none_group(): # see GH21624 # this was crashing with "ValueError: Length of passed values is 1, index implies 0" - df = pd.DataFrame({"g": [None], "x": 1}) + df = DataFrame({"g": [None], "x": 1}) actual = df.groupby("g")["x"].transform("sum") - expected = pd.Series([np.nan], name="x") + expected = Series([np.nan], name="x") tm.assert_series_equal(actual, expected) def test_groupby_duplicate_index(): # GH#29189 the groupby call here used to raise - ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) gb = ser.groupby(level=0) result = gb.mean() - expected = pd.Series([2, 5.5, 8], index=[2.0, 4.0, 5.0]) + expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) def test_bool_aggs_dup_column_labels(bool_agg_func): # 21668 - df = pd.DataFrame([[True, True]], columns=["a", "a"]) + df = DataFrame([[True, True]], columns=["a", "a"]) grp_by = df.groupby([0]) result = getattr(grp_by, bool_agg_func)() @@ -1990,14 +1990,14 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): @pytest.mark.parametrize( - "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] + "idx", [Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] ) @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_dup_labels_output_shape(groupby_func, idx): if groupby_func in {"size", "ngroup", "cumcount"}: pytest.skip("Not applicable") - df = pd.DataFrame([[1, 1]], columns=idx) + df = DataFrame([[1, 1]], columns=idx) grp_by = df.groupby([0]) args = [] @@ -2017,7 +2017,7 @@ def test_dup_labels_output_shape(groupby_func, idx): def test_groupby_crash_on_nunique(axis): # Fix following 30253 - df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) + df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) axis_number = df._get_axis_number(axis) if not axis_number: @@ -2025,7 +2025,7 @@ def test_groupby_crash_on_nunique(axis): result = df.groupby(axis=axis_number, level=0).nunique() - expected = pd.DataFrame({"A": [1, 2], "D": [1, 1]}) + expected = DataFrame({"A": [1, 2], "D": [1, 1]}) if not axis_number: expected = expected.T @@ -2034,7 +2034,7 @@ def test_groupby_crash_on_nunique(axis): def test_groupby_list_level(): # GH 9790 - expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + expected = DataFrame(np.arange(0, 9).reshape(3, 3)) result = expected.groupby(level=[0]).mean() tm.assert_frame_equal(result, 
expected) @@ -2048,7 +2048,7 @@ def test_groupby_list_level(): ) def test_groups_repr_truncates(max_seq_items, expected): # GH 1135 - df = pd.DataFrame(np.random.randn(5, 1)) + df = DataFrame(np.random.randn(5, 1)) df["a"] = df.index with pd.option_context("display.max_seq_items", max_seq_items): @@ -2061,7 +2061,7 @@ def test_groups_repr_truncates(max_seq_items, expected): def test_group_on_two_row_multiindex_returns_one_tuple_key(): # GH 18451 - df = pd.DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}]) + df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}]) df = df.set_index(["a", "b"]) grp = df.groupby(["a", "b"]) @@ -2106,7 +2106,7 @@ def test_group_on_two_row_multiindex_returns_one_tuple_key(): ) def test_subsetting_columns_keeps_attrs(klass, attr, value): # GH 9959 - When subsetting columns, don't drop attributes - df = pd.DataFrame({"a": [1], "b": [2], "c": [3]}) + df = DataFrame({"a": [1], "b": [2], "c": [3]}) if attr != "axis": df = df.set_index("a") @@ -2118,8 +2118,8 @@ def test_subsetting_columns_keeps_attrs(klass, attr, value): @pytest.mark.parametrize("func", ["sum", "any", "shift"]) def test_groupby_column_index_name_lost(func): # GH: 29764 groupby loses index sometimes - expected = pd.Index(["a"], name="idx") - df = pd.DataFrame([[1]], columns=expected) + expected = Index(["a"], name="idx") + df = DataFrame([[1]], columns=expected) df_grouped = df.groupby([1]) result = getattr(df_grouped, func)().columns tm.assert_index_equal(result, expected) @@ -2127,10 +2127,10 @@ def test_groupby_column_index_name_lost(func): def test_groupby_duplicate_columns(): # GH: 31735 - df = pd.DataFrame( + df = DataFrame( {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} ).astype(object) df.columns = ["A", "B", "B"] result = df.groupby([0, 0, 0, 0]).min() - expected = pd.DataFrame([["e", "a", 1]], columns=["A", "B", "B"]) + expected = DataFrame([["e", "a", 1]], columns=["A", "B", "B"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 18ef95c05f291..6c74e1521eeeb 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -158,7 +158,7 @@ def test_grouper_multilevel_freq(self): d0 = date.today() - timedelta(days=14) dates = date_range(d0, date.today()) date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"]) - df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) + df = DataFrame(np.random.randint(0, 100, 225), index=date_index) # Check string level expected = ( @@ -167,7 +167,7 @@ def test_grouper_multilevel_freq(self): .sum() ) # reset index changes columns dtype to object - expected.columns = pd.Index([0], dtype="int64") + expected.columns = Index([0], dtype="int64") result = df.groupby( [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")] @@ -258,7 +258,7 @@ def test_grouper_column_and_index(self): [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)] ) idx.names = ["outer", "inner"] - df_multi = pd.DataFrame( + df_multi = DataFrame( {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, index=idx, ) @@ -289,7 +289,7 @@ def test_groupby_levels_and_columns(self): idx = pd.MultiIndex.from_tuples( [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names ) - df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) + df = DataFrame(np.arange(12).reshape(-1, 3), index=idx) by_levels = df.groupby(level=idx_names).mean() # reset_index changes columns 
dtype to object @@ -297,7 +297,7 @@ def test_groupby_levels_and_columns(self): tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) - by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) + by_columns.columns = Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) def test_groupby_categorical_index_and_columns(self, observed): @@ -407,7 +407,7 @@ def test_multiindex_passthru(self): # GH 7997 # regression from 0.14.1 - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) result = df.groupby(axis=1, level=[0, 1]).first() @@ -463,7 +463,7 @@ def test_multiindex_columns_empty_level(self): def test_groupby_multiindex_tuple(self): # GH 17979 - df = pd.DataFrame( + df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), ) @@ -471,7 +471,7 @@ def test_groupby_multiindex_tuple(self): result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df2 = pd.DataFrame( + df2 = DataFrame( df.values, columns=pd.MultiIndex.from_arrays( [["a", "b", "b", "c"], ["d", "d", "e", "e"]] @@ -481,7 +481,7 @@ def test_groupby_multiindex_tuple(self): result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) + df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) expected = df3.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) @@ -596,13 +596,13 @@ def test_grouping_labels(self, mframe): def test_list_grouper_with_nat(self): # GH 14715 - df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")}) + df = DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT grouper = pd.Grouper(key="date", freq="AS") # Grouper in a list grouping result = df.groupby([grouper]) - expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))} + expected = {pd.Timestamp("2011-01-01"): Index(list(range(364)))} tm.assert_dict_equal(result.groups, expected) # Test case without a list @@ -615,15 +615,15 @@ def test_list_grouper_with_nat(self): [ ( "transform", - pd.Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), + Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), ), ( "agg", - pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), ), ( "apply", - pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), ), ], ) @@ -632,14 +632,14 @@ def test_evaluate_with_empty_groups(self, func, expected): # test transform'ing empty groups # (not testing other agg fns, because they return # different index objects. 
- df = pd.DataFrame({1: [], 2: []}) + df = DataFrame({1: [], 2: []}) g = df.groupby(1) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 - s = pd.Series([], name="name", dtype="float64") + s = Series([], name="name", dtype="float64") gr = s.groupby([]) result = gr.mean() @@ -680,13 +680,13 @@ def test_groupby_level_index_value_all_na(self): def test_groupby_multiindex_level_empty(self): # https://github.com/pandas-dev/pandas/issues/31670 - df = pd.DataFrame( + df = DataFrame( [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"] ) df = df.set_index(["id", "category"]) empty = df[df.value < 0] result = empty.groupby("id").sum() - expected = pd.DataFrame( + expected = DataFrame( dtype="float64", columns=["value"], index=pd.Int64Index([], name="id") ) tm.assert_frame_equal(result, expected) @@ -746,7 +746,7 @@ def test_get_group(self): def test_get_group_empty_bins(self, observed): - d = pd.DataFrame([3, 1, 7, 6]) + d = DataFrame([3, 1, 7, 6]) bins = [0, 5, 10, 15] g = d.groupby(pd.cut(d[0], bins), observed=observed) @@ -778,16 +778,16 @@ def test_get_group_grouped_by_tuple(self): def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () - series = pd.Series(data, index, dtype=object) + series = Series(data, index, dtype=object) grouper = pd.Grouper(freq="D") grouped = series.groupby(grouper) assert next(iter(grouped), None) is None def test_groupby_with_single_column(self): - df = pd.DataFrame({"a": list("abssbab")}) + df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a")) + exp = DataFrame(index=Index(["a", "b", "s"], name="a")) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) tm.assert_frame_equal(df.groupby("a").nth(1), exp) @@ -796,7 +796,7 @@ def test_gb_key_len_equal_axis_len(self): # GH16843 # test ensures that index and column keys are recognized correctly # when number of keys equals axis length of groupby - df = pd.DataFrame( + df = DataFrame( [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]], columns=["first", "second", "third", "one"], ) @@ -905,7 +905,7 @@ def test_dictify(self, df): def test_groupby_with_small_elem(self): # GH 8542 # length=2 - df = pd.DataFrame( + df = DataFrame( {"event": ["start", "start"], "change": [1234, 5678]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), ) @@ -920,7 +920,7 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) - df = pd.DataFrame( + df = DataFrame( {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), ) @@ -936,7 +936,7 @@ def test_groupby_with_small_elem(self): tm.assert_frame_equal(res, df.iloc[[1], :]) # length=3 - df = pd.DataFrame( + df = DataFrame( {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), ) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py new file mode 100644 index 0000000000000..28b740355f351 --- /dev/null +++ b/pandas/tests/groupby/test_libgroupby.py @@ -0,0 +1,237 @@ +import numpy as np + +from pandas._libs import groupby as libgroupby 
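+# These names are the Cython kernel entry points exercised below; each one
+# writes its result in place into a preallocated output buffer rather than
+# returning a new array.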
+from pandas._libs.groupby import ( + group_cumprod_float64, + group_cumsum, + group_var_float32, + group_var_float64, +) + +from pandas.core.dtypes.common import ensure_int64 + +from pandas import isna +import pandas._testing as tm + + +class GroupVarTestMixin: + def test_group_var_generic_1d(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 1))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(15, 1).astype(self.dtype) + labels = np.tile(np.arange(5), (3,)).astype("int64") + + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] + expected_counts = counts + 3 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_1d_flat_labels(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((1, 1))).astype(self.dtype) + counts = np.zeros(1, dtype="int64") + values = 10 * prng.rand(5, 1).astype(self.dtype) + labels = np.zeros(5, dtype="int64") + + expected_out = np.array([[values.std(ddof=1) ** 2]]) + expected_counts = counts + 5 + + self.algo(out, counts, values, labels) + + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_all_finite(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_some_nan(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + values[:, 1] = np.nan + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_constant(self): + # Regression test from GH 10448. 
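+        # A constant-valued group should have variance exactly 0.0; the
+        # assertions below also guard against floating-point cancellation
+        # producing a small negative result.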
+ + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) + labels = np.zeros(3, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 3 + assert out[0, 0] >= 0 + tm.assert_almost_equal(out[0, 0], 0.0) + + +class TestGroupVarFloat64(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(group_var_float64) + dtype = np.float64 + rtol = 1e-5 + + def test_group_var_large_inputs(self): + prng = np.random.RandomState(1234) + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) + values.shape = (10 ** 6, 1) + labels = np.zeros(10 ** 6, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 10 ** 6 + tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) + + +class TestGroupVarFloat32(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(group_var_float32) + dtype = np.float32 + rtol = 1e-2 + + +def test_group_ohlc(): + def _check(dtype): + obj = np.array(np.random.randn(20), dtype=dtype) + + bins = np.array([6, 12, 20]) + out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) + + func = getattr(libgroupby, f"group_ohlc_{dtype}") + func(out, counts, obj[:, None], labels) + + def _ohlc(group): + if isna(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) + + tm.assert_almost_equal(out, expected) + tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) + + obj[:6] = np.nan + func(out, counts, obj[:, None], labels) + expected[0] = np.nan + tm.assert_almost_equal(out, expected) + + _check("float32") + _check("float64") + + +def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): + """ + Check a group transform that executes a cumulative function. + + Parameters + ---------- + pd_op : callable + The pandas cumulative function. + np_op : callable + The analogous one in NumPy. + dtype : type + The specified dtype of the data. 
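+
+    Notes
+    -----
+    Runs the kernel on a single group (all labels are 0) and compares the
+    in-place result, column-wise, against the NumPy reference function.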
+ """ + is_datetimelike = False + + data = np.array([[1], [2], [3], [4]], dtype=dtype) + ans = np.zeros_like(data) + + labels = np.array([0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + pd_op(ans, data, labels, ngroups, is_datetimelike) + + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) + + +def test_cython_group_transform_cumsum(any_real_dtype): + # see gh-4095 + dtype = np.dtype(any_real_dtype).type + pd_op, np_op = group_cumsum, np.cumsum + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_cumprod(): + # see gh-4095 + dtype = np.float64 + pd_op, np_op = group_cumprod_float64, np.cumproduct + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_algos(): + # see gh-4095 + is_datetimelike = False + + # with nans + labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + + data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") + actual = np.zeros_like(data) + actual.fill(np.nan) + group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + actual = np.zeros_like(data) + actual.fill(np.nan) + group_cumsum(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + # timedelta + is_datetimelike = True + data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] + actual = np.zeros_like(data, dtype="int64") + group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + expected = np.array( + [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + ) + tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 70d8dfc20822a..2c2147795bc07 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("func", ["ffill", "bfill"]) def test_groupby_column_index_name_lost_fill_funcs(func): # GH: 29764 groupby loses index sometimes - df = pd.DataFrame( + df = DataFrame( [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], columns=pd.Index(["type", "a", "b"], name="idx"), ) @@ -22,10 +22,10 @@ def test_groupby_column_index_name_lost_fill_funcs(func): @pytest.mark.parametrize("func", ["ffill", "bfill"]) def test_groupby_fill_duplicate_column_names(func): # GH: 25610 ValueError with duplicate column names - df1 = pd.DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]}) - df2 = pd.DataFrame({"field1": [1, np.nan, 4]}) + df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]}) + df2 = DataFrame({"field1": [1, np.nan, 4]}) df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"]) - expected = pd.DataFrame( + expected = DataFrame( [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"] ) result = getattr(df_grouped, func)() @@ -34,7 +34,7 @@ def test_groupby_fill_duplicate_column_names(func): def test_ffill_missing_arguments(): # GH 14955 - df = pd.DataFrame({"a": [1, 2], "b": [1, 1]}) + df = DataFrame({"a": [1, 2], "b": [1, 1]}) with pytest.raises(ValueError, match="Must specify a fill"): df.groupby("b").fillna() @@ -90,7 +90,7 @@ def test_fill_consistency(): def test_ffill_handles_nan_groups(dropna, method, has_nan_group): # GH 34725 
- df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)]) + df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)]) ridx = [-1, 0, -1, -1, 1, -1] df = df_without_nan_rows.reindex(ridx).reset_index(drop=True) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 0cbfbad85a8b6..e82b23054f1cb 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -57,9 +57,7 @@ def test_first_last_nth(df): @pytest.mark.parametrize("method", ["first", "last"]) def test_first_last_with_na_object(method, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 - groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( - "a" - ) + groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") result = getattr(groups, method)() if method == "first": @@ -68,8 +66,8 @@ def test_first_last_with_na_object(method, nulls_fixture): values = [2, 3] values = np.array(values, dtype=result["b"].dtype) - idx = pd.Index([1, 2], name="a") - expected = pd.DataFrame({"b": values}, index=idx) + idx = Index([1, 2], name="a") + expected = DataFrame({"b": values}, index=idx) tm.assert_frame_equal(result, expected) @@ -77,9 +75,7 @@ def test_first_last_with_na_object(method, nulls_fixture): @pytest.mark.parametrize("index", [0, -1]) def test_nth_with_na_object(index, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 - groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( - "a" - ) + groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") result = groups.nth(index) if index == 0: @@ -88,8 +84,8 @@ def test_nth_with_na_object(index, nulls_fixture): values = [2, nulls_fixture] values = np.array(values, dtype=result["b"].dtype) - idx = pd.Index([1, 2], name="a") - expected = pd.DataFrame({"b": values}, index=idx) + idx = Index([1, 2], name="a") + expected = DataFrame({"b": values}, index=idx) tm.assert_frame_equal(result, expected) @@ -142,7 +138,7 @@ def test_first_last_nth_dtypes(df_mixed_floats): def test_first_last_nth_nan_dtype(): # GH 33591 - df = pd.DataFrame({"data": ["A"], "nans": pd.Series([np.nan], dtype=object)}) + df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)}) grouped = df.groupby("data") expected = df.set_index("data").nans @@ -154,7 +150,7 @@ def test_first_last_nth_nan_dtype(): def test_first_strings_timestamps(): # GH 11244 - test = pd.DataFrame( + test = DataFrame( { pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], @@ -386,8 +382,8 @@ def test_first_last_tz(data, expected_first, expected_last): ) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 - category_string = pd.Series(list("abc")).astype("category") - df = pd.DataFrame( + category_string = Series(list("abc")).astype("category") + df = DataFrame( { "group": [1, 1, 2], "category_string": category_string, @@ -395,14 +391,14 @@ def test_first_last_tz_multi_column(method, ts, alpha): } ) result = getattr(df.groupby("group"), method)() - expected = pd.DataFrame( + expected = DataFrame( { "category_string": pd.Categorical( [alpha, "c"], dtype=category_string.dtype ), "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], }, - index=pd.Index([1, 2], name="group"), + index=Index([1, 2], name="group"), ) tm.assert_frame_equal(result, expected) @@ -500,9 +496,7 @@ def test_groupby_head_tail(): tm.assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) 
tm.assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - empty_not_as = DataFrame( - columns=df.columns, index=pd.Index([], dtype=df.index.dtype) - ) + empty_not_as = DataFrame(columns=df.columns, index=Index([], dtype=df.index.dtype)) empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype) empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype) tm.assert_frame_equal(empty_not_as, g_not_as.head(0)) @@ -614,7 +608,7 @@ def test_nth_nan_in_grouper(dropna): columns=list("abc"), ) result = df.groupby("a").nth(0, dropna=dropna) - expected = pd.DataFrame( + expected = DataFrame( [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") ) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index c3347b7ae52f3..7edb358170b50 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ -83,7 +83,7 @@ def test_nunique(): def test_nunique_with_object(): # GH 11077 - data = pd.DataFrame( + data = DataFrame( [ [100, 1, "Alice"], [200, 2, "Bob"], @@ -96,21 +96,21 @@ def test_nunique_with_object(): result = data.groupby(["id", "amount"])["name"].nunique() index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name="name", index=index) + expected = Series([1] * 5, name="name", index=index) tm.assert_series_equal(result, expected) def test_nunique_with_empty_series(): # GH 12553 - data = pd.Series(name="name", dtype=object) + data = Series(name="name", dtype=object) result = data.groupby(level=0).nunique() - expected = pd.Series(name="name", dtype="int64") + expected = Series(name="name", dtype="int64") tm.assert_series_equal(result, expected) def test_nunique_with_timegrouper(): # GH 13453 - test = pd.DataFrame( + test = DataFrame( { "time": [ Timestamp("2016-06-28 09:35:35"), @@ -156,22 +156,22 @@ def test_nunique_with_timegrouper(): ) def test_nunique_with_NaT(key, data, dropna, expected): # GH 27951 - df = pd.DataFrame({"key": key, "data": data}) + df = DataFrame({"key": key, "data": data}) result = df.groupby(["key"])["data"].nunique(dropna=dropna) tm.assert_series_equal(result, expected) def test_nunique_preserves_column_level_names(): # GH 23222 - test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) result = test.groupby([0, 0, 0]).nunique() - expected = pd.DataFrame([2], columns=test.columns) + expected = DataFrame([2], columns=test.columns) tm.assert_frame_equal(result, expected) def test_nunique_transform_with_datetime(): # GH 35109 - transform with nunique on datetimes results in integers - df = pd.DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) + df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) result = df.groupby([0, 0, 1])["date"].transform("nunique") - expected = pd.Series([2, 2, 1], name="date") + expected = Series([2, 2, 1], name="date") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index d2ab016f608fa..1acbc8cf5c0ad 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -42,7 +42,7 @@ def test_pipe_args(): # Test passing args to the pipe method of DataFrameGroupBy. 
# Issue #17871 - df = pd.DataFrame( + df = DataFrame( { "group": ["A", "A", "B", "B", "C"], "x": [1.0, 2.0, 3.0, 2.0, 5.0], @@ -64,7 +64,7 @@ def h(df, arg3): result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) # Assert the results here - index = pd.Index(["A", "B", "C"], name="group") + index = Index(["A", "B", "C"], name="group") expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 9338742195bfe..14b0d9ab60e52 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -54,18 +54,18 @@ def test_quantile(interpolation, a_vals, b_vals, q): def test_quantile_array(): # https://github.com/pandas-dev/pandas/issues/27526 - df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + df = DataFrame({"A": [0, 1, 2, 3, 4]}) result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) index = pd.MultiIndex.from_product([[0, 1], [0.25]]) - expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + expected = DataFrame({"A": [0.25, 2.50]}, index=index) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) - expected = pd.DataFrame( + expected = DataFrame( {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index ) tm.assert_frame_equal(result, expected) @@ -73,11 +73,11 @@ def test_quantile_array(): def test_quantile_array2(): # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 - df = pd.DataFrame( + df = DataFrame( np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") ) result = df.groupby("A").quantile([0.3, 0.7]) - expected = pd.DataFrame( + expected = DataFrame( { "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], @@ -90,16 +90,16 @@ def test_quantile_array2(): def test_quantile_array_no_sort(): - df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) - expected = pd.DataFrame( + expected = DataFrame( {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), ) tm.assert_frame_equal(result, expected) result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) - expected = pd.DataFrame( + expected = DataFrame( {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), ) @@ -107,7 +107,7 @@ def test_quantile_array_no_sort(): def test_quantile_array_multiple_levels(): - df = pd.DataFrame( + df = DataFrame( {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} ) result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) @@ -115,7 +115,7 @@ def test_quantile_array_multiple_levels(): [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], names=["c", "d", None], ) - expected = pd.DataFrame( + expected = DataFrame( {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index ) tm.assert_frame_equal(result, expected) @@ -127,9 +127,7 @@ def test_quantile_array_multiple_levels(): def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): # 
GH30289 nrow, ncol = frame_size - df = pd.DataFrame( - np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) - ) + df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)) idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ @@ -142,7 +140,7 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q ] expected_columns = [x for x in range(ncol) if x not in groupby] - expected = pd.DataFrame( + expected = DataFrame( expected_values, index=expected_index, columns=expected_columns ) result = df.groupby(groupby).quantile(q) @@ -151,9 +149,7 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): - df = pd.DataFrame( - [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] - ) + df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): df.groupby("key").quantile() @@ -161,7 +157,7 @@ def test_quantile_raises(): def test_quantile_out_of_bounds_q_raises(): # https://github.com/pandas-dev/pandas/issues/27470 - df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) g = df.groupby([0, 0, 0, 1, 1, 1]) with pytest.raises(ValueError, match="Got '50.0' instead"): g.quantile(50) @@ -173,7 +169,7 @@ def test_quantile_out_of_bounds_q_raises(): def test_quantile_missing_group_values_no_segfaults(): # GH 28662 data = np.array([1.0, np.nan, 1.0]) - df = pd.DataFrame(dict(key=data, val=range(3))) + df = DataFrame(dict(key=data, val=range(3))) # Random segfaults; would have been guaranteed in loop grp = df.groupby("key") @@ -195,9 +191,9 @@ def test_quantile_missing_group_values_correct_results( key, val, expected_key, expected_val ): # GH 28662, GH 33200, GH 33569 - df = pd.DataFrame({"key": key, "val": val}) + df = DataFrame({"key": key, "val": val}) - expected = pd.DataFrame( + expected = DataFrame( expected_val, index=pd.Index(expected_key, name="key"), columns=["val"] ) @@ -220,7 +216,7 @@ def test_quantile_missing_group_values_correct_results( @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) def test_groupby_quantile_nullable_array(values, q): # https://github.com/pandas-dev/pandas/issues/33136 - df = pd.DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) + df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) result = df.groupby("a")["b"].quantile(q) if isinstance(q, list): @@ -236,7 +232,7 @@ def test_groupby_quantile_nullable_array(values, q): @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) def test_groupby_quantile_skips_invalid_dtype(q): - df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) + df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) result = df.groupby("a").quantile(q) expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 4ccbc6a65fd88..0a1232d3f24da 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -72,7 +72,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): # GH 3881 # make sure API of timegrouper conforms - df = pd.DataFrame( + df = DataFrame( { "Branch": "A A A A A B".split(), "Buyer": "Carl 
Mark Carl Joe Joe Carl".split(), @@ -403,12 +403,12 @@ def test_timegrouper_apply_return_type_series(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 - df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) + df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_series(x): - return pd.Series([x["value"].sum()], ("sum",)) + return Series([x["value"].sum()], ("sum",)) expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series) result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series) @@ -420,7 +420,7 @@ def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 - df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) + df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() df_dt["date"] = pd.to_datetime(df_dt["date"]) @@ -448,7 +448,7 @@ def test_groupby_groups_datetimeindex(self): # GH#11442 index = pd.date_range("2015/01/01", periods=5, name="date") - df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) + df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) result = df.groupby(level="date").groups dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"] expected = { @@ -461,7 +461,7 @@ def test_groupby_groups_datetimeindex(self): result = grouped.get_group(date) data = [[df.loc[date, "A"], df.loc[date, "B"]]] expected_index = pd.DatetimeIndex([date], name="date", freq="D") - expected = pd.DataFrame(data, columns=list("AB"), index=expected_index) + expected = DataFrame(data, columns=list("AB"), index=expected_index) tm.assert_frame_equal(result, expected) def test_groupby_groups_datetimeindex_tz(self): @@ -671,7 +671,7 @@ def test_groupby_with_timezone_selection(self): # GH 11616 # Test that column selection returns output in correct timezone. 
np.random.seed(42) - df = pd.DataFrame( + df = DataFrame( { "factor": np.random.randint(0, 3, size=60), "time": pd.date_range( @@ -687,9 +687,9 @@ def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) + df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) assert df["b"][0].tzinfo == pytz.utc - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) df["b"] = datetime.now(pytz.utc) assert df["b"][0].tzinfo == pytz.utc @@ -733,7 +733,7 @@ def test_first_last_max_min_on_time_data(self): def test_nunique_with_timegrouper_and_nat(self): # GH 17575 - test = pd.DataFrame( + test = DataFrame( { "time": [ Timestamp("2016-06-28 09:35:35"), @@ -754,13 +754,13 @@ def test_scalar_call_versus_list_call(self): # Issue: 17530 data_frame = { "location": ["shanghai", "beijing", "shanghai"], - "time": pd.Series( + "time": Series( ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], dtype="datetime64[ns]", ), "value": [1, 2, 3], } - data_frame = pd.DataFrame(data_frame).set_index("time") + data_frame = DataFrame(data_frame).set_index("time") grouper = pd.Grouper(freq="D") grouped = data_frame.groupby(grouper) @@ -776,10 +776,10 @@ def test_grouper_period_index(self): index = pd.period_range( start="2018-01", periods=periods, freq="M", name="Month" ) - period_series = pd.Series(range(periods), index=index) + period_series = Series(range(periods), index=index) result = period_series.groupby(period_series.index.month).sum() - expected = pd.Series( + expected = Series( range(0, periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 97be039e16ebb..cd3c2771db8a4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._libs.groupby import group_cumprod_float64, group_cumsum - from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype import pandas as pd @@ -87,7 +85,7 @@ def test_transform_fast(): grp = df.groupby("id")["val"] values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name="val") + expected = Series(values, index=df.index, name="val") result = grp.transform(np.mean) tm.assert_series_equal(result, expected) @@ -96,7 +94,7 @@ def test_transform_fast(): tm.assert_series_equal(result, expected) # GH 12737 - df = pd.DataFrame( + df = DataFrame( { "grouping": [0, 1, 1, 3], "f": [1.1, 2.1, 3.1, 4.5], @@ -113,7 +111,7 @@ def test_transform_fast(): pd.Timestamp("2014-1-2"), pd.Timestamp("2014-1-4"), ] - expected = pd.DataFrame( + expected = DataFrame( {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, columns=["f", "i", "d"], ) @@ -125,7 +123,7 @@ def test_transform_fast(): tm.assert_frame_equal(result, expected) # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) result = df.groupby("g").transform("first") expected = df.drop("g", axis=1) tm.assert_frame_equal(result, expected) @@ -221,13 +219,13 @@ def test_transform_bug(): def test_transform_numeric_to_boolean(): # GH 16875 # inconsistency in transforming boolean values - expected = pd.Series([True, True], name="A") + 
expected = Series([True, True], name="A") - df = pd.DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) + df = DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) result = df.groupby("B").A.transform(lambda x: True) tm.assert_series_equal(result, expected) - df = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + df = DataFrame({"A": [1, 2], "B": [1, 2]}) result = df.groupby("B").A.transform(lambda x: True) tm.assert_series_equal(result, expected) @@ -236,7 +234,7 @@ def test_transform_datetime_to_timedelta(): # GH 15429 # transforming a datetime to timedelta df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) - expected = pd.Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A") + expected = Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A") # this does date math without changing result type in transform base_time = df["A"][0] @@ -389,7 +387,7 @@ def test_transform_function_aliases(df): def test_series_fast_transform_date(): # GH 13191 - df = pd.DataFrame( + df = DataFrame( {"grouping": [np.nan, 1, 1, 3], "d": pd.date_range("2014-1-1", "2014-1-4")} ) result = df.groupby("grouping")["d"].transform("first") @@ -399,14 +397,14 @@ def test_series_fast_transform_date(): pd.Timestamp("2014-1-2"), pd.Timestamp("2014-1-4"), ] - expected = pd.Series(dates, name="d") + expected = Series(dates, name="d") tm.assert_series_equal(result, expected) def test_transform_length(): # GH 9697 - df = pd.DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) - expected = pd.Series([3.0] * 4) + df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) + expected = Series([3.0] * 4) def nsum(x): return np.nansum(x) @@ -426,7 +424,7 @@ def test_transform_coercion(): # 14457 # when we are transforming be sure to not coerce # via assignment - df = pd.DataFrame(dict(A=["a", "a"], B=[0, 1])) + df = DataFrame(dict(A=["a", "a"], B=[0, 1])) g = df.groupby("A") expected = g.transform(np.mean) @@ -482,11 +480,9 @@ def test_groupby_transform_with_int(): def test_groupby_transform_with_nan_group(): # GH 9941 - df = pd.DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) result = df.groupby(df.b)["a"].transform(max) - expected = pd.Series( - [1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a" - ) + expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a") tm.assert_series_equal(result, expected) @@ -517,83 +513,6 @@ def f(group): tm.assert_frame_equal(res, result.loc[key]) -def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): - """ - Check a group transform that executes a cumulative function. - - Parameters - ---------- - pd_op : callable - The pandas cumulative function. - np_op : callable - The analogous one in NumPy. - dtype : type - The specified dtype of the data. 
- """ - is_datetimelike = False - - data = np.array([[1], [2], [3], [4]], dtype=dtype) - ans = np.zeros_like(data) - - labels = np.array([0, 0, 0, 0], dtype=np.int64) - ngroups = 1 - pd_op(ans, data, labels, ngroups, is_datetimelike) - - tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) - - -def test_cython_group_transform_cumsum(any_real_dtype): - # see gh-4095 - dtype = np.dtype(any_real_dtype).type - pd_op, np_op = group_cumsum, np.cumsum - _check_cython_group_transform_cumulative(pd_op, np_op, dtype) - - -def test_cython_group_transform_cumprod(): - # see gh-4095 - dtype = np.float64 - pd_op, np_op = group_cumprod_float64, np.cumproduct - _check_cython_group_transform_cumulative(pd_op, np_op, dtype) - - -def test_cython_group_transform_algos(): - # see gh-4095 - is_datetimelike = False - - # with nans - labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) - ngroups = 1 - - data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") - actual = np.zeros_like(data) - actual.fill(np.nan) - group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") - tm.assert_numpy_array_equal(actual[:, 0], expected) - - actual = np.zeros_like(data) - actual.fill(np.nan) - group_cumsum(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") - tm.assert_numpy_array_equal(actual[:, 0], expected) - - # timedelta - is_datetimelike = True - data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] - actual = np.zeros_like(data, dtype="int64") - group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) - expected = np.array( - [ - np.timedelta64(1, "ns"), - np.timedelta64(2, "ns"), - np.timedelta64(3, "ns"), - np.timedelta64(4, "ns"), - np.timedelta64(5, "ns"), - ] - ) - tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) - - @pytest.mark.parametrize( "op, args, targop", [ @@ -625,7 +544,7 @@ def test_cython_transform_series(op, args, targop): "input, exp", [ # When everything is NaN - ({"key": ["b"] * 10, "value": np.nan}, pd.Series([np.nan] * 10, name="value")), + ({"key": ["b"] * 10, "value": np.nan}, Series([np.nan] * 10, name="value")), # When there is a single NaN ( {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8}, @@ -665,13 +584,13 @@ def test_cython_transform_series(op, args, targop): ], ) def test_groupby_cum_skipna(op, skipna, input, exp): - df = pd.DataFrame(input) + df = DataFrame(input) result = df.groupby("key")["value"].transform(op, skipna=skipna) if isinstance(exp, dict): expected = exp[(op, skipna)] else: expected = exp - expected = pd.Series(expected, name="value") + expected = Series(expected, name="value") tm.assert_series_equal(expected, result) @@ -780,7 +699,7 @@ def test_transform_with_non_scalar_group(): ("non", "G"), ] ) - df = pd.DataFrame( + df = DataFrame( np.random.randint(1, 10, (4, 12)), columns=cols, index=["A", "C", "G", "T"] ) @@ -792,10 +711,10 @@ def test_transform_with_non_scalar_group(): @pytest.mark.parametrize( "cols,exp,comp_func", [ - ("a", pd.Series([1, 1, 1], name="a"), tm.assert_series_equal), + ("a", Series([1, 1, 1], name="a"), tm.assert_series_equal), ( ["a", "c"], - pd.DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), + DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), tm.assert_frame_equal, ), ], @@ -809,7 +728,7 @@ def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): request.node.add_marker(pytest.mark.xfail(reason=reason)) # GH 19200 
- df = pd.DataFrame( + df = DataFrame( {"a": pd.date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} ) @@ -892,7 +811,7 @@ def test_pad_stable_sorting(fill_method): if fill_method == "bfill": y = y[::-1] - df = pd.DataFrame({"x": x, "y": y}) + df = DataFrame({"x": x, "y": y}) expected = df.drop("x", 1) result = getattr(df.groupby("x"), fill_method)() @@ -980,11 +899,11 @@ def test_ffill_bfill_non_unique_multilevel(func, expected_status): @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 - df = pd.DataFrame( + df = DataFrame( [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"] ) - exp = pd.Series([True, np.nan, True], name="val") + exp = Series([True, np.nan, True], name="val") res = df.groupby("key")["val"].transform(func) tm.assert_series_equal(res, exp) @@ -1002,8 +921,8 @@ def demean_rename(x): return result - df = pd.DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) - expected = pd.DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) + df = DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) + expected = DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) result = df.groupby("group").transform(demean_rename) tm.assert_frame_equal(result, expected) @@ -1015,9 +934,9 @@ def demean_rename(x): def test_groupby_transform_timezone_column(func): # GH 24198 ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") - result = pd.DataFrame({"end_time": [ts], "id": [1]}) + result = DataFrame({"end_time": [ts], "id": [1]}) result["max_end_time"] = result.groupby("id").end_time.transform(func) - expected = pd.DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) + expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) tm.assert_frame_equal(result, expected) @@ -1032,12 +951,12 @@ def test_groupby_transform_with_datetimes(func, values): # GH 15306 dates = pd.date_range("1/1/2011", periods=10, freq="D") - stocks = pd.DataFrame({"price": np.arange(10.0)}, index=dates) + stocks = DataFrame({"price": np.arange(10.0)}, index=dates) stocks["week_id"] = dates.isocalendar().week result = stocks.groupby(stocks["week_id"])["price"].transform(func) - expected = pd.Series(data=pd.to_datetime(values), index=dates, name="price") + expected = Series(data=pd.to_datetime(values), index=dates, name="price") tm.assert_series_equal(result, expected) @@ -1059,7 +978,7 @@ def test_transform_absent_categories(func): @pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))]) def test_ffill_not_in_axis(func, key, val): # GH 21521 - df = pd.DataFrame([[np.nan]]) + df = DataFrame([[np.nan]]) result = getattr(df.groupby(**{key: val}), func)() expected = df @@ -1145,7 +1064,7 @@ def test_transform_fastpath_raises(): # GH#29631 case where fastpath defined in groupby.generic _choose_path # raises, but slow_path does not - df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + df = DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) gb = df.groupby("A") def func(grp): @@ -1167,13 +1086,13 @@ def func(grp): result = gb.transform(func) - expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) + expected = DataFrame([2, -2, 2, 4], columns=["B"]) tm.assert_frame_equal(result, expected) def test_transform_lambda_indexing(): # GH 7883 - df = pd.DataFrame( + df = DataFrame( { "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], "B": ["one", "one", "two", "three", "two", "six", "five", "three"], @@ -1213,14 +1132,14 @@ def 
test_categorical_and_not_categorical_key(observed): # and a non-categorical key, doesn't try to expand the output to include # non-observed categories but instead matches the input shape. # GH 32494 - df_with_categorical = pd.DataFrame( + df_with_categorical = DataFrame( { "A": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]), "B": [1, 2, 3], "C": ["a", "b", "a"], } ) - df_without_categorical = pd.DataFrame( + df_without_categorical = DataFrame( {"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]} ) @@ -1228,7 +1147,7 @@ def test_categorical_and_not_categorical_key(observed): result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum") expected = df_without_categorical.groupby(["A", "C"]).transform("sum") tm.assert_frame_equal(result, expected) - expected_explicit = pd.DataFrame({"B": [4, 2, 4]}) + expected_explicit = DataFrame({"B": [4, 2, 4]}) tm.assert_frame_equal(result, expected_explicit) # Series case @@ -1237,5 +1156,5 @@ def test_categorical_and_not_categorical_key(observed): ) expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") tm.assert_series_equal(result, expected) - expected_explicit = pd.Series([4, 2, 4], name="B") + expected_explicit = Series([4, 2, 4], name="B") tm.assert_series_equal(result, expected_explicit) diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py index b2fa8f31ee5ec..fd04a820037b9 100644 --- a/pandas/tests/indexes/base_class/test_indexing.py +++ b/pandas/tests/indexes/base_class/test_indexing.py @@ -1,6 +1,8 @@ +import numpy as np import pytest from pandas import Index +import pandas._testing as tm class TestGetSliceBounds: @@ -24,3 +26,11 @@ def test_get_slice_bounds_outside(self, kind, side, expected, data, bound): def test_get_slice_bounds_invalid_side(self): with pytest.raises(ValueError, match="Invalid value for side kwarg"): Index([]).get_slice_bound("a", kind=None, side="middle") + + +class TestGetIndexerNonUnique: + def test_get_indexer_non_unique_dtype_mismatch(self): + # GH#25459 + indexes, missing = Index(["A", "B"]).get_indexer_non_unique(Index([0])) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 61826f2403a4b..5ebab965e6f04 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -3,7 +3,6 @@ """ import pytest -import pandas as pd from pandas import Index import pandas._testing as tm @@ -11,8 +10,8 @@ class TestReshape: def test_repeat(self): repeats = 2 - index = pd.Index([1, 2, 3]) - expected = pd.Index([1, 1, 2, 2, 3, 3]) + index = Index([1, 2, 3]) + expected = Index([1, 1, 2, 2, 3, 3]) result = index.repeat(repeats) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 77b5e2780464d..94c6d2ad6dc95 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -12,14 +12,14 @@ class TestIndexSetOps: "method", ["union", "intersection", "difference", "symmetric_difference"] ) def test_setops_disallow_true(self, method): - idx1 = pd.Index(["a", "b"]) - idx2 = pd.Index(["b", "c"]) + idx1 = Index(["a", "b"]) + idx2 = Index(["b", "c"]) with pytest.raises(ValueError, match="The 'sort' keyword only takes"): 
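A minimal sketch of the restriction exercised by test_setops_disallow_true above: as of this PR, Index set operations accept only sort=None or sort=False, and sort=True raises (the error text in the comment is abbreviated from the message matched by the test):

import pandas as pd

idx1 = pd.Index(["a", "b"])
idx2 = pd.Index(["b", "c"])
idx1.union(idx2, sort=None)   # allowed: sorts the result when the values are sortable
idx1.union(idx2, sort=False)  # allowed: no forced sort
try:
    idx1.union(idx2, sort=True)
except ValueError as err:
    print(err)  # "The 'sort' keyword only takes the values of None or False; ..."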
getattr(idx1, method)(idx2, sort=True) def test_setops_preserve_object_dtype(self): - idx = pd.Index([1, 2, 3], dtype=object) + idx = Index([1, 2, 3], dtype=object) result = idx.intersection(idx[1:]) expected = idx[1:] tm.assert_index_equal(result, expected) @@ -67,7 +67,7 @@ def test_union_different_type_base(self, klass): def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.Index([1, pd.Timestamp("2000")]) + idx = Index([1, pd.Timestamp("2000")]) # default (sort=None) with tm.assert_produces_warning(RuntimeWarning): result = idx.union(idx[:1]) @@ -87,7 +87,7 @@ def test_union_sort_other_incomparable(self): def test_union_sort_other_incomparable_true(self): # TODO decide on True behaviour # sort=True - idx = pd.Index([1, pd.Timestamp("2000")]) + idx = Index([1, pd.Timestamp("2000")]) with pytest.raises(TypeError, match=".*"): idx.union(idx[:1], sort=True) @@ -112,12 +112,12 @@ def test_intersection_different_type_base(self, klass, sort): assert tm.equalContents(result, second) def test_intersect_nosort(self): - result = pd.Index(["c", "b", "a"]).intersection(["b", "a"]) - expected = pd.Index(["b", "a"]) + result = Index(["c", "b", "a"]).intersection(["b", "a"]) + expected = Index(["b", "a"]) tm.assert_index_equal(result, expected) def test_intersection_equal_sort(self): - idx = pd.Index(["c", "a", "b"]) + idx = Index(["c", "a", "b"]) tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 81b31e3ea180c..2bfe9bf0194ec 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -42,7 +42,7 @@ def test_can_hold_identifiers(self): def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError - idx = pd.Index(pd.Categorical(["a", "b"])) + idx = Index(pd.Categorical(["a", "b"])) cat_or_list = "'(Categorical|list)' and '(Categorical|list)'" msg = "|".join( [ diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 9cf901c0797d8..c720547aab3f8 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index, IntervalIndex +from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp import pandas._testing as tm @@ -251,6 +251,32 @@ def test_get_indexer(self): with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") + def test_get_indexer_array(self): + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") + result = ci.get_indexer(arr) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # 
https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + class TestWhere: @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py index 6cef555275444..1a326c1acea46 100644 --- a/pandas/tests/indexes/categorical/test_map.py +++ b/pandas/tests/indexes/categorical/test_map.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index +from pandas import CategoricalIndex, Index, Series import pandas._testing as tm @@ -56,7 +56,7 @@ def f(x): ) tm.assert_index_equal(result, exp) - result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) + result = ci.map(Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) result = ci.map({"A": 10, "B": 20, "C": 30}) @@ -64,13 +64,13 @@ def f(x): def test_map_with_categorical_series(self): # GH 12756 - a = pd.Index([1, 2, 3, 4]) - b = pd.Series(["even", "odd", "even", "odd"], dtype="category") - c = pd.Series(["even", "odd", "even", "odd"]) + a = Index([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(b), exp) - exp = pd.Index(["odd", "even", "odd", np.nan]) + exp = Index(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(c), exp) @pytest.mark.parametrize( @@ -80,8 +80,8 @@ def test_map_with_categorical_series(self): ([1, 2, np.nan], pd.isna), ([1, 1, np.nan], {1: False}), ([1, 2, np.nan], {1: False, 2: False}), - ([1, 1, np.nan], pd.Series([False, False])), - ([1, 2, np.nan], pd.Series([False, False, False])), + ([1, 1, np.nan], Series([False, False])), + ([1, 2, np.nan], Series([False, False, False])), ), ) def test_map_with_nan(self, data, f): # GH 24241 @@ -91,5 +91,21 @@ def test_map_with_nan(self, data, f): # GH 24241 expected = pd.Categorical([False, False, np.nan]) tm.assert_categorical_equal(result, expected) else: - expected = pd.Index([False, False, np.nan]) + expected = Index([False, False, np.nan]) tm.assert_index_equal(result, expected) + + def test_map_with_dict_or_series(self): + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = CategoricalIndex(orig_values, name="XXX") + expected = CategoricalIndex(new_values, name="XXX", categories=[3.0, 2, "one"]) + + mapper = Series(new_values[:-1], index=orig_values[:-1]) + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) + + mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index bc178c138341f..b20026b20e1d7 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -93,15 +93,15 @@ def test_create_index_existing_name(self): expected = self.create_index() if not isinstance(expected, MultiIndex): expected.name = "foo" - result = pd.Index(expected) + result = Index(expected) tm.assert_index_equal(result, expected) - result = pd.Index(expected, name="bar") + result = Index(expected, 
name="bar") expected.name = "bar" tm.assert_index_equal(result, expected) else: expected.names = ["foo", "bar"] - result = pd.Index(expected) + result = Index(expected) tm.assert_index_equal( result, Index( @@ -120,7 +120,7 @@ def test_create_index_existing_name(self): ), ) - result = pd.Index(expected, names=["A", "B"]) + result = Index(expected, names=["A", "B"]) tm.assert_index_equal( result, Index( @@ -410,12 +410,12 @@ def test_take_invalid_kwargs(self): def test_repeat(self): rep = 2 i = self.create_index() - expected = pd.Index(i.values.repeat(rep), name=i.name) + expected = Index(i.values.repeat(rep), name=i.name) tm.assert_index_equal(i.repeat(rep), expected) i = self.create_index() rep = np.arange(len(i)) - expected = pd.Index(i.values.repeat(rep), name=i.name) + expected = Index(i.values.repeat(rep), name=i.name) tm.assert_index_equal(i.repeat(rep), expected) def test_numpy_repeat(self): @@ -441,7 +441,7 @@ def test_where(self, klass): tm.assert_index_equal(result, expected) cond = [False] + [True] * len(i[1:]) - expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) + expected = Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) result = i.where(klass(cond)) tm.assert_index_equal(result, expected) @@ -803,7 +803,7 @@ def test_map(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index), + lambda values, index: Series(values, index), ], ) def test_map_dictlike(self, mapper): @@ -824,7 +824,7 @@ def test_map_dictlike(self, mapper): tm.assert_index_equal(result, expected) # empty mappable - expected = pd.Index([np.nan] * len(index)) + expected = Index([np.nan] * len(index)) result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 3e7e76bba0dde..13b658b31e3ee 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -179,7 +179,7 @@ def test_astype_object_tz(self, tz): Timestamp("2013-03-31", tz=tz), Timestamp("2013-04-30", tz=tz), ] - expected = pd.Index(expected_list, dtype=object, name="idx") + expected = Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list @@ -195,7 +195,7 @@ def test_astype_object_with_nat(self): pd.NaT, Timestamp("2013-01-04"), ] - expected = pd.Index(expected_list, dtype=object, name="idx") + expected = Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list @@ -269,7 +269,7 @@ def _check_rng(rng): def test_integer_index_astype_datetime(self, tz, dtype): # GH 20997, 20964, 24559 val = [pd.Timestamp("2018-01-01", tz=tz).value] - result = pd.Index(val, name="idx").astype(dtype) + result = Index(val, name="idx").astype(dtype) expected = pd.DatetimeIndex(["2018-01-01"], tz=tz, name="idx") tm.assert_index_equal(result, expected) @@ -304,7 +304,7 @@ def test_astype_category(self, tz): def test_astype_array_fallback(self, tz): obj = pd.date_range("2000", periods=2, tz=tz, name="idx") result = obj.astype(bool) - expected = pd.Index(np.array([True, True]), name="idx") + expected = Index(np.array([True, True]), name="idx") tm.assert_index_equal(result, expected) result = obj._data.astype(bool) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py 
b/pandas/tests/indexes/datetimes/test_constructors.py index d3c79f231449a..0c562e7b8f848 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -464,12 +464,12 @@ def test_construction_dti_with_mixed_timezones(self): def test_construction_base_constructor(self): arr = [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] - tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) arr = [np.nan, pd.NaT, pd.Timestamp("2011-01-03")] - tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) def test_construction_outofbounds(self): # GH 13663 @@ -856,7 +856,7 @@ def test_constructor_no_precision_raises(self): pd.DatetimeIndex(["2000"], dtype="datetime64") with pytest.raises(ValueError, match=msg): - pd.Index(["2000"], dtype="datetime64") + Index(["2000"], dtype="datetime64") def test_constructor_wrong_precision_raises(self): msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'" diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index cb12365f38605..e0506df5cd939 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -618,6 +618,16 @@ def test_get_indexer_out_of_bounds_date(self, target, positions): expected = np.array(positions, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_pad_requires_monotonicity(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] + + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + rng2.get_indexer(rng, method="pad") + class TestMaybeCastSliceBound: def test_maybe_cast_slice_bounds_empty(self): diff --git a/pandas/tests/indexes/datetimes/test_insert.py b/pandas/tests/indexes/datetimes/test_insert.py index b4f6cc3798f4f..d2c999f61b4bb 100644 --- a/pandas/tests/indexes/datetimes/test_insert.py +++ b/pandas/tests/indexes/datetimes/test_insert.py @@ -21,7 +21,8 @@ def test_insert_nat(self, tz, null): @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_invalid_na(self, tz): idx = DatetimeIndex(["2017-01-01"], tz=tz) - with pytest.raises(TypeError, match="incompatible label"): + msg = "value should be a 'Timestamp' or 'NaT'. Got 'timedelta64' instead." + with pytest.raises(TypeError, match=msg): idx.insert(0, np.timedelta64("NaT")) def test_insert_empty_preserves_freq(self, tz_naive_fixture): @@ -174,7 +175,7 @@ def test_insert_mismatched_types_raises(self, tz_aware_fixture, item): tz = tz_aware_fixture dti = date_range("2019-11-04", periods=9, freq="-1D", name=9, tz=tz) - msg = "incompatible label" + msg = "value should be a 'Timestamp' or 'NaT'. 
Got '.*' instead" with pytest.raises(TypeError, match=msg): dti.insert(1, item) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 51841727d510b..375a88f2f2634 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -375,7 +375,7 @@ def test_datetime_name_accessors(self, time_locale): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) + tm.assert_index_equal(dti.nanosecond, Index(np.arange(10, dtype=np.int64))) def test_iter_readonly(): diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index ea6381547009c..4c632aba51c45 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -328,7 +328,7 @@ def test_equals(self): assert idx.astype(object).equals(idx) assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) - assert not idx.equals(pd.Series(idx)) + assert not idx.equals(Series(idx)) idx2 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"], tz="US/Pacific") assert not idx.equals(idx2) @@ -336,7 +336,7 @@ def test_equals(self): assert not idx.equals(idx2.astype(object)) assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) - assert not idx.equals(pd.Series(idx2)) + assert not idx.equals(Series(idx2)) # same internal, different tz idx3 = pd.DatetimeIndex(idx.asi8, tz="US/Pacific") @@ -346,10 +346,10 @@ def test_equals(self): assert not idx.equals(idx3.astype(object)) assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) - assert not idx.equals(pd.Series(idx3)) + assert not idx.equals(Series(idx3)) # check that we do not raise when comparing with OutOfBounds objects - oob = pd.Index([datetime(2500, 1, 1)] * 3, dtype=object) + oob = Index([datetime(2500, 1, 1)] * 3, dtype=object) assert not idx.equals(oob) assert not idx2.equals(oob) assert not idx3.equals(oob) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 0d39e034905d2..d6016b9e14743 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -38,7 +38,9 @@ def test_dti_date_out_of_range(self, data): "field", [ "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "days_in_month", "is_month_start", diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index a8baf67273490..c02441b5883df 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -201,7 +201,7 @@ def test_intersection2(self): third = Index(["a", "b", "c"]) result = first.intersection(third) - expected = pd.Index([], dtype=object) + expected = Index([], dtype=object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index d661a56311e6c..bd4926880c13d 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -127,9 +127,9 @@ def test_append_mixed_dtypes(): [1, 2, 3, "x", "y", "z"], [1.1, np.nan, 3.3, "x", "y", "z"], ["a", "b", "c", "x", "y", "z"], - dti.append(pd.Index(["x", "y", "z"])), - dti_tz.append(pd.Index(["x", "y", "z"])), - pi.append(pd.Index(["x", 
"y", "z"])), + dti.append(Index(["x", "y", "z"])), + dti_tz.append(Index(["x", "y", "z"])), + pi.append(Index(["x", "y", "z"])), ] ) tm.assert_index_equal(res, exp) @@ -203,7 +203,7 @@ def test_map_dictlike(idx, mapper): tm.assert_index_equal(result, expected) # empty mappable - expected = pd.Index([np.nan] * len(idx)) + expected = Index([np.nan] * len(idx)) result = idx.map(mapper(expected, idx)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 16af884c89e9e..63afd5e130508 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -196,7 +196,7 @@ def test_from_arrays_index_series_datetimetz(): tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + result2 = pd.MultiIndex.from_arrays([Series(idx1), Series(idx2)]) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) @@ -210,7 +210,7 @@ def test_from_arrays_index_series_timedelta(): tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + result2 = pd.MultiIndex.from_arrays([Series(idx1), Series(idx2)]) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) @@ -224,7 +224,7 @@ def test_from_arrays_index_series_period(): tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + result2 = pd.MultiIndex.from_arrays([Series(idx1), Series(idx2)]) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) @@ -244,7 +244,7 @@ def test_from_arrays_index_datetimelike_mixed(): tm.assert_index_equal(result.get_level_values(3), idx4) result2 = pd.MultiIndex.from_arrays( - [pd.Series(idx1), pd.Series(idx2), pd.Series(idx3), pd.Series(idx4)] + [Series(idx1), Series(idx2), Series(idx3), Series(idx4)] ) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) @@ -263,7 +263,7 @@ def test_from_arrays_index_series_categorical(): tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + result2 = pd.MultiIndex.from_arrays([Series(idx1), Series(idx2)]) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) @@ -340,8 +340,8 @@ def test_from_arrays_different_lengths(idx1, idx2): def test_from_arrays_respects_none_names(): # GH27292 - a = pd.Series([1, 2, 3], name="foo") - b = pd.Series(["a", "b", "c"], name="bar") + a = Series([1, 2, 3], name="foo") + b = Series(["a", "b", "c"], name="bar") result = MultiIndex.from_arrays([a, b], names=None) expected = MultiIndex( @@ -402,9 +402,9 @@ def test_tuples_with_name_string(): li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] msg = "Names should be list-like for a MultiIndex" with pytest.raises(ValueError, match=msg): - pd.Index(li, name="abc") + Index(li, name="abc") with pytest.raises(ValueError, match=msg): - pd.Index(li, name="a") + Index(li, name="a") def 
test_from_tuples_with_tuple_label(): @@ -429,7 +429,7 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) - expected = pd.Index([], name="A") + expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["A"] @@ -478,7 +478,7 @@ def test_from_product_datetimeindex(): @pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize("f", [lambda x: x, lambda x: pd.Series(x), lambda x: x.values]) +@pytest.mark.parametrize("f", [lambda x: x, lambda x: Series(x), lambda x: x.values]) def test_from_product_index_series_categorical(ordered, f): # GH13743 first = ["foo", "bar"] @@ -547,11 +547,11 @@ def test_from_product_iterator(): "a, b, expected_names", [ ( - pd.Series([1, 2, 3], name="foo"), - pd.Series(["a", "b"], name="bar"), + Series([1, 2, 3], name="foo"), + Series(["a", "b"], name="bar"), ["foo", "bar"], ), - (pd.Series([1, 2, 3], name="foo"), ["a", "b"], ["foo", None]), + (Series([1, 2, 3], name="foo"), ["a", "b"], ["foo", None]), ([1, 2, 3], ["a", "b"], None), ], ) @@ -568,8 +568,8 @@ def test_from_product_infer_names(a, b, expected_names): def test_from_product_respects_none_names(): # GH27292 - a = pd.Series([1, 2, 3], name="foo") - b = pd.Series(["a", "b"], name="bar") + a = Series([1, 2, 3], name="foo") + b = Series(["a", "b"], name="bar") result = MultiIndex.from_product([a, b], names=None) expected = MultiIndex( @@ -597,7 +597,7 @@ def test_create_index_existing_name(idx): # specified, the new index should inherit the previous object name index = idx index.names = ["foo", "bar"] - result = pd.Index(index) + result = Index(index) expected = Index( Index( [ @@ -613,7 +613,7 @@ def test_create_index_existing_name(idx): ) tm.assert_index_equal(result, expected) - result = pd.Index(index, name="A") + result = Index(index, name="A") expected = Index( Index( [ @@ -649,10 +649,10 @@ def test_from_frame(): @pytest.mark.parametrize( "non_frame", [ - pd.Series([1, 2, 3, 4]), + Series([1, 2, 3, 4]), [1, 2, 3, 4], [[1, 2], [3, 4], [5, 6]], - pd.Index([1, 2, 3, 4]), + Index([1, 2, 3, 4]), np.array([[1, 2], [3, 4], [5, 6]]), 27, ], diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 3519c5d0d5a9a..c80548783d148 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -98,7 +98,7 @@ def test_to_frame_dtype_fidelity(): ) original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} - expected_df = pd.DataFrame( + expected_df = DataFrame( { "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), "a": [1, 1, 1, 2, 2, 2], diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 184cedea7dc5c..e1b011b762fe7 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -20,7 +20,7 @@ def test_equals(idx): if idx.nlevels == 1: # do not test MultiIndex - assert not idx.equals(pd.Series(idx)) + assert not idx.equals(Series(idx)) def test_equals_op(idx): @@ -84,6 +84,46 @@ def test_equals_op(idx): tm.assert_series_equal(series_a == item, Series(expected3)) +def test_compare_tuple(): + # GH#21517 + mi = MultiIndex.from_product([[1, 2]] * 2) + + all_false = np.array([False, False, False, False]) + + result = mi == mi[0] + expected = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, 
expected) + + result = mi != mi[0] + tm.assert_numpy_array_equal(result, ~expected) + + result = mi < mi[0] + tm.assert_numpy_array_equal(result, all_false) + + result = mi <= mi[0] + tm.assert_numpy_array_equal(result, expected) + + result = mi > mi[0] + tm.assert_numpy_array_equal(result, ~expected) + + result = mi >= mi[0] + tm.assert_numpy_array_equal(result, ~all_false) + + +def test_compare_tuple_strs(): + # GH#34180 + + mi = MultiIndex.from_tuples([("a", "b"), ("b", "c"), ("c", "a")]) + + result = mi == ("c", "a") + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = mi == ("c",) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_equals_multi(idx): assert idx.equals(idx) assert not idx.equals(idx.values) @@ -218,11 +258,11 @@ def test_multiindex_compare(): midx = pd.MultiIndex.from_product([[0, 1]]) # Equality self-test: MultiIndex object vs self - expected = pd.Series([True, True]) - result = pd.Series(midx == midx) + expected = Series([True, True]) + result = Series(midx == midx) tm.assert_series_equal(result, expected) # Greater than comparison: MultiIndex object vs self - expected = pd.Series([False, False]) - result = pd.Series(midx > midx) + expected = Series([False, False]) + result = Series(midx > midx) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py index 985fe5773ceed..ec65ec2c54689 100644 --- a/pandas/tests/indexes/multi/test_get_level_values.py +++ b/pandas/tests/indexes/multi/test_get_level_values.py @@ -44,11 +44,11 @@ def test_get_level_values_all_na(): arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) - expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) + expected = Index([np.nan, np.nan, np.nan], dtype=np.float64) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(["a", np.nan, 1], dtype=object) + expected = Index(["a", np.nan, 1], dtype=object) tm.assert_index_equal(result, expected) @@ -71,11 +71,11 @@ def test_get_level_values_na(): arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) - expected = pd.Index([np.nan, np.nan, np.nan]) + expected = Index([np.nan, np.nan, np.nan]) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(["a", np.nan, 1]) + expected = Index(["a", np.nan, 1]) tm.assert_index_equal(result, expected) arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] @@ -87,7 +87,7 @@ def test_get_level_values_na(): arrays = [[], []] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) - expected = pd.Index([], dtype=object) + expected = Index([], dtype=object) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index b9132f429905d..6f79878fd3ab1 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -27,6 +27,15 @@ def test_get_level_number_integer(idx): idx._get_level_number("fourth") +def test_get_level_number_out_of_bounds(multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + with pytest.raises(IndexError, match="Too many levels"): + frame.index._get_level_number(2) + with 
pytest.raises(IndexError, match="not a valid level number"): + frame.index._get_level_number(-3) + + def test_set_name_methods(idx, index_names): # so long as these are synonyms, we don't need to test set_names assert idx.rename == idx.set_names diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 6b27682ed5674..57747f8274d85 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.errors import InvalidIndexError +from pandas.errors import InvalidIndexError, PerformanceWarning import pandas as pd from pandas import Categorical, Index, MultiIndex, date_range @@ -461,10 +461,10 @@ def test_getitem_group_select(idx): assert sorted_idx.get_loc("foo") == slice(0, 2) -@pytest.mark.parametrize("ind1", [[True] * 5, pd.Index([True] * 5)]) +@pytest.mark.parametrize("ind1", [[True] * 5, Index([True] * 5)]) @pytest.mark.parametrize( "ind2", - [[True, False, True, False, False], pd.Index([True, False, True, False, False])], + [[True, False, True, False, False], Index([True, False, True, False, False])], ) def test_getitem_bool_index_all(ind1, ind2): # GH#22533 @@ -475,8 +475,8 @@ def test_getitem_bool_index_all(ind1, ind2): tm.assert_index_equal(idx[ind2], expected) -@pytest.mark.parametrize("ind1", [[True], pd.Index([True])]) -@pytest.mark.parametrize("ind2", [[False], pd.Index([False])]) +@pytest.mark.parametrize("ind1", [[True], Index([True])]) +@pytest.mark.parametrize("ind2", [[False], Index([False])]) def test_getitem_bool_index_single(ind1, ind2): # GH#22533 idx = MultiIndex.from_tuples([(10, 1)]) @@ -646,6 +646,29 @@ def test_get_loc_duplicates2(self): assert index.get_loc("D") == slice(0, 3) + def test_get_loc_past_lexsort_depth(self): + # GH#30053 + idx = MultiIndex( + levels=[["a"], [0, 7], [1]], + codes=[[0, 0], [1, 0], [0, 0]], + names=["x", "y", "z"], + sortorder=0, + ) + key = ("a", 7) + + with tm.assert_produces_warning(PerformanceWarning): + # PerformanceWarning: indexing past lexsort depth may impact performance + result = idx.get_loc(key) + + assert result == slice(0, 1, None) + + def test_multiindex_get_loc_list_raises(self): + # GH#35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = "unhashable type" + with pytest.raises(TypeError, match=msg): + idx.get_loc([]) + class TestWhere: def test_where(self): diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 562d07d283293..6b6b9346fe1fe 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -52,7 +52,7 @@ def test_join_self(idx, join_type): def test_join_multi(): # GH 10665 midx = pd.MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"]) - idx = pd.Index([1, 2, 5], name="b") + idx = Index([1, 2, 5], name="b") # inner jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True) diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index ca1cb0932f63d..8659573d8123a 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas import Index, MultiIndex @@ -174,3 +175,14 @@ def test_is_strictly_monotonic_decreasing(): ) assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is False + + +@pytest.mark.parametrize("attr", 
["is_monotonic_increasing", "is_monotonic_decreasing"]) +@pytest.mark.parametrize( + "values", + [[(np.nan,), (1,), (2,)], [(1,), (np.nan,), (2,)], [(1,), (2,), (np.nan,)]], +) +def test_is_monotonic_with_nans(values, attr): + # GH: 37220 + idx = pd.MultiIndex.from_tuples(values, names=["test"]) + assert getattr(idx, attr) is False diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index a1e5cc33ef2f6..3d7e6e9c32248 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -98,7 +98,7 @@ def test_unsortedindex(): [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], names=["one", "two"], ) - df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) + df = DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) # GH 16734: not sorted, but no real slicing result = df.loc(axis=0)["z", "a"] diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index f85f37e4127c3..678967db72a0b 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -5,7 +5,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype -import pandas as pd from pandas import ( Index, NaT, @@ -195,7 +194,7 @@ def test_constructor_datetime64arr_ok(self, box): if box is None: data = data._values elif box == "series": - data = pd.Series(data) + data = Series(data) result = PeriodIndex(data, freq="D") expected = PeriodIndex( @@ -362,7 +361,7 @@ def test_constructor_nat(self): period_range(start="2011-01-01", end="NaT", freq="M") def test_constructor_year_and_quarter(self): - year = pd.Series([2001, 2002, 2003]) + year = Series([2001, 2002, 2003]) quarter = year - 2000 idx = PeriodIndex(year=year, quarter=quarter) strs = [f"{t[0]:d}Q{t[1]:d}" for t in zip(quarter, year)] diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 85a01f1c5278c..b6d3c36f1682c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -143,10 +143,10 @@ def test_getitem_nat(self): assert idx[0] == Period("2011-01", freq="M") assert idx[1] is NaT - s = pd.Series([0, 1, 2], index=idx) + s = Series([0, 1, 2], index=idx) assert s[NaT] == 1 - s = pd.Series(idx, index=idx) + s = Series(idx, index=idx) assert s[Period("2011-01", freq="M")] == Period("2011-01", freq="M") assert s[NaT] is NaT @@ -366,7 +366,7 @@ def test_get_loc_invalid_string_raises_keyerror(self): with pytest.raises(KeyError, match="A"): pi.get_loc("A") - ser = pd.Series([1, 2, 3], index=pi) + ser = Series([1, 2, 3], index=pi) with pytest.raises(KeyError, match="A"): ser.loc["A"] @@ -687,7 +687,7 @@ def test_get_value(self): p2 = Period("2017-09-03") idx0 = PeriodIndex([p0, p1, p2]) - input0 = pd.Series(np.array([1, 2, 3]), index=idx0) + input0 = Series(np.array([1, 2, 3]), index=idx0) expected0 = 2 with tm.assert_produces_warning(FutureWarning): @@ -695,7 +695,7 @@ def test_get_value(self): assert result0 == expected0 idx1 = PeriodIndex([p1, p1, p2]) - input1 = pd.Series(np.array([1, 2, 3]), index=idx1) + input1 = Series(np.array([1, 2, 3]), index=idx1) expected1 = input1.iloc[[0, 1]] with tm.assert_produces_warning(FutureWarning): @@ -703,7 +703,7 @@ def test_get_value(self): tm.assert_series_equal(result1, expected1) idx2 = PeriodIndex([p1, p2, p1]) - input2 = pd.Series(np.array([1, 2, 3]), index=idx2) + 
input2 = Series(np.array([1, 2, 3]), index=idx2) expected2 = input2.iloc[[0, 2]] with tm.assert_produces_warning(FutureWarning): @@ -713,7 +713,7 @@ def test_get_value(self): def test_loc_str(self): # https://github.com/pandas-dev/pandas/issues/33964 index = pd.period_range(start="2000", periods=20, freq="B") - series = pd.Series(range(20), index=index) + series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 @pytest.mark.parametrize("freq", ["H", "D"]) @@ -721,7 +721,7 @@ def test_get_value_datetime_hourly(self, freq): # get_loc and get_value should treat datetime objects symmetrically dti = date_range("2016-01-01", periods=3, freq="MS") pi = dti.to_period(freq) - ser = pd.Series(range(7, 10), index=pi) + ser = Series(range(7, 10), index=pi) ts = dti[0] @@ -753,14 +753,14 @@ def test_get_value_integer(self): msg = "index 16801 is out of bounds for axis 0 with size 3" dti = date_range("2016-01-01", periods=3) pi = dti.to_period("D") - ser = pd.Series(range(3), index=pi) + ser = Series(range(3), index=pi) with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning): pi.get_value(ser, 16801) msg = "index 46 is out of bounds for axis 0 with size 3" pi2 = dti.to_period("Y") # duplicates, ordinals are all 46 - ser2 = pd.Series(range(3), index=pi2) + ser2 = Series(range(3), index=pi2) with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning): pi2.get_value(ser2, 46) @@ -776,7 +776,7 @@ def test_contains(self): ps0 = [p0, p1, p2] idx0 = PeriodIndex(ps0) - ser = pd.Series(range(6, 9), index=idx0) + ser = Series(range(6, 9), index=idx0) for p in ps0: assert p in idx0 diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index d1b34c315b682..74ca6ec59736b 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -297,7 +297,7 @@ def test_equals(self, freq): assert idx.astype(object).equals(idx) assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) - assert not idx.equals(pd.Series(idx)) + assert not idx.equals(Series(idx)) idx2 = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") assert not idx.equals(idx2) @@ -305,7 +305,7 @@ def test_equals(self, freq): assert not idx.equals(idx2.astype(object)) assert not idx.astype(object).equals(idx2) assert not idx.equals(list(idx2)) - assert not idx.equals(pd.Series(idx2)) + assert not idx.equals(Series(idx2)) # same internal, different tz idx3 = pd.PeriodIndex._simple_new( @@ -317,7 +317,7 @@ def test_equals(self, freq): assert not idx.equals(idx3.astype(object)) assert not idx.astype(object).equals(idx3) assert not idx.equals(list(idx3)) - assert not idx.equals(pd.Series(idx3)) + assert not idx.equals(Series(idx3)) def test_freq_setter_deprecated(self): # GH 20678 diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 085d41aaa5b76..f4773e885829e 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -249,14 +249,16 @@ def _check_all_fields(self, periodindex): "weekofyear", "week", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "qyear", "days_in_month", ] periods = list(periodindex) - s = pd.Series(periodindex) + s = Series(periodindex) for field in fields: field_idx = getattr(periodindex, field) @@ -307,9 +309,9 @@ def test_period_reindex_with_object( period_index = PeriodIndex(p_values) object_index = 
Index(o_values) - s = pd.Series(values, index=period_index) + s = Series(values, index=period_index) result = s.reindex(object_index) - expected = pd.Series(expected_values, index=object_index) + expected = Series(expected_values, index=object_index) tm.assert_series_equal(result, expected) def test_is_(self): diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py index f2950b9f6065c..6ffdbbfcd2ce6 100644 --- a/pandas/tests/indexes/period/test_searchsorted.py +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -62,7 +62,7 @@ def test_searchsorted_invalid(self): msg = "|".join( [ "searchsorted requires compatible dtype or scalar", - "Unexpected type for 'value'", + "value should be a 'Period', 'NaT', or array of those. Got", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 899c8cbc0425d..cd3a0e7b2241c 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -105,7 +105,7 @@ def test_insert(self): tm.assert_index_equal(result, expected) result = RangeIndex(5).insert(1, pd.NaT) - expected = pd.Index([0, pd.NaT, 1, 2, 3, 4], dtype=object) + expected = Index([0, pd.NaT, 1, 2, 3, 4], dtype=object) tm.assert_index_equal(result, expected) def test_delete(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8db1bcc84bfa6..9820b39e20651 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -14,8 +14,6 @@ from pandas.compat.numpy import np_datetime64_compat from pandas.util._test_decorators import async_mark -from pandas.core.dtypes.generic import ABCIndex - import pandas as pd from pandas import ( CategoricalIndex, @@ -108,9 +106,9 @@ def test_constructor_copy(self, index): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - result = pd.Index(index.astype(object)) + result = Index(index.astype(object)) else: - result = pd.Index(index) + result = Index(index) tm.assert_index_equal(result, index) @@ -121,7 +119,7 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): # incorrectly raise ValueError, and that nanoseconds are not # dropped index += pd.Timedelta(nanoseconds=50) - result = pd.Index(index, dtype=object) + result = Index(index, dtype=object) assert result.dtype == np.object_ assert list(result) == list(index) @@ -137,7 +135,7 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): ], ) def test_constructor_from_series_dtlike(self, index, has_tz): - result = pd.Index(pd.Series(index)) + result = Index(Series(index)) tm.assert_index_equal(result, index) if has_tz: @@ -168,7 +166,7 @@ def test_constructor_from_frame_series_freq(self): expected.name = "date" tm.assert_index_equal(result, expected) - expected = pd.Series(dts, name="date") + expected = Series(dts, name="date") tm.assert_series_equal(df["date"], expected) # GH 6274 @@ -195,8 +193,8 @@ def __init__(self, array): def __array__(self, dtype=None) -> np.ndarray: return self.array - expected = pd.Index(array) - result = pd.Index(ArrayLike(array)) + expected = Index(array) + result = Index(ArrayLike(array)) tm.assert_index_equal(result, expected) def test_constructor_int_dtype_nan(self): @@ -215,9 +213,9 @@ def test_constructor_int_dtype_nan_raises(self, dtype): Index(data, dtype=dtype) def test_constructor_no_pandas_array(self): - ser = pd.Series([1, 2, 3]) - result = pd.Index(ser.array) - 
expected = pd.Index([1, 2, 3]) + ser = Series([1, 2, 3]) + result = Index(ser.array) + expected = Index([1, 2, 3]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -338,7 +336,7 @@ def test_constructor_dtypes_to_timedelta(self, cast_index, vals): assert isinstance(index, TimedeltaIndex) @pytest.mark.parametrize("attr", ["values", "asi8"]) - @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) + @pytest.mark.parametrize("klass", [Index, pd.DatetimeIndex]) def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, klass): # Test constructing with a datetimetz dtype # .values produces numpy datetimes, so these are considered naive @@ -375,7 +373,7 @@ def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, klass): tm.assert_index_equal(result, index) @pytest.mark.parametrize("attr", ["values", "asi8"]) - @pytest.mark.parametrize("klass", [pd.Index, pd.TimedeltaIndex]) + @pytest.mark.parametrize("klass", [Index, pd.TimedeltaIndex]) def test_constructor_dtypes_timedelta(self, attr, klass): index = pd.timedelta_range("1 days", periods=5) index = index._with_freq(None) # wont be preserved by constructors @@ -706,8 +704,8 @@ def test_intersect_str_dates(self, sort): @pytest.mark.xfail(reason="Not implemented") def test_intersection_equal_sort_true(self): # TODO decide on True behaviour - idx = pd.Index(["c", "a", "b"]) - sorted_ = pd.Index(["a", "b", "c"]) + idx = Index(["c", "a", "b"]) + sorted_ = Index(["a", "b", "c"]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) def test_chained_union(self, sort): @@ -741,7 +739,7 @@ def test_union(self, index, sort): def test_union_sort_other_special(self, slice_): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.Index([1, 0, 2]) + idx = Index([1, 0, 2]) # default, sort=None other = idx[slice_] tm.assert_index_equal(idx.union(other), idx) @@ -755,12 +753,12 @@ def test_union_sort_other_special(self, slice_): def test_union_sort_special_true(self, slice_): # TODO decide on True behaviour # sort=True - idx = pd.Index([1, 0, 2]) + idx = Index([1, 0, 2]) # default, sort=None other = idx[slice_] result = idx.union(other, sort=True) - expected = pd.Index([0, 1, 2]) + expected = Index([0, 1, 2]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [np.array, Series, list]) @@ -879,7 +877,7 @@ def test_map_tseries_indices_accsr_return_index(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index), + lambda values, index: Series(values, index), ], ) def test_map_dictlike_simple(self, mapper): @@ -893,7 +891,7 @@ def test_map_dictlike_simple(self, mapper): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index), + lambda values, index: Series(values, index), ], ) def test_map_dictlike(self, index, mapper): @@ -1016,13 +1014,13 @@ def test_symmetric_difference(self, sort): @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) def test_difference_incomparable(self, opname): - a = pd.Index([3, pd.Timestamp("2000"), 1]) - b = pd.Index([2, pd.Timestamp("1999"), 1]) + a = Index([3, pd.Timestamp("2000"), 1]) + b = Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) # sort=None, the default result = op(a) - expected = pd.Index([3, pd.Timestamp("2000"), 2, pd.Timestamp("1999")]) + expected = Index([3, pd.Timestamp("2000"), 2, pd.Timestamp("1999")]) if opname == "difference": expected = 
expected[:2] tm.assert_index_equal(result, expected) @@ -1037,8 +1035,8 @@ def test_difference_incomparable(self, opname): def test_difference_incomparable_true(self, opname): # TODO decide on True behaviour # # sort=True, raises - a = pd.Index([3, pd.Timestamp("2000"), 1]) - b = pd.Index([2, pd.Timestamp("1999"), 1]) + a = Index([3, pd.Timestamp("2000"), 1]) + b = Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b, sort=True) with pytest.raises(TypeError, match="Cannot compare"): @@ -1337,13 +1335,13 @@ def test_get_indexer_nearest_decreasing(self, method, expected): ], ) def test_get_indexer_strings(self, method, expected): - index = pd.Index(["b", "c"]) + index = Index(["b", "c"]) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) def test_get_indexer_strings_raises(self): - index = pd.Index(["b", "c"]) + index = Index(["b", "c"]) msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" with pytest.raises(TypeError, match=msg): @@ -1375,7 +1373,7 @@ def test_get_indexer_with_NA_values( if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values are not unique arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object) - index = pd.Index(arr, dtype=object) + index = Index(arr, dtype=object) result = index.get_indexer( [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"] ) @@ -1384,7 +1382,7 @@ def test_get_indexer_with_NA_values( @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc(self, method): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) assert index.get_loc(1, method=method) == 1 if method: @@ -1392,7 +1390,7 @@ def test_get_loc(self, method): @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc_raises_bad_label(self, method): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) if method: msg = "not supported between" else: @@ -1405,38 +1403,38 @@ def test_get_loc_raises_bad_label(self, method): "method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)] ) def test_get_loc_tolerance(self, method, loc): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) assert index.get_loc(1.1, method) == loc assert index.get_loc(1.1, method, tolerance=1) == loc @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) def test_get_loc_outside_tolerance_raises(self, method): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) with pytest.raises(KeyError, match="1.1"): index.get_loc(1.1, method, tolerance=0.05) def test_get_loc_bad_tolerance_raises(self): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) with pytest.raises(ValueError, match="must be numeric"): index.get_loc(1.1, "nearest", tolerance="invalid") def test_get_loc_tolerance_no_method_raises(self): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) with pytest.raises(ValueError, match="tolerance .* valid if"): index.get_loc(1.1, tolerance=1) def test_get_loc_raises_missized_tolerance(self): - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) with pytest.raises(ValueError, match="tolerance size must match"): index.get_loc(1.1, "nearest", tolerance=[1, 1]) def test_get_loc_raises_object_nearest(self): - index = pd.Index(["a", "c"]) + index = Index(["a", "c"]) with pytest.raises(TypeError, match="unsupported operand type"): index.get_loc("a", method="nearest") def test_get_loc_raises_object_tolerance(self): - index = pd.Index(["a", "c"]) + index = Index(["a", "c"]) with pytest.raises(TypeError, 
match="unsupported operand type"): index.get_loc("a", method="pad", tolerance="invalid") @@ -1535,7 +1533,7 @@ def test_slice_locs_negative_step(self, in_slice, expected): s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = pd.Index(list(expected)) + expected = Index(list(expected)) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @@ -1600,8 +1598,8 @@ def test_drop_by_numeric_label_errors_ignore(self, key, expected): @pytest.mark.parametrize("to_drop", [[("c", "d"), "a"], ["a", ("c", "d")]]) def test_drop_tuple(self, values, to_drop): # GH 18304 - index = pd.Index(values) - expected = pd.Index(["b"]) + index = Index(values) + expected = Index(["b"]) result = index.drop(to_drop) tm.assert_index_equal(result, expected) @@ -1916,8 +1914,8 @@ def test_tab_completion(self, index, expected): def test_indexing_doesnt_change_class(self): index = Index([1, 2, 3, "a", "b", "c"]) - assert index[1:3].identical(pd.Index([2, 3], dtype=np.object_)) - assert index[[0, 1]].identical(pd.Index([1, 2], dtype=np.object_)) + assert index[1:3].identical(Index([2, 3], dtype=np.object_)) + assert index[[0, 1]].identical(Index([1, 2], dtype=np.object_)) def test_outer_join_sort(self): left_index = Index(np.random.permutation(15)) @@ -1941,23 +1939,23 @@ def test_nan_first_take_datetime(self): def test_take_fill_value(self): # GH 12631 - index = pd.Index(list("ABC"), name="xxx") + index = Index(list("ABC"), name="xxx") result = index.take(np.array([1, 0, -1])) - expected = pd.Index(list("BAC"), name="xxx") + expected = Index(list("BAC"), name="xxx") tm.assert_index_equal(result, expected) # fill_value result = index.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Index(["B", "A", np.nan], name="xxx") + expected = Index(["B", "A", np.nan], name="xxx") tm.assert_index_equal(result, expected) # allow_fill=False result = index.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.Index(["B", "A", "C"], name="xxx") + expected = Index(["B", "A", "C"], name="xxx") tm.assert_index_equal(result, expected) def test_take_fill_value_none_raises(self): - index = pd.Index(list("ABC"), name="xxx") + index = Index(list("ABC"), name="xxx") msg = ( "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" @@ -1969,7 +1967,7 @@ def test_take_fill_value_none_raises(self): index.take(np.array([1, 0, -5]), fill_value=True) def test_take_bad_bounds_raises(self): - index = pd.Index(list("ABC"), name="xxx") + index = Index(list("ABC"), name="xxx") with pytest.raises(IndexError, match="out of bounds"): index.take(np.array([1, -5])) @@ -1990,14 +1988,14 @@ def test_take_bad_bounds_raises(self): ) def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, labels): # GH6552 - index = pd.Index([0, 1, 2]) + index = Index([0, 1, 2]) index.name = name assert index.reindex(labels)[0].name == name @pytest.mark.parametrize("labels", [[], np.array([]), np.array([], dtype=np.int64)]) def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): # GH7774 - index = pd.Index(list("abc")) + index = Index(list("abc")) assert index.reindex(labels)[0].dtype.type == np.object_ @pytest.mark.parametrize( @@ -2010,11 +2008,11 @@ def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): ) def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, labels, dtype): # GH7774 - index = 
pd.Index(list("abc")) + index = Index(list("abc")) assert index.reindex(labels)[0].dtype.type == dtype def test_reindex_no_type_preserve_target_empty_mi(self): - index = pd.Index(list("abc")) + index = Index(list("abc")) result = index.reindex( pd.MultiIndex([pd.Int64Index([]), pd.Float64Index([])], [[], []]) )[0] @@ -2024,7 +2022,7 @@ def test_reindex_no_type_preserve_target_empty_mi(self): def test_groupby(self): index = Index(range(5)) result = index.groupby(np.array([1, 1, 2, 2, 2])) - expected = {1: pd.Index([0, 1]), 2: pd.Index([2, 3, 4])} + expected = {1: Index([0, 1]), 2: Index([2, 3, 4])} tm.assert_dict_equal(result, expected) @@ -2074,7 +2072,7 @@ def test_equals_op_index_vs_mi_same_length(self): @pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) def test_dt_conversion_preserves_name(self, dt_conv): # GH 10875 - index = pd.Index(["01:02:03", "01:02:04"], name="label") + index = Index(["01:02:03", "01:02:04"], name="label") assert index.name == dt_conv(index).name @pytest.mark.parametrize( @@ -2083,12 +2081,12 @@ def test_dt_conversion_preserves_name(self, dt_conv): # ASCII # short ( - pd.Index(["a", "bb", "ccc"]), + Index(["a", "bb", "ccc"]), """Index(['a', 'bb', 'ccc'], dtype='object')""", ), # multiple lines ( - pd.Index(["a", "bb", "ccc"] * 10), + Index(["a", "bb", "ccc"] * 10), """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', @@ -2097,7 +2095,7 @@ def test_dt_conversion_preserves_name(self, dt_conv): ), # truncated ( - pd.Index(["a", "bb", "ccc"] * 100), + Index(["a", "bb", "ccc"] * 100), """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... @@ -2107,12 +2105,12 @@ def test_dt_conversion_preserves_name(self, dt_conv): # Non-ASCII # short ( - pd.Index(["あ", "いい", "ううう"]), + Index(["あ", "いい", "ううう"]), """Index(['あ', 'いい', 'ううう'], dtype='object')""", ), # multiple lines ( - pd.Index(["あ", "いい", "ううう"] * 10), + Index(["あ", "いい", "ううう"] * 10), ( "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" @@ -2125,7 +2123,7 @@ def test_dt_conversion_preserves_name(self, dt_conv): ), # truncated ( - pd.Index(["あ", "いい", "ううう"] * 100), + Index(["あ", "いい", "ううう"] * 100), ( "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " "'あ', 'いい', 'ううう', 'あ',\n" @@ -2146,12 +2144,12 @@ def test_string_index_repr(self, index, expected): [ # short ( - pd.Index(["あ", "いい", "ううう"]), + Index(["あ", "いい", "ううう"]), ("Index(['あ', 'いい', 'ううう'], dtype='object')"), ), # multiple lines ( - pd.Index(["あ", "いい", "ううう"] * 10), + Index(["あ", "いい", "ううう"] * 10), ( "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " "'ううう', 'あ', 'いい', 'ううう',\n" @@ -2166,7 +2164,7 @@ def test_string_index_repr(self, index, expected): ), # truncated ( - pd.Index(["あ", "いい", "ううう"] * 100), + Index(["あ", "いい", "ううう"] * 100), ( "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " "'ううう', 'あ', 'いい', 'ううう',\n" @@ -2187,7 +2185,7 @@ def test_string_index_repr_with_unicode_option(self, index, expected): assert result == expected def test_cached_properties_not_settable(self): - index = pd.Index([1, 2, 3]) + index = Index([1, 2, 3]) with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False @@ -2197,7 +2195,7 @@ async def test_tab_complete_warning(self, ip): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter - code = "import pandas as pd; idx = pd.Index([1, 2])" + code = "import pandas as pd; idx = Index([1, 2])" await 
ip.run_code(code) # GH 31324 newer jedi version raises Deprecation warning @@ -2223,7 +2221,7 @@ def test_contains_method_removed(self, index): index.contains(1) def test_sortlevel(self): - index = pd.Index([5, 4, 3, 2, 1]) + index = Index([5, 4, 3, 2, 1]) with pytest.raises(Exception, match="ascending must be a single bool value or"): index.sortlevel(ascending="True") @@ -2235,15 +2233,15 @@ def test_sortlevel(self): with pytest.raises(Exception, match="ascending must be a bool value"): index.sortlevel(ascending=["True"]) - expected = pd.Index([1, 2, 3, 4, 5]) + expected = Index([1, 2, 3, 4, 5]) result = index.sortlevel(ascending=[True]) tm.assert_index_equal(result[0], expected) - expected = pd.Index([1, 2, 3, 4, 5]) + expected = Index([1, 2, 3, 4, 5]) result = index.sortlevel(ascending=True) tm.assert_index_equal(result[0], expected) - expected = pd.Index([5, 4, 3, 2, 1]) + expected = Index([5, 4, 3, 2, 1]) result = index.sortlevel(ascending=False) tm.assert_index_equal(result[0], expected) @@ -2296,7 +2294,7 @@ def test_copy_name(self): def test_copy_name2(self): # Check that adding a "name" parameter to the copy is honored # GH14302 - index = pd.Index([1, 2], name="MyName") + index = Index([1, 2], name="MyName") index1 = index.copy() tm.assert_index_equal(index, index1) @@ -2314,8 +2312,8 @@ def test_copy_name2(self): assert index3.names == ["NewName"] def test_unique_na(self): - idx = pd.Index([2, np.nan, 2, 1], name="my_index") - expected = pd.Index([2, np.nan, 1], name="my_index") + idx = Index([2, np.nan, 2, 1], name="my_index") + expected = Index([2, np.nan, 1], name="my_index") result = idx.unique() tm.assert_index_equal(result, expected) @@ -2338,9 +2336,9 @@ def test_logical_compat(self): ) def test_dropna(self, how, dtype, vals, expected): # GH 6194 - index = pd.Index(vals, dtype=dtype) + index = Index(vals, dtype=dtype) result = index.dropna(how=how) - expected = pd.Index(expected, dtype=dtype) + expected = Index(expected, dtype=dtype) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("how", ["any", "all"]) @@ -2380,7 +2378,7 @@ def test_dropna_dt_like(self, how, index, expected): def test_dropna_invalid_how_raises(self): msg = "invalid how option: xxx" with pytest.raises(ValueError, match=msg): - pd.Index([1, 2, 3]).dropna(how="xxx") + Index([1, 2, 3]).dropna(how="xxx") def test_get_combined_index(self): result = _get_combined_index([]) @@ -2390,10 +2388,10 @@ def test_get_combined_index(self): @pytest.mark.parametrize( "index", [ - pd.Index([np.nan]), - pd.Index([np.nan, 1]), - pd.Index([1, 2, np.nan]), - pd.Index(["a", "b", np.nan]), + Index([np.nan]), + Index([np.nan, 1]), + Index([1, 2, np.nan]), + Index(["a", "b", np.nan]), pd.to_datetime(["NaT"]), pd.to_datetime(["NaT", "2000-01-01"]), pd.to_datetime(["2000-01-01", "NaT", "2000-01-02"]), @@ -2408,7 +2406,7 @@ def test_is_monotonic_na(self, index): def test_repr_summary(self): with cf.option_context("display.max_seq_items", 10): - result = repr(pd.Index(np.arange(1000))) + result = repr(Index(np.arange(1000))) assert len(result) < 200 assert "..." in result @@ -2518,10 +2516,6 @@ def test_ensure_index_mixed_closed_intervals(self): ], ) def test_generated_op_names(opname, index): - if isinstance(index, ABCIndex) and opname == "rsub": - # pd.Index.__rsub__ does not exist; though the method does exist - # for subclasses. 
see GH#19723
-        return
     opname = f"__{opname}__"
     method = getattr(index, opname)
     assert method.__name__ == opname
@@ -2537,7 +2531,7 @@ def test_index_subclass_constructor_wrong_kwargs(index_maker):
 def test_deprecated_fastpath():
     msg = "[Uu]nexpected keyword argument"
     with pytest.raises(TypeError, match=msg):
-        pd.Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True)
+        Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True)
 
     with pytest.raises(TypeError, match=msg):
         pd.Int64Index(np.array([1, 2, 3], dtype="int64"), name="test", fastpath=True)
@@ -2555,7 +2549,7 @@ def test_shape_of_invalid_index():
     # about this). However, as long as this is not solved in general, this test ensures
     # that the returned shape is consistent with this underlying array for
     # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775)
-    idx = pd.Index([0, 1, 2, 3])
+    idx = Index([0, 1, 2, 3])
     with tm.assert_produces_warning(FutureWarning):
         # GH#30588 multi-dimensional indexing deprecated
         assert idx[:, None].shape == (4, 1)
@@ -2567,7 +2561,7 @@ def test_validate_1d_input():
     arr = np.arange(8).reshape(2, 2, 2)
     with pytest.raises(ValueError, match=msg):
-        pd.Index(arr)
+        Index(arr)
 
     with pytest.raises(ValueError, match=msg):
         pd.Float64Index(arr.astype(np.float64))
@@ -2580,11 +2574,11 @@ def test_validate_1d_input():
     df = pd.DataFrame(arr.reshape(4, 2))
     with pytest.raises(ValueError, match=msg):
-        pd.Index(df)
+        Index(df)
 
     # GH#13601 trying to assign a multi-dimensional array to an index is not
     # allowed
-    ser = pd.Series(0, range(4))
+    ser = Series(0, range(4))
     with pytest.raises(ValueError, match=msg):
         ser.index = np.array([[2, 3]] * 4)
@@ -2622,7 +2616,7 @@ def construct(dtype):
         if dtype is dtlike_dtypes[-1]:
             # PeriodArray will try to cast ints to strings
             return pd.DatetimeIndex(vals).astype(dtype)
-        return pd.Index(vals, dtype=dtype)
+        return Index(vals, dtype=dtype)
 
     left = construct(ldtype)
     right = construct(rdtype)
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 94b10572fb5e1..6a681ede8ff42 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -320,6 +320,10 @@ def test_get_unique_index(self, index):
             vals[0] = np.nan
 
             vals_unique = vals[:2]
+            if index.dtype.kind in ["m", "M"]:
+                # i.e. 
needs_i8_conversion but not period_dtype, as above + vals = type(index._data)._simple_new(vals, dtype=index.dtype) + vals_unique = type(index._data)._simple_new(vals_unique, dtype=index.dtype) idx_nan = index._shallow_copy(vals) idx_unique_nan = index._shallow_copy(vals_unique) assert idx_unique_nan.is_unique is True diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 7fa7a571d2571..045816b3c9513 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -521,7 +521,7 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): Index([-1], dtype=uint_dtype) def test_constructor_unwraps_index(self): - idx = pd.Index([1, 2]) + idx = Index([1, 2]) result = pd.Int64Index(idx) expected = np.array([1, 2], dtype="int64") tm.assert_numpy_array_equal(result._data, expected) @@ -613,7 +613,7 @@ def test_intersection(self, index_large): def test_int_float_union_dtype(dtype): # https://github.com/pandas-dev/pandas/issues/26778 # [u]int | float -> float - index = pd.Index([0, 2, 3], dtype=dtype) + index = Index([0, 2, 3], dtype=dtype) other = pd.Float64Index([0.5, 1.5]) expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0]) result = index.union(other) @@ -637,7 +637,7 @@ def test_range_float_union_dtype(): @pytest.mark.parametrize( "box", - [list, lambda x: np.array(x, dtype=object), lambda x: pd.Index(x, dtype=object)], + [list, lambda x: np.array(x, dtype=object), lambda x: Index(x, dtype=object)], ) def test_uint_index_does_not_convert_to_float64(box): # https://github.com/pandas-dev/pandas/issues/28279 @@ -667,8 +667,8 @@ def test_uint_index_does_not_convert_to_float64(box): def test_float64_index_equals(): # https://github.com/pandas-dev/pandas/issues/35217 - float_index = pd.Index([1.0, 2, 3]) - string_index = pd.Index(["1", "2", "3"]) + float_index = Index([1.0, 2, 3]) + string_index = Index(["1", "2", "3"]) result = float_index.equals(string_index) assert result is False @@ -679,8 +679,8 @@ def test_float64_index_equals(): def test_float64_index_difference(): # https://github.com/pandas-dev/pandas/issues/35217 - float_index = pd.Index([1.0, 2, 3]) - string_index = pd.Index(["1", "2", "3"]) + float_index = Index([1.0, 2, 3]) + string_index = Index(["1", "2", "3"]) result = float_index.difference(string_index) tm.assert_index_equal(result, float_index) diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index d9f24b4a35520..d274aa0b06584 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -117,7 +117,7 @@ def test_astype_category(self): def test_astype_array_fallback(self): obj = pd.timedelta_range("1H", periods=2) result = obj.astype(bool) - expected = pd.Index(np.array([True, True])) + expected = Index(np.array([True, True])) tm.assert_index_equal(result, expected) result = obj._data.astype(bool) diff --git a/pandas/tests/indexes/timedeltas/test_insert.py b/pandas/tests/indexes/timedeltas/test_insert.py index 1ebc0a4b1eca0..66fec2310e50c 100644 --- a/pandas/tests/indexes/timedeltas/test_insert.py +++ b/pandas/tests/indexes/timedeltas/test_insert.py @@ -79,7 +79,8 @@ def test_insert_nat(self, null): def test_insert_invalid_na(self): idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") - with pytest.raises(TypeError, match="incompatible label"): + msg = r"value should be a 'Timedelta' or 'NaT'\. Got 'datetime64' instead\." 
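+        # The new message names the expected types ('Timedelta' or 'NaT')
+        # and the offending dtype, replacing the old generic
+        # "incompatible label" wording.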
+ with pytest.raises(TypeError, match=msg): idx.insert(0, np.datetime64("NaT")) @pytest.mark.parametrize( @@ -89,7 +90,7 @@ def test_insert_mismatched_types_raises(self, item): # GH#33703 dont cast these to td64 tdi = TimedeltaIndex(["4day", "1day", "2day"], name="idx") - msg = "incompatible label" + msg = r"value should be a 'Timedelta' or 'NaT'\. Got '.*' instead\." with pytest.raises(TypeError, match=msg): tdi.insert(1, item) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 3e452e7e2841d..c4429137d17f0 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -72,7 +72,7 @@ def test_nonunique_contains(self): def test_unknown_attribute(self): # see gh-9680 tdi = pd.timedelta_range(start=0, periods=10, freq="1s") - ts = pd.Series(np.random.normal(size=10), index=tdi) + ts = Series(np.random.normal(size=10), index=tdi) assert "foo" not in ts.__dict__.keys() msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): @@ -237,7 +237,7 @@ def test_equals(self): assert idx.astype(object).equals(idx) assert idx.astype(object).equals(idx.astype(object)) assert not idx.equals(list(idx)) - assert not idx.equals(pd.Series(idx)) + assert not idx.equals(Series(idx)) idx2 = pd.TimedeltaIndex(["2 days", "1 days", "NaT"]) assert not idx.equals(idx2) @@ -246,7 +246,7 @@ def test_equals(self): assert not idx.astype(object).equals(idx2) assert not idx.astype(object).equals(idx2.astype(object)) assert not idx.equals(list(idx2)) - assert not idx.equals(pd.Series(idx2)) + assert not idx.equals(Series(idx2)) # Check that we dont raise OverflowError on comparisons outside the # implementation range diff --git a/pandas/tests/indexes/timedeltas/test_searchsorted.py b/pandas/tests/indexes/timedeltas/test_searchsorted.py index 3cf45931cf6b7..e3b52058469f0 100644 --- a/pandas/tests/indexes/timedeltas/test_searchsorted.py +++ b/pandas/tests/indexes/timedeltas/test_searchsorted.py @@ -21,6 +21,6 @@ def test_searchsorted_different_argument_classes(self, klass): ) def test_searchsorted_invalid_argument_dtype(self, arg): idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) - msg = "searchsorted requires compatible dtype" + msg = "value should be a 'Timedelta', 'NaT', or array of those. 
Got" with pytest.raises(TypeError, match=msg): idx.searchsorted(arg) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 8976e87a1b75a..df59d09edd3ef 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -131,9 +131,9 @@ def test_mi_intervalindex_slicing_with_scalar(self): ) idx.names = ["Item", "RID", "MP"] - df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8]}) + df = DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8]}) df.index = idx - query_df = pd.DataFrame( + query_df = DataFrame( { "Item": ["FC", "OWNER", "FC", "OWNER", "OWNER"], "RID": ["RID1", "RID1", "RID1", "RID2", "RID2"], @@ -146,5 +146,5 @@ def test_mi_intervalindex_slicing_with_scalar(self): idx = pd.MultiIndex.from_arrays([query_df.Item, query_df.RID, query_df.MP]) query_df.index = idx result = df.value.loc[query_df.index] - expected = pd.Series([1, 6, 2, 8, 7], index=idx, name="value") + expected = Series([1, 6, 2, 8, 7], index=idx, name="value") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py deleted file mode 100644 index c69d6f86a6ce6..0000000000000 --- a/pandas/tests/indexing/multiindex/conftest.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, Index, MultiIndex - - -@pytest.fixture -def multiindex_dataframe_random_data(): - """DataFrame with 2 level MultiIndex with random data""" - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - return DataFrame( - np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") - ) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index d8e56661b7d61..efe1e0f0d75b5 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -7,17 +7,46 @@ from pandas import DataFrame, Series import pandas._testing as tm +m = 50 +n = 1000 +cols = ["jim", "joe", "jolie", "joline", "jolia"] + +vals = [ + np.random.randint(0, 10, n), + np.random.choice(list("abcdefghij"), n), + np.random.choice(pd.date_range("20141009", periods=10).tolist(), n), + np.random.choice(list("ZYXWVUTSRQ"), n), + np.random.randn(n), +] +vals = list(map(tuple, zip(*vals))) + +# bunch of keys for testing +keys = [ + np.random.randint(0, 11, m), + np.random.choice(list("abcdefghijk"), m), + np.random.choice(pd.date_range("20141009", periods=11).tolist(), m), + np.random.choice(list("ZYXWVUTSRQP"), m), +] +keys = list(map(tuple, zip(*keys))) +keys += list(map(lambda t: t[:-1], vals[:: n // m])) + + +# covers both unique index and non-unique index +df = DataFrame(vals, columns=cols) +a = pd.concat([df, df]) +b = df.drop_duplicates(subset=cols[:-1]) + -@pytest.mark.slow @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") -def test_multiindex_get_loc(): # GH7724, GH2646 +@pytest.mark.parametrize("lexsort_depth", list(range(5))) +@pytest.mark.parametrize("key", keys) +@pytest.mark.parametrize("frame", [a, b]) +def test_multiindex_get_loc(lexsort_depth, key, frame): + # GH7724, GH2646 with warnings.catch_warnings(record=True): # test indexing into a multi-index before & past the lexsort depth - from numpy.random import choice, randint, randn - - 
cols = ["jim", "joe", "jolie", "joline", "jolia"] def validate(mi, df, key): mask = np.ones(len(df)).astype("bool") @@ -51,38 +80,11 @@ def validate(mi, df, key): else: # multi hit tm.assert_frame_equal(mi.loc[key[: i + 1]], right) - def loop(mi, df, keys): - for key in keys: - validate(mi, df, key) - - n, m = 1000, 50 - - vals = [ - randint(0, 10, n), - choice(list("abcdefghij"), n), - choice(pd.date_range("20141009", periods=10).tolist(), n), - choice(list("ZYXWVUTSRQ"), n), - randn(n), - ] - vals = list(map(tuple, zip(*vals))) - - # bunch of keys for testing - keys = [ - randint(0, 11, m), - choice(list("abcdefghijk"), m), - choice(pd.date_range("20141009", periods=11).tolist(), m), - choice(list("ZYXWVUTSRQP"), m), - ] - keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[:: n // m])) - - # covers both unique index and non-unique index - df = DataFrame(vals, columns=cols) - a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) - - for frame in a, b: - for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) - mi = df.set_index(cols[:-1]) - assert not mi.index.lexsort_depth < i - loop(mi, df, keys) + if lexsort_depth == 0: + df = frame.copy() + else: + df = frame.sort_values(by=cols[:lexsort_depth]) + + mi = df.set_index(cols[:-1]) + assert not mi.index.lexsort_depth < lexsort_depth + validate(mi, df, key) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 1b659bec0e9e8..2cb5b55f14596 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -448,7 +448,7 @@ def test_loc_period_string_indexing(): a = pd.period_range("2013Q1", "2013Q4", freq="Q") i = (1111, 2222, 3333) idx = pd.MultiIndex.from_product((a, i), names=("Periode", "CVR")) - df = pd.DataFrame( + df = DataFrame( index=idx, columns=( "OMS", @@ -463,7 +463,7 @@ def test_loc_period_string_indexing(): ), ) result = df.loc[("2013Q1", 1111), "OMS"] - expected = pd.Series( + expected = Series( [np.nan], dtype=object, name="OMS", @@ -478,11 +478,11 @@ def test_loc_datetime_mask_slicing(): # GH 16699 dt_idx = pd.to_datetime(["2017-05-04", "2017-05-05"]) m_idx = pd.MultiIndex.from_product([dt_idx, dt_idx], names=["Idx1", "Idx2"]) - df = pd.DataFrame( + df = DataFrame( data=[[1, 2], [3, 4], [5, 6], [7, 6]], index=m_idx, columns=["C1", "C2"] ) result = df.loc[(dt_idx[0], (df.index.get_level_values(1) > "2017-05-04")), "C1"] - expected = pd.Series( + expected = Series( [3], name="C1", index=MultiIndex.from_tuples( @@ -496,7 +496,7 @@ def test_loc_datetime_mask_slicing(): def test_loc_datetime_series_tuple_slicing(): # https://github.com/pandas-dev/pandas/issues/35858 date = pd.Timestamp("2000") - ser = pd.Series( + ser = Series( 1, index=pd.MultiIndex.from_tuples([("a", date)], names=["a", "b"]), name="c", @@ -522,3 +522,79 @@ def test_loc_with_mi_indexer(): columns=["author", "price"], ) tm.assert_frame_equal(result, expected) + + +def test_loc_mi_with_level1_named_0(): + # GH#37194 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + ser = Series(range(3), index=dti) + df = ser.to_frame() + df[1] = dti + + df2 = df.set_index(0, append=True) + assert df2.index.names == (None, 0) + df2.index.get_loc(dti[0]) # smoke test + + result = df2.loc[dti[0]] + expected = df2.iloc[[0]].droplevel(None) + tm.assert_frame_equal(result, expected) + + ser2 = df2[1] + assert ser2.index.names == (None, 0) + + result = ser2.loc[dti[0]] + 
expected = ser2.iloc[[0]].droplevel(None) + tm.assert_series_equal(result, expected) + + +def test_getitem_str_slice(datapath): + # GH#15928 + path = datapath("reshape", "merge", "data", "quotes2.csv") + df = pd.read_csv(path, parse_dates=["time"]) + df2 = df.set_index(["ticker", "time"]).sort_index() + + res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0) + expected = df2.loc["AAPL"].loc[slice("2016-05-25 13:30:00"), :] + tm.assert_frame_equal(res, expected) + + +def test_3levels_leading_period_index(): + # GH#24091 + pi = pd.PeriodIndex( + ["20181101 1100", "20181101 1200", "20181102 1300", "20181102 1400"], + name="datetime", + freq="B", + ) + lev2 = ["A", "A", "Z", "W"] + lev3 = ["B", "C", "Q", "F"] + mi = pd.MultiIndex.from_arrays([pi, lev2, lev3]) + + ser = Series(range(4), index=mi, dtype=np.float64) + result = ser.loc[(pi[0], "A", "B")] + assert result == 0.0 + + +class TestKeyErrorsWithMultiIndex: + def test_missing_keys_raises_keyerror(self): + # GH#27420 KeyError, not TypeError + df = DataFrame(np.arange(12).reshape(4, 3), columns=["A", "B", "C"]) + df2 = df.set_index(["A", "B"]) + + with pytest.raises(KeyError, match="1"): + df2.loc[(1, 6)] + + def test_missing_key_raises_keyerror2(self): + # GH#21168 KeyError, not "IndexingError: Too many indexers" + ser = Series(-1, index=pd.MultiIndex.from_product([[0, 1]] * 2)) + + with pytest.raises(KeyError, match=r"\(0, 3\)"): + ser.loc[0, 3] + + +def test_getitem_loc_commutability(multiindex_year_month_day_dataframe_random_data): + df = multiindex_year_month_day_dataframe_random_data + ser = df["A"] + result = ser[2000, 5] + expected = df.loc[2000, 5]["A"] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 4565d79c632de..a3b8d66c92024 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,5 +1,4 @@ import numpy as np -import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning @@ -70,12 +69,12 @@ def test_nested_tuples_duplicates(self): # GH#30892 dti = pd.to_datetime(["20190101", "20190101", "20190102"]) - idx = pd.Index(["a", "a", "c"]) + idx = Index(["a", "a", "c"]) mi = pd.MultiIndex.from_arrays([dti, idx], names=["index1", "index2"]) - df = pd.DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi) + df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi) - expected = pd.DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi) + expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi) df2 = df.copy(deep=True) df2.loc[(dti[0], "a"), "c2"] = 1.0 @@ -84,10 +83,3 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) - - def test_multiindex_get_loc_list_raises(self): - # https://github.com/pandas-dev/pandas/issues/35878 - idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)]) - msg = "unhashable type" - with pytest.raises(TypeError, match=msg): - idx.get_loc([]) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 853b92ea91274..b58b81d5aa1b3 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -429,9 +429,9 @@ def test_setitem_nonmonotonic(self): index = pd.MultiIndex.from_tuples( [("a", "c"), ("b", "x"), ("a", 
"d")], names=["l1", "l2"] ) - df = pd.DataFrame(data=[0, 1, 2], index=index, columns=["e"]) + df = DataFrame(data=[0, 1, 2], index=index, columns=["e"]) df.loc["a", "e"] = np.arange(99, 101, dtype="int64") - expected = pd.DataFrame({"e": [99, 1, 100]}, index=index) + expected = DataFrame({"e": [99, 1, 100]}, index=index) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index ec0391a2ccc26..024cc3ad72688 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -496,7 +496,7 @@ def test_loc_axis_arguments(self): def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): # GH29519 - df = pd.DataFrame( + df = DataFrame( np.arange(27).reshape(3, 9), columns=pd.MultiIndex.from_product( [["a1", "a2", "a3"], ["b1", "b2", "b3"]] @@ -510,7 +510,7 @@ def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): # GH29519 - df = pd.DataFrame( + df = DataFrame( np.arange(27).reshape(3, 9), columns=pd.MultiIndex.from_product( [["a1", "a2", "a3"], ["b1", "b2", "b3"]] @@ -526,9 +526,9 @@ def test_loc_ax_single_level_indexer_simple_df(self): # GH29519 # test single level indexing on single index column data frame - df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + df = DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) result = df.loc(axis=1)["a"] - expected = pd.Series(np.array([0, 3, 6]), name="a") + expected = Series(np.array([0, 3, 6]), name="a") tm.assert_series_equal(result, expected) def test_per_axis_per_level_setitem(self): @@ -736,11 +736,11 @@ def test_non_reducing_slice_on_multiindex(self): ("b", "c"): [3, 2], ("b", "d"): [4, 1], } - df = pd.DataFrame(dic, index=[0, 1]) + df = DataFrame(dic, index=[0, 1]) idx = pd.IndexSlice slice_ = idx[:, idx["b", "d"]] tslice_ = non_reducing_slice(slice_) result = df.loc[tslice_] - expected = pd.DataFrame({("b", "d"): [4, 1]}) + expected = DataFrame({("b", "d"): [4, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py deleted file mode 100644 index 91be1d913001b..0000000000000 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ /dev/null @@ -1,280 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat, date_range -import pandas._testing as tm -import pandas.core.common as com - - -@pytest.fixture -def four_level_index_dataframe(): - arr = np.array( - [ - [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], - ] - ) - index = MultiIndex( - levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], - codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], - names=["one", "two", "three", "four"], - ) - return DataFrame(arr, index=index, columns=list("ABCDE")) - - -@pytest.mark.parametrize( - "key, level, exp_arr, exp_index", - [ - ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), - ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), - ], -) -def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): - # see gh-2903 - arr = np.random.randn(4, 4) - index = MultiIndex( - levels=[["a", "b"], ["bar", "foo", "hello", "world"]], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]], - names=["lvl0", 
"lvl1"], - ) - df = DataFrame(arr, columns=index) - result = df.xs(key, level=level, axis=1) - expected = DataFrame(exp_arr(arr), columns=exp_index) - tm.assert_frame_equal(result, expected) - - -def test_xs_values(multiindex_dataframe_random_data): - df = multiindex_dataframe_random_data - result = df.xs(("bar", "two")).values - expected = df.values[4] - tm.assert_almost_equal(result, expected) - - -def test_xs_loc_equality(multiindex_dataframe_random_data): - df = multiindex_dataframe_random_data - result = df.xs(("bar", "two")) - expected = df.loc[("bar", "two")] - tm.assert_series_equal(result, expected) - - -def test_xs_missing_values_in_index(): - # see gh-6574 - # missing values in returned index should be preserved - acc = [ - ("a", "abcde", 1), - ("b", "bbcde", 2), - ("y", "yzcde", 25), - ("z", "xbcde", 24), - ("z", None, 26), - ("z", "zbcde", 25), - ("z", "ybcde", 26), - ] - df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) - expected = DataFrame( - {"cnt": [24, 26, 25, 26]}, - index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), - ) - - result = df.xs("z", level="a1") - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) -def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): - # see gh-13719 - frame = multiindex_dataframe_random_data - df = concat([frame] * 2) - assert df.index.is_unique is False - expected = concat([frame.xs("one", level="second")] * 2) - - result = df.xs(key, level=level) - tm.assert_frame_equal(result, expected) - - -def test_xs_level(multiindex_dataframe_random_data): - df = multiindex_dataframe_random_data - result = df.xs("two", level="second") - expected = df[df.index.get_level_values(1) == "two"] - expected.index = Index(["foo", "bar", "baz", "qux"], name="first") - tm.assert_frame_equal(result, expected) - - -def test_xs_level_eq_2(): - arr = np.random.randn(3, 5) - index = MultiIndex( - levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], - codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], - ) - df = DataFrame(arr, index=index) - expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) - result = df.xs("c", level=2) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "indexer", - [ - lambda df: df.xs(("a", 4), level=["one", "four"]), - lambda df: df.xs("a").xs(4, level="four"), - ], -) -def test_xs_level_multiple(indexer, four_level_index_dataframe): - df = four_level_index_dataframe - expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] - expected_index = MultiIndex( - levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] - ) - expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) - result = indexer(df) - tm.assert_frame_equal(result, expected) - - -def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): - # this is a copy in 0.14 - df = multiindex_dataframe_random_data - result = df.xs("two", level="second") - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - result[:] = 10 - - -def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): - # this is a copy in 0.14 - df = four_level_index_dataframe - result = df.xs(("a", 4), level=["one", "four"]) - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying 
to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - result[:] = 10 - - -def test_xs_integer_key(): - # see gh-2107 - dates = range(20111201, 20111205) - ids = list("abcde") - index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) - df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) - - result = df.xs(20111201, level="date") - expected = df.loc[20111201, :] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")] -) -def test_xs_level0(indexer, four_level_index_dataframe): - df = four_level_index_dataframe - expected_values = [ - [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - ] - expected_index = MultiIndex( - levels=[["b", "q"], [10.0032, 20.0], [4, 5]], - codes=[[0, 1], [0, 1], [1, 0]], - names=["two", "three", "four"], - ) - expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) - - result = indexer(df) - tm.assert_frame_equal(result, expected) - - -def test_xs_level_series(multiindex_dataframe_random_data): - # this test is not explicitly testing .xs functionality - # TODO: move to another module or refactor - df = multiindex_dataframe_random_data - s = df["A"] - result = s[:, "two"] - expected = df.xs("two", level=1)["A"] - tm.assert_series_equal(result, expected) - - -def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): - # this test is not explicitly testing .xs functionality - # TODO: move to another module or refactor - df = multiindex_year_month_day_dataframe_random_data - s = df["A"] - result = s[2000, 5] - expected = df.loc[2000, 5]["A"] - tm.assert_series_equal(result, expected) - - -def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data, -): - # this test is not explicitly testing .xs functionality - # TODO: move to another module or refactor - # not implementing this for now - df = multiindex_year_month_day_dataframe_random_data - s = df["A"] - - msg = r"\(2000, slice\(3, 4, None\)\)" - with pytest.raises(TypeError, match=msg): - s[2000, 3:4] - - -def test_xs_IndexSlice_argument_not_implemented(): - # GH 35301 - - index = MultiIndex( - levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - ) - - series = Series(np.random.randn(6), index=index) - frame = DataFrame(np.random.randn(6, 4), index=index) - - msg = ( - "Expected label or tuple of labels, got " - r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" - ) - with pytest.raises(TypeError, match=msg): - frame.xs(IndexSlice[("foo", "qux", 0), :]) - with pytest.raises(TypeError, match=msg): - series.xs(IndexSlice[("foo", "qux", 0), :]) - - -def test_series_getitem_multiindex_xs(): - # GH6258 - dt = list(date_range("20130903", periods=3)) - idx = MultiIndex.from_product([list("AB"), dt]) - s = Series([1, 3, 4, 1, 3, 4], index=idx) - expected = Series([1, 1], index=list("AB")) - - result = s.xs("20130903", level=1) - tm.assert_series_equal(result, expected) - - -def test_series_getitem_multiindex_xs_by_label(): - # GH5684 - idx = MultiIndex.from_tuples( - [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] - ) - s = Series([1, 2, 3, 4], index=idx) - return_value = s.index.set_names(["L1", "L2"], inplace=True) - assert return_value is None - expected = Series([1, 3], index=["a", "b"]) - return_value = expected.index.set_names(["L1"], 
inplace=True) - assert return_value is None - - result = s.xs("one", level="L2") - tm.assert_series_equal(result, expected) - - -def test_xs_levels_raises(): - df = DataFrame({"A": [1, 2, 3]}) - - msg = "Index must be a MultiIndex" - with pytest.raises(TypeError, match=msg): - df.xs(0, level="as") - - s = df.A - with pytest.raises(TypeError, match=msg): - s.xs(0, level="as") diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index fae229aecc3d4..854ca176fd2f4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -25,27 +25,15 @@ def setup_method(self, method): self.df = DataFrame( { "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(CDT(list("cab"))), - } - ).set_index("B") + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + ) self.df2 = DataFrame( { "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(CDT(list("cabe"))), - } - ).set_index("B") - self.df3 = DataFrame( - { - "A": np.arange(6, dtype="int64"), - "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=True))), - } - ).set_index("B") - self.df4 = DataFrame( - { - "A": np.arange(6, dtype="int64"), - "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), - } - ).set_index("B") + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + ) def test_loc_scalar(self): result = self.df.loc["a"] @@ -435,32 +423,6 @@ def test_loc_listlike_dtypes(self): with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] - def test_get_indexer_array(self): - arr = np.array( - [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], - dtype=object, - ) - cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] - ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") - result = ci.get_indexer(arr) - expected = np.array([0, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - def test_get_indexer_same_categories_same_order(self): - ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - - result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) - expected = np.array([1, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - def test_get_indexer_same_categories_different_order(self): - # https://github.com/pandas-dev/pandas/issues/19551 - ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - - result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) - expected = np.array([1, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - def test_getitem_with_listlike(self): # GH 16115 cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) @@ -472,22 +434,6 @@ def test_getitem_with_listlike(self): result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) - def test_setitem_listlike(self): - - # GH 9469 - # properly coerce the input indexers - np.random.seed(1) - c = Categorical( - np.random.randint(0, 5, size=150000).astype(np.int8) - ).add_categories([-1000]) - indexer = np.array([100000]).astype(np.int64) - c[indexer] = -1000 - - # we are asserting the code result here - # which maps to the -1000 category - result = c.codes[np.array([100000]).astype(np.int64)] - tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) - def test_ix_categorical_index(self): # GH 12531 df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) 
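The setup_method rewrite above is intended to be behavior-preserving: building
the frame with an explicit CategoricalIndex yields the same object as the
astype(CDT)/set_index("B") chain it replaces. A minimal sketch of the
equivalence, reusing the module's CDT alias (the old/new names here are
illustrative, not part of the diff):

    import numpy as np
    import pandas as pd
    from pandas import CategoricalIndex, DataFrame, Series
    from pandas.core.dtypes.dtypes import CategoricalDtype as CDT

    # old construction: cast a column to categorical, then promote it to the index
    old = DataFrame(
        {
            "A": np.arange(6, dtype="int64"),
            "B": Series(list("aabbca")).astype(CDT(list("cab"))),
        }
    ).set_index("B")

    # new construction: build the CategoricalIndex directly
    new = DataFrame(
        {"A": np.arange(6, dtype="int64")},
        index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"),
    )

    # both frames carry the same values and the same categorical index
    pd.testing.assert_frame_equal(old, new)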
@@ -556,91 +502,6 @@ def test_read_only_source(self): tm.assert_series_equal(rw_df.loc[1], ro_df.loc[1]) tm.assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) - def test_reindexing(self): - df = DataFrame( - { - "A": np.arange(3, dtype="int64"), - "B": Series(list("abc")).astype(CDT(list("cabe"))), - } - ).set_index("B") - - # reindexing - # convert to a regular index - result = df.reindex(["a", "b", "e"]) - expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( - "B" - ) - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b"]) - expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["e"]) - expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["d"]) - expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # since we are actually reindexing with a Categorical - # then return a Categorical - cats = list("cabe") - - result = df.reindex(Categorical(["a", "e"], categories=cats)) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(Categorical(["a"], categories=cats)) - expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b", "e"]) - expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( - "B" - ) - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b"]) - expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["e"]) - expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # give back the type of categorical that we received - result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # passed duplicate indexers are not allowed - msg = "cannot reindex from a duplicate axis" - with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "b"]) - - # args NotImplemented ATM - msg = r"argument {} is not implemented for CategoricalIndex\.reindex" - with pytest.raises(NotImplementedError, match=msg.format("method")): - df.reindex(["a"], method="ffill") - with pytest.raises(NotImplementedError, match=msg.format("level")): - df.reindex(["a"], level=1) - with pytest.raises(NotImplementedError, match=msg.format("limit")): - df.reindex(["a"], limit=2) - def test_loc_slice(self): # GH9748 with pytest.raises(KeyError, match="1"): @@ -661,10 +522,24 @@ def test_loc_and_at_with_categorical_index(self): assert df.loc["B", 1] == 4 assert df.at["B", 
1] == 4 - def test_boolean_selection(self): + def test_getitem_bool_mask_categorical_index(self): - df3 = self.df3 - df4 = self.df4 + df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], dtype=CDT([3, 2, 1], ordered=True), name="B" + ), + ) + df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], dtype=CDT([3, 2, 1], ordered=False), name="B" + ), + ) result = df3[df3.index == "a"] expected = df3.iloc[[]] @@ -725,24 +600,6 @@ def test_indexing_with_category(self): res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) - def test_map_with_dict_or_series(self): - orig_values = ["a", "B", 1, "a"] - new_values = ["one", 2, 3.0, "one"] - cur_index = pd.CategoricalIndex(orig_values, name="XXX") - expected = pd.CategoricalIndex( - new_values, name="XXX", categories=[3.0, 2, "one"] - ) - - mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) - - mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) - @pytest.mark.parametrize( "idx_values", [ @@ -807,31 +664,3 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "categories", - [ - pytest.param(["a", "b", "c"], id="str"), - pytest.param( - [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], - id="pd.Interval", - ), - ], - ) - def test_reorder_index_with_categories(self, categories): - # GH23452 - df = DataFrame( - {"foo": range(len(categories))}, - index=CategoricalIndex( - data=categories, categories=categories, ordered=True - ), - ) - df.index = df.index.reorder_categories(df.index.categories[::-1]) - result = df.sort_index() - expected = DataFrame( - {"foo": reversed(range(len(categories)))}, - index=CategoricalIndex( - data=categories[::-1], categories=categories[::-1], ordered=True - ), - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 1241d394d7936..d162468235767 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -83,7 +83,7 @@ def test_setitem_cache_updating(self): def test_altering_series_clears_parent_cache(self): # GH #33675 - df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) ser = df["A"] assert "A" in df._item_cache @@ -350,14 +350,12 @@ def test_detect_chained_assignment_warnings_errors(self): def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): # xref gh-13017. 
with option_context("chained_assignment", "warn"): - df = pd.DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] - ) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) with tm.assert_produces_warning(com.SettingWithCopyWarning): df.c.loc[df.c > 0] = None - expected = pd.DataFrame( + expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 752ecd47fe089..436b2aa838b08 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -29,11 +29,21 @@ def has_test(combo): klass in x.name and dtype in x.name and method in x.name for x in cls_funcs ) - for combo in combos: - if not has_test(combo): - raise AssertionError(f"test method is not defined: {cls.__name__}, {combo}") + opts = request.config.option + if opts.lf or opts.keyword: + # If we are running with "last-failed" or -k foo, we expect to only + # run a subset of tests. + yield - yield + else: + + for combo in combos: + if not has_test(combo): + raise AssertionError( + f"test method is not defined: {cls.__name__}, {combo}" + ) + + yield class CoercionBase: @@ -124,7 +134,7 @@ def test_setitem_series_int8(self, val, exp_dtype, request): exp = pd.Series([1, 0, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, np.int8) mark = pytest.mark.xfail( - reason="BUG: it must be Series([1, 1, 3, 4], dtype=np.int16" + reason="BUG: it must be pd.Series([1, 1, 3, 4], dtype=np.int16" ) request.node.add_marker(mark) @@ -444,7 +454,7 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): with pytest.raises(TypeError, match=msg): obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) - msg = "cannot insert DatetimeArray with incompatible label" + msg = "value should be a 'Timestamp' or 'NaT'. Got 'int' instead." with pytest.raises(TypeError, match=msg): obj.insert(1, 1) @@ -461,12 +471,12 @@ def test_insert_index_timedelta64(self): ) # ToDo: must coerce to object - msg = "cannot insert TimedeltaArray with incompatible label" + msg = "value should be a 'Timedelta' or 'NaT'. Got 'Timestamp' instead." with pytest.raises(TypeError, match=msg): obj.insert(1, pd.Timestamp("2012-01-01")) # ToDo: must coerce to object - msg = "cannot insert TimedeltaArray with incompatible label" + msg = "value should be a 'Timedelta' or 'NaT'. Got 'int' instead." 
with pytest.raises(TypeError, match=msg): obj.insert(1, 1) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index ad71b6b72df33..5e00056c33db7 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -302,7 +302,7 @@ def test_loc_getitem_across_dst(self): idx = pd.date_range( "2017-10-29 01:30:00", tz="Europe/Berlin", periods=5, freq="30 min" ) - series2 = pd.Series([0, 1, 2, 3, 4], index=idx) + series2 = Series([0, 1, 2, 3, 4], index=idx) t_1 = pd.Timestamp( "2017-10-29 02:30:00+02:00", tz="Europe/Berlin", freq="30min" @@ -311,7 +311,7 @@ def test_loc_getitem_across_dst(self): "2017-10-29 02:00:00+01:00", tz="Europe/Berlin", freq="30min" ) result = series2.loc[t_1:t_2] - expected = pd.Series([2, 3], index=idx[2:4]) + expected = Series([2, 3], index=idx[2:4]) tm.assert_series_equal(result, expected) result = series2[t_1] @@ -322,10 +322,10 @@ def test_loc_incremental_setitem_with_dst(self): # GH 20724 base = datetime(2015, 11, 1, tzinfo=tz.gettz("US/Pacific")) idxs = [base + timedelta(seconds=i * 900) for i in range(16)] - result = pd.Series([0], index=[idxs[0]]) + result = Series([0], index=[idxs[0]]) for ts in idxs: result.loc[ts] = 1 - expected = pd.Series(1, index=idxs) + expected = Series(1, index=idxs) tm.assert_series_equal(result, expected) def test_loc_setitem_with_existing_dst(self): @@ -334,9 +334,9 @@ def test_loc_setitem_with_existing_dst(self): end = pd.Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") ts = pd.Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") idx = pd.date_range(start, end, closed="left", freq="H") - result = pd.DataFrame(index=idx, columns=["value"]) + result = DataFrame(index=idx, columns=["value"]) result.loc[ts, "value"] = 12 - expected = pd.DataFrame( + expected = DataFrame( [np.nan] * len(idx) + [12], index=idx.append(pd.DatetimeIndex([ts])), columns=["value"], @@ -372,7 +372,7 @@ def test_loc_label_slicing(self): ) def test_getitem_slice_date(self, slice_, positions): # https://github.com/pandas-dev/pandas/issues/31501 - s = pd.Series( + s = Series( [0, 1, 2], pd.DatetimeIndex(["2019-01-01", "2019-01-01T06:00:00", "2019-01-02"]), ) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index d3d455f83c41a..4ef6463fd9e31 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -51,7 +51,7 @@ class TestiLoc2: def test_is_scalar_access(self): # GH#32085 index with duplicates doesnt matter for _is_scalar_access index = pd.Index([1, 2, 1]) - ser = pd.Series(range(3), index=index) + ser = Series(range(3), index=index) assert ser.iloc._is_scalar_access((1,)) @@ -195,7 +195,7 @@ def test_iloc_array_not_mutating_negative_indices(self): # GH 21867 array_with_neg_numbers = np.array([1, 2, -1]) array_copy = array_with_neg_numbers.copy() - df = pd.DataFrame( + df = DataFrame( {"A": [100, 101, 102], "B": [103, 104, 105], "C": [106, 107, 108]}, index=[1, 2, 3], ) @@ -372,7 +372,7 @@ def test_iloc_setitem_dups(self): def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): # Same as the "assign back to self" check in test_iloc_setitem_dups # but on a DataFrame with multiple blocks - df = pd.DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) + df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) df.iloc[:, 0] = df.iloc[:, 0].astype("f8") assert len(df._mgr.blocks) == 2 @@ -562,7 +562,7 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value): # assigning like "df.iloc[0, [0]] = 
['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". - df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.iloc[0, indexer] = value result = df.iloc[0, 0] @@ -699,7 +699,7 @@ def test_indexing_zerodim_np_array(self): # GH24919 df = DataFrame([[1, 2], [3, 4]]) result = df.iloc[np.array(0)] - s = pd.Series([1, 2], name=0) + s = Series([1, 2], name=0) tm.assert_series_equal(result, s) def test_series_indexing_zerodim_np_array(self): @@ -712,7 +712,7 @@ def test_series_indexing_zerodim_np_array(self): def test_iloc_setitem_categorical_updates_inplace(self): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = pd.Categorical(["A", "B", "C"]) - df = pd.DataFrame({1: cat, 2: [1, 2, 3]}) + df = DataFrame({1: cat, 2: [1, 2, 3]}) # This should modify our original values in-place df.iloc[:, 0] = cat[::-1] @@ -739,12 +739,21 @@ def test_iloc_with_boolean_operation(self): expected = DataFrame([[0.0, 4.0], [8.0, 12.0], [4.0, 5.0], [6.0, np.nan]]) tm.assert_frame_equal(result, expected) + def test_iloc_getitem_singlerow_slice_categoricaldtype_gives_series(self): + # GH#29521 + df = DataFrame({"x": pd.Categorical("a b c d e".split())}) + result = df.iloc[0] + raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) + expected = Series(raw_cat, index=["x"], name=0, dtype="category") + + tm.assert_series_equal(result, expected) + class TestILocSetItemDuplicateColumns: def test_iloc_setitem_scalar_duplicate_columns(self): # GH#15686, duplicate columns and mixed dtype - df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) - df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) df = pd.concat([df1, df2], axis=1) df.iloc[0, 0] = -1 @@ -754,15 +763,15 @@ def test_iloc_setitem_scalar_duplicate_columns(self): def test_iloc_setitem_list_duplicate_columns(self): # GH#22036 setting with same-sized list - df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) + df = DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) df.iloc[:, 2] = ["str3"] - expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) + expected = DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) tm.assert_frame_equal(df, expected) def test_iloc_setitem_series_duplicate_columns(self): - df = pd.DataFrame( + df = DataFrame( np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"] ) df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index fd83f9ab29407..79834dc36ce7d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -329,7 +329,7 @@ def test_dups_fancy_indexing2(self): @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc]) def test_duplicate_int_indexing(self, case): # GH 17347 - s = pd.Series(range(3), index=[1, 1, 3]) + s = Series(range(3), index=[1, 1, 3]) expected = s[1] result = case(s)[[1]] tm.assert_series_equal(result, expected) @@ -991,16 +991,14 @@ def test_none_coercion_mixed_dtypes(self): def test_extension_array_cross_section(): # A cross-section of a homogeneous EA should be an EA - df = pd.DataFrame( + df = DataFrame( { "A": pd.core.arrays.integer_array([1, 2]), "B": pd.core.arrays.integer_array([3, 4]), }, index=["a", "b"], ) - expected = pd.Series( - pd.core.arrays.integer_array([1, 3]), 
index=["A", "B"], name="a" - ) + expected = Series(pd.core.arrays.integer_array([1, 3]), index=["A", "B"], name="a") result = df.loc["a"] tm.assert_series_equal(result, expected) @@ -1010,23 +1008,23 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): # all numeric columns -> numeric series - df = pd.DataFrame( + df = DataFrame( {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"] ) result = df.loc["a"] - expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a") + expected = Series([1, 1], dtype="Int64", index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] tm.assert_series_equal(result, expected) # mixed columns -> object series - df = pd.DataFrame( + df = DataFrame( {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])}, index=["a", "b"], ) result = df.loc["a"] - expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a") + expected = Series([1, "a"], dtype=object, index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] @@ -1035,7 +1033,7 @@ def test_extension_array_cross_section_converts(): def test_readonly_indices(): # GH#17192 iloc with read-only array raising TypeError - df = pd.DataFrame({"data": np.ones(100, dtype="float64")}) + df = DataFrame({"data": np.ones(100, dtype="float64")}) indices = np.array([1, 3, 6]) indices.flags.writeable = False @@ -1049,7 +1047,7 @@ def test_readonly_indices(): def test_1tuple_without_multiindex(): - ser = pd.Series(range(5)) + ser = Series(range(5)) key = (slice(3),) result = ser[key] @@ -1059,7 +1057,7 @@ def test_1tuple_without_multiindex(): def test_duplicate_index_mistyped_key_raises_keyerror(): # GH#29189 float_index.get_loc(None) should raise KeyError, not TypeError - ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) with pytest.raises(KeyError, match="None"): ser[None] @@ -1072,17 +1070,17 @@ def test_duplicate_index_mistyped_key_raises_keyerror(): def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): # GH 30567 - ser = pd.Series([None] * 10) + ser = Series([None] * 10) mask = [False] * 3 + [True] * 5 + [False] * 2 ser[mask] = range(5) result = ser - expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + expected = Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") tm.assert_series_equal(result, expected) def test_missing_labels_inside_loc_matched_in_error_message(): # GH34272 - s = pd.Series({"a": 1, "b": 2, "c": 3}) + s = Series({"a": 1, "b": 2, "c": 3}) error_message_regex = "missing_0.*missing_1.*missing_2" with pytest.raises(KeyError, match=error_message_regex): s.loc[["a", "b", "missing_0", "c", "missing_1", "missing_2"]] @@ -1092,7 +1090,7 @@ def test_many_missing_labels_inside_loc_error_message_limited(): # GH34272 n = 10000 missing_labels = [f"missing_{label}" for label in range(n)] - s = pd.Series({"a": 1, "b": 2, "c": 3}) + s = Series({"a": 1, "b": 2, "c": 3}) # regex checks labels between 4 and 9995 are replaced with ellipses error_message_regex = "missing_4.*\\.\\.\\..*missing_9995" with pytest.raises(KeyError, match=error_message_regex): @@ -1101,7 +1099,7 @@ def test_many_missing_labels_inside_loc_error_message_limited(): def test_long_text_missing_labels_inside_loc_error_message_limited(): # GH34272 - s = pd.Series({"a": 1, "b": 2, "c": 3}) + s = Series({"a": 1, "b": 2, "c": 3}) missing_labels = [f"long_missing_label_text_{i}" 
* 5 for i in range(3)] # regex checks for very long labels there are new lines between each error_message_regex = "long_missing_label_text_0.*\\\\n.*long_missing_label_text_1" @@ -1111,9 +1109,9 @@ def test_long_text_missing_labels_inside_loc_error_message_limited(): def test_setitem_categorical(): # https://github.com/pandas-dev/pandas/issues/35369 - df = pd.DataFrame({"h": pd.Series(list("mn")).astype("category")}) + df = DataFrame({"h": Series(list("mn")).astype("category")}) df.h = df.h.cat.reorder_categories(["n", "m"]) - expected = pd.DataFrame( + expected = DataFrame( {"h": pd.Categorical(["m", "n"]).reorder_categories(["n", "m"])} ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 9b9bca77e17ec..8fb418ab78307 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -8,7 +8,7 @@ from pandas.compat.numpy import is_numpy_dev import pandas as pd -from pandas import DataFrame, Series, Timestamp, date_range +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas._testing as tm from pandas.api.types import is_scalar from pandas.tests.indexing.common import Base @@ -632,7 +632,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". - df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.loc[0, indexer] = value result = df.loc[0, "A"] @@ -644,7 +644,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): ( ([0, 2], ["A", "B", "C", "D"]), 7, - pd.DataFrame( + DataFrame( [[7, 7, 7, 7], [3, 4, np.nan, np.nan], [7, 7, 7, 7]], columns=["A", "B", "C", "D"], ), @@ -652,7 +652,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): ( (1, ["C", "D"]), [7, 8], - pd.DataFrame( + DataFrame( [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], columns=["A", "B", "C", "D"], ), @@ -660,14 +660,14 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): ( (1, ["A", "B", "C"]), np.array([7, 8, 9], dtype=np.int64), - pd.DataFrame( + DataFrame( [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], columns=["A", "B", "C"] ), ), ( (slice(1, 3, None), ["B", "C", "D"]), [[7, 8, 9], [10, 11, 12]], - pd.DataFrame( + DataFrame( [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], columns=["A", "B", "C", "D"], ), @@ -675,15 +675,15 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): ( (slice(1, 3, None), ["C", "A", "D"]), np.array([[7, 8, 9], [10, 11, 12]], dtype=np.int64), - pd.DataFrame( + DataFrame( [[1, 2, np.nan, np.nan], [8, 4, 7, 9], [11, 6, 10, 12]], columns=["A", "B", "C", "D"], ), ), ( (slice(None, None, None), ["A", "C"]), - pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), - pd.DataFrame( + DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + DataFrame( [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] ), ), @@ -691,7 +691,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): ) def test_loc_setitem_missing_columns(self, index, box, expected): # GH 29334 - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) df.loc[index] = box tm.assert_frame_equal(df, expected) @@ -737,15 +737,15 @@ def test_setitem_new_key_tz(self): pd.to_datetime(42).tz_localize("UTC"), pd.to_datetime(666).tz_localize("UTC"), ] - expected = 
pd.Series(vals, index=["foo", "bar"]) + expected = Series(vals, index=["foo", "bar"]) - ser = pd.Series(dtype=object) + ser = Series(dtype=object) ser["foo"] = vals[0] ser["bar"] = vals[1] tm.assert_series_equal(ser, expected) - ser = pd.Series(dtype=object) + ser = Series(dtype=object) ser.loc["foo"] = vals[0] ser.loc["bar"] = vals[1] @@ -907,9 +907,7 @@ def test_loc_copy_vs_view(self): def test_loc_uint64(self): # GH20722 # Test whether loc accept uint64 max value as index. - s = pd.Series( - [1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max] - ) + s = Series([1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]) result = s.loc[np.iinfo("uint64").max - 1] expected = s.iloc[0] @@ -961,7 +959,7 @@ def test_indexing_zerodim_np_array(self): # GH24924 df = DataFrame([[1, 2], [3, 4]]) result = df.loc[np.array(0)] - s = pd.Series([1, 2], name=0) + s = Series([1, 2], name=0) tm.assert_series_equal(result, s) def test_series_indexing_zerodim_np_array(self): @@ -975,11 +973,95 @@ def test_loc_reverse_assignment(self): data = [1, 2, 3, 4, 5, 6] + [None] * 4 expected = Series(data, index=range(2010, 2020)) - result = pd.Series(index=range(2010, 2020), dtype=np.float64) + result = Series(index=range(2010, 2020), dtype=np.float64) result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] tm.assert_series_equal(result, expected) + def test_loc_setitem_str_to_small_float_conversion_type(self): + # GH#20388 + np.random.seed(13) + col_data = [str(np.random.random() * 1e-12) for _ in range(5)] + result = DataFrame(col_data, columns=["A"]) + expected = DataFrame(col_data, columns=["A"], dtype=object) + tm.assert_frame_equal(result, expected) + + # change the dtype of the elements from object to float one by one + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float) + tm.assert_frame_equal(result, expected) + + +class TestLocWithMultiIndex: + @pytest.mark.parametrize( + "keys, expected", + [ + (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), + (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), + ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), + ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), + ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), + ], + ) + @pytest.mark.parametrize("dim", ["index", "columns"]) + def test_loc_getitem_multilevel_index_order(self, dim, keys, expected): + # GH#22797 + # Try to respect order of keys given for MultiIndex.loc + kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} + df = DataFrame(np.arange(25).reshape(5, 5), **kwargs) + exp_index = MultiIndex.from_arrays(expected) + if dim == "index": + res = df.loc[keys, :] + tm.assert_index_equal(res.index, exp_index) + elif dim == "columns": + res = df.loc[:, keys] + tm.assert_index_equal(res.columns, exp_index) + + def test_loc_preserve_names(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + result = ymd.loc[2000] + result2 = ymd["A"].loc[2000] + assert result.index.names == ymd.index.names[1:] + assert result2.index.names == ymd.index.names[1:] + + result = ymd.loc[2000, 2] + result2 = ymd["A"].loc[2000, 2] + assert result.index.name == ymd.index.names[2] + assert result2.index.name == ymd.index.names[2] + + def test_loc_getitem_multiindex_nonunique_len_zero(self): + # GH#13691 + mi = 
MultiIndex.from_product([[0], [1, 1]]) + ser = Series(0, index=mi) + + res = ser.loc[[]] + + expected = ser[:0] + tm.assert_series_equal(res, expected) + + res2 = ser.loc[ser.iloc[0:0]] + tm.assert_series_equal(res2, expected) + + def test_loc_getitem_access_none_value_in_multiindex(self): + # GH#34318: test that you can access a None value using .loc + # through a Multiindex + + ser = Series([None], pd.MultiIndex.from_arrays([["Level1"], ["Level2"]])) + result = ser.loc[("Level1", "Level2")] + assert result is None + + midx = MultiIndex.from_product([["Level1"], ["Level2_a", "Level2_b"]]) + ser = Series([None] * len(midx), dtype=object, index=midx) + result = ser.loc[("Level1", "Level2_a")] + assert result is None + + ser = Series([1] * len(midx), dtype=object, index=midx) + result = ser.loc[("Level1", "Level2_a")] + assert result == 1 + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 @@ -1012,13 +1094,13 @@ def test_loc_getitem_label_list_integer_labels( def test_loc_setitem_float_intindex(): # GH 8720 rand_data = np.random.randn(8, 4) - result = pd.DataFrame(rand_data) + result = DataFrame(rand_data) result.loc[:, 0.5] = np.nan expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) - expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + expected = DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) tm.assert_frame_equal(result, expected) - result = pd.DataFrame(rand_data) + result = DataFrame(rand_data) result.loc[:, 0.5] = np.nan tm.assert_frame_equal(result, expected) @@ -1026,13 +1108,13 @@ def test_loc_setitem_float_intindex(): def test_loc_axis_1_slice(): # GH 10586 cols = [(yr, m) for yr in [2014, 2015] for m in [7, 8, 9, 10]] - df = pd.DataFrame( + df = DataFrame( np.ones((10, 8)), index=tuple("ABCDEFGHIJ"), columns=pd.MultiIndex.from_tuples(cols), ) result = df.loc(axis=1)[(2014, 9):(2015, 8)] - expected = pd.DataFrame( + expected = DataFrame( np.ones((10, 4)), index=tuple("ABCDEFGHIJ"), columns=pd.MultiIndex.from_tuples( @@ -1044,7 +1126,7 @@ def test_loc_axis_1_slice(): def test_loc_set_dataframe_multiindex(): # GH 14592 - expected = pd.DataFrame( + expected = DataFrame( "a", index=range(2), columns=pd.MultiIndex.from_product([range(2), range(2)]) ) result = expected.copy() @@ -1054,7 +1136,7 @@ def test_loc_set_dataframe_multiindex(): def test_loc_mixed_int_float(): # GH#19456 - ser = pd.Series(range(2), pd.Index([1, 2.0], dtype=object)) + ser = Series(range(2), pd.Index([1, 2.0], dtype=object)) result = ser.loc[1] assert result == 0 @@ -1062,19 +1144,19 @@ def test_loc_mixed_int_float(): def test_loc_with_positional_slice_deprecation(): # GH#31840 - ser = pd.Series(range(4), index=["A", "B", "C", "D"]) + ser = Series(range(4), index=["A", "B", "C", "D"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): ser.loc[:3] = 2 - expected = pd.Series([2, 2, 2, 3], index=["A", "B", "C", "D"]) + expected = Series([2, 2, 2, 3], index=["A", "B", "C", "D"]) tm.assert_series_equal(ser, expected) def test_loc_slice_disallows_positional(): # GH#16121, GH#24612, GH#31810 dti = pd.date_range("2016-01-01", periods=3) - df = pd.DataFrame(np.random.random((3, 2)), index=dti) + df = DataFrame(np.random.random((3, 2)), index=dti) ser = df[0] @@ -1102,7 +1184,7 @@ def test_loc_slice_disallows_positional(): def test_loc_datetimelike_mismatched_dtypes(): # GH#32650 dont mix and match datetime/timedelta/period dtypes - df = pd.DataFrame( + df = DataFrame( np.random.randn(5, 3), columns=["a", "b", "c"], 
index=pd.date_range("2012", freq="H", periods=5), @@ -1124,7 +1206,7 @@ def test_loc_datetimelike_mismatched_dtypes(): def test_loc_with_period_index_indexer(): # GH#4125 idx = pd.period_range("2002-01", "2003-12", freq="M") - df = pd.DataFrame(np.random.randn(24, 10), index=idx) + df = DataFrame(np.random.randn(24, 10), index=idx) tm.assert_frame_equal(df, df.loc[idx]) tm.assert_frame_equal(df, df.loc[list(idx)]) tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 337ec683ee745..80b7947eb5239 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -335,7 +335,7 @@ def test_partial_set_invalid(self): df = orig.copy() # don't allow not string inserts - msg = "cannot insert DatetimeArray with incompatible label" + msg = r"value should be a 'Timestamp' or 'NaT'\. Got '.*' instead\." with pytest.raises(TypeError, match=msg): df.loc[100.0, :] = df.iloc[0] @@ -663,21 +663,42 @@ def test_indexing_timeseries_regression(self): def test_index_name_empty(self): # GH 31368 - df = pd.DataFrame({}, index=pd.RangeIndex(0, name="df_index")) - series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + df = DataFrame({}, index=pd.RangeIndex(0, name="df_index")) + series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) df["series"] = series - expected = pd.DataFrame( + expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") ) tm.assert_frame_equal(df, expected) # GH 36527 - df = pd.DataFrame() - series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + df = DataFrame() + series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) df["series"] = series - expected = pd.DataFrame( + expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") ) tm.assert_frame_equal(df, expected) + + def test_slice_irregular_datetime_index_with_nan(self): + # GH36953 + index = pd.to_datetime(["2012-01-01", "2012-01-02", "2012-01-03", None]) + df = DataFrame(range(len(index)), index=index) + expected = DataFrame(range(len(index[:3])), index=index[:3]) + result = df["2012-01-01":"2012-01-04"] + tm.assert_frame_equal(result, expected) + + def test_slice_datetime_index(self): + # GH35509 + df = DataFrame( + {"col1": ["a", "b", "c"], "col2": [1, 2, 3]}, + index=pd.to_datetime(["2020-08-01", "2020-07-02", "2020-08-05"]), + ) + expected = DataFrame( + {"col1": ["a", "c"], "col2": [1, 3]}, + index=pd.to_datetime(["2020-08-01", "2020-08-05"]), + ) + result = df.loc["2020-08"] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index cd2f5a903d8cc..9d1a2bed5db12 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,6 +1,5 @@ from datetime import date, datetime import itertools -import operator import re import numpy as np @@ -613,7 +612,7 @@ def test_reindex_items(self): reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 2 - tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) + tm.assert_index_equal(reindexed.items, Index(["g", "c", "a", "d"])) tm.assert_almost_equal( mgr.iget(6).internal_values(), reindexed.iget(0).internal_values() ) @@ -636,9 +635,7 @@ def test_get_numeric_data(self): mgr.iset(5, np.array([1, 2, 3], dtype=np.object_)) numeric = mgr.get_numeric_data() - tm.assert_index_equal( - numeric.items, 
pd.Index(["int", "float", "complex", "bool"]) - ) + tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("float")).internal_values(), numeric.iget(numeric.items.get_loc("float")).internal_values(), @@ -652,9 +649,7 @@ def test_get_numeric_data(self): ) numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal( - numeric.items, pd.Index(["int", "float", "complex", "bool"]) - ) + tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) numeric2.iset( numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0]) ) @@ -672,7 +667,7 @@ def test_get_bool_data(self): mgr.iset(6, np.array([True, False, True], dtype=np.object_)) bools = mgr.get_bool_data() - tm.assert_index_equal(bools.items, pd.Index(["bool"])) + tm.assert_index_equal(bools.items, Index(["bool"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), bools.iget(bools.items.get_loc("bool")).internal_values(), @@ -840,14 +835,12 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): tm.assert_index_equal(reindexed.axes[axis], new_labels) for ax in range(mgr.ndim): - assert_reindex_axis_is_ok(mgr, ax, pd.Index([]), fill_value) + assert_reindex_axis_is_ok(mgr, ax, Index([]), fill_value) assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value) assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value) + assert_reindex_axis_is_ok(mgr, ax, Index(["foo", "bar", "baz"]), fill_value) assert_reindex_axis_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), fill_value - ) - assert_reindex_axis_is_ok( - mgr, ax, pd.Index(["foo", mgr.axes[ax][0], "baz"]), fill_value + mgr, ax, Index(["foo", mgr.axes[ax][0], "baz"]), fill_value ) if mgr.shape[ax] >= 3: @@ -872,14 +865,14 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): tm.assert_index_equal(reindexed.axes[axis], new_labels) for ax in range(mgr.ndim): - assert_reindex_indexer_is_ok(mgr, ax, pd.Index([]), [], fill_value) + assert_reindex_indexer_is_ok(mgr, ax, Index([]), [], fill_value) assert_reindex_indexer_is_ok( mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value ) assert_reindex_indexer_is_ok( mgr, ax, - pd.Index(["foo"] * mgr.shape[ax]), + Index(["foo"] * mgr.shape[ax]), np.arange(mgr.shape[ax]), fill_value, ) @@ -890,22 +883,22 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value + mgr, ax, Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value + mgr, ax, Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value ) assert_reindex_indexer_is_ok( mgr, ax, - pd.Index(["foo", mgr.axes[ax][0], "baz"]), + Index(["foo", mgr.axes[ax][0], "baz"]), [-1, -1, -1], fill_value, ) if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value + mgr, ax, Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value ) @@ -1048,31 +1041,6 @@ def test_blockplacement_add_int_raises(self, val): BlockPlacement(val).add(-10) -class DummyElement: - def __init__(self, value, dtype): - self.value = value - self.dtype = np.dtype(dtype) - - def __array__(self): - return np.array(self.value, dtype=self.dtype) - - def __str__(self) -> str: - return f"DummyElement({self.value}, 
{self.dtype})" - - def __repr__(self) -> str: - return str(self) - - def astype(self, dtype, copy=False): - self.dtype = dtype - return self - - def view(self, dtype): - return type(self)(self.value.view(dtype), dtype) - - def any(self, axis=None): - return bool(self.value) - - class TestCanHoldElement: def test_datetime_block_can_hold_element(self): block = create_block("datetime", [0]) @@ -1095,88 +1063,17 @@ def test_datetime_block_can_hold_element(self): assert not block._can_hold_element(val) msg = ( - "'value' should be a 'Timestamp', 'NaT', " + "value should be a 'Timestamp', 'NaT', " "or array of those. Got 'date' instead." ) with pytest.raises(TypeError, match=msg): arr[0] = val - @pytest.mark.parametrize( - "value, dtype", - [ - (1, "i8"), - (1.0, "f8"), - (2 ** 63, "f8"), - (1j, "complex128"), - (2 ** 63, "complex128"), - (True, "bool"), - (np.timedelta64(20, "ns"), " 43 chars.", "dog"), ] ) - df = pd.DataFrame(1, index=index, columns=columns) + df = DataFrame(1, index=index, columns=columns) result = repr(df) @@ -398,7 +381,7 @@ def test_repr_truncates_terminal_size(self, monkeypatch): assert "dog" in h2 # regular columns - df2 = pd.DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) + df2 = DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) result = repr(df2) assert df2.columns[0] in result.split("\n")[0] @@ -406,7 +389,7 @@ def test_repr_truncates_terminal_size(self, monkeypatch): def test_repr_truncates_terminal_size_full(self, monkeypatch): # GH 22984 ensure entire window is filled terminal_size = (80, 24) - df = pd.DataFrame(np.random.rand(1, 7)) + df = DataFrame(np.random.rand(1, 7)) monkeypatch.setattr( "pandas.io.formats.format.get_terminal_size", lambda: terminal_size @@ -416,7 +399,7 @@ def test_repr_truncates_terminal_size_full(self, monkeypatch): def test_repr_truncation_column_size(self): # dataframe with last column very wide -> check it is not used to # determine size of truncation (...) column - df = pd.DataFrame( + df = DataFrame( { "a": [108480, 30830], "b": [12345, 12345], @@ -474,13 +457,13 @@ def mkframe(n): assert has_expanded_repr(df) def test_repr_min_rows(self): - df = pd.DataFrame({"a": range(20)}) + df = DataFrame({"a": range(20)}) # default setting no truncation even if above min_rows assert ".." not in repr(df) assert ".." not in df._repr_html_() - df = pd.DataFrame({"a": range(61)}) + df = DataFrame({"a": range(61)}) # default of max_rows 60 triggers truncation if above assert ".." 
in repr(df) @@ -510,7 +493,7 @@ def test_repr_min_rows(self): def test_str_max_colwidth(self): # GH 7856 - df = pd.DataFrame( + df = DataFrame( [ { "a": "foo", @@ -534,45 +517,6 @@ def test_str_max_colwidth(self): "1 foo bar stuff 1" ) - def test_to_string_truncate(self): - # GH 9784 - dont truncate when calling DataFrame.to_string - df = pd.DataFrame( - [ - { - "a": "foo", - "b": "bar", - "c": "let's make this a very VERY long line that is longer " - "than the default 50 character limit", - "d": 1, - }, - {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, - ] - ) - df.set_index(["a", "b", "c"]) - assert df.to_string() == ( - " a b " - " c d\n" - "0 foo bar let's make this a very VERY long line t" - "hat is longer than the default 50 character limit 1\n" - "1 foo bar " - " stuff 1" - ) - with option_context("max_colwidth", 20): - # the display option has no effect on the to_string method - assert df.to_string() == ( - " a b " - " c d\n" - "0 foo bar let's make this a very VERY long line t" - "hat is longer than the default 50 character limit 1\n" - "1 foo bar " - " stuff 1" - ) - assert df.to_string(max_colwidth=20) == ( - " a b c d\n" - "0 foo bar let's make this ... 1\n" - "1 foo bar stuff 1" - ) - def test_auto_detect(self): term_width, term_height = get_terminal_size() fac = 1.05 # Arbitrary large factor to exceed term width @@ -633,95 +577,6 @@ def test_to_string_repr_unicode(self): finally: sys.stdin = _stdin - def test_to_string_unicode_columns(self, float_frame): - df = DataFrame({"\u03c3": np.arange(10.0)}) - - buf = StringIO() - df.to_string(buf=buf) - buf.getvalue() - - buf = StringIO() - df.info(buf=buf) - buf.getvalue() - - result = float_frame.to_string() - assert isinstance(result, str) - - def test_to_string_utf8_columns(self): - n = "\u05d0".encode() - - with option_context("display.max_rows", 1): - df = DataFrame([1, 2], columns=[n]) - repr(df) - - def test_to_string_unicode_two(self): - dm = DataFrame({"c/\u03c3": []}) - buf = StringIO() - dm.to_string(buf) - - def test_to_string_unicode_three(self): - dm = DataFrame(["\xc2"]) - buf = StringIO() - dm.to_string(buf) - - def test_to_string_with_formatters(self): - df = DataFrame( - { - "int": [1, 2, 3], - "float": [1.0, 2.0, 3.0], - "object": [(1, 2), True, False], - }, - columns=["int", "float", "object"], - ) - - formatters = [ - ("int", lambda x: f"0x{x:x}"), - ("float", lambda x: f"[{x: 4.1f}]"), - ("object", lambda x: f"-{x!s}-"), - ] - result = df.to_string(formatters=dict(formatters)) - result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == ( - " int float object\n" - "0 0x1 [ 1.0] -(1, 2)-\n" - "1 0x2 [ 2.0] -True-\n" - "2 0x3 [ 3.0] -False-" - ) - assert result == result2 - - def test_to_string_with_datetime64_monthformatter(self): - months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({"months": months}) - - def format_func(x): - return x.strftime("%Y-%m") - - result = x.to_string(formatters={"months": format_func}) - expected = "months\n0 2016-01\n1 2016-02" - assert result.strip() == expected - - def test_to_string_with_datetime64_hourformatter(self): - - x = DataFrame( - { - "hod": pd.to_datetime( - ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" - ) - } - ) - - def format_func(x): - return x.strftime("%H:%M") - - result = x.to_string(formatters={"hod": format_func}) - expected = "hod\n0 10:10\n1 12:12" - assert result.strip() == expected - - def test_to_string_with_formatters_unicode(self): - df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = 
df.to_string(formatters={"c/\u03c3": str}) - assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" - def test_east_asian_unicode_false(self): # not aligned properly because of east asian width @@ -788,7 +643,7 @@ def test_east_asian_unicode_false(self): # index name df = DataFrame( {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, - index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + index=Index(["あ", "い", "うう", "え"], name="おおおお"), ) expected = ( " a b\n" @@ -803,7 +658,7 @@ def test_east_asian_unicode_false(self): # all df = DataFrame( {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, - index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + index=Index(["あ", "いいい", "うう", "え"], name="お"), ) expected = ( " あああ いいいいい\n" @@ -834,7 +689,7 @@ def test_east_asian_unicode_false(self): # truncate with option_context("display.max_rows", 3, "display.max_columns", 3): - df = pd.DataFrame( + df = DataFrame( { "a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"], @@ -932,7 +787,7 @@ def test_east_asian_unicode_true(self): # index name df = DataFrame( {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, - index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + index=Index(["あ", "い", "うう", "え"], name="おおおお"), ) expected = ( " a b\n" @@ -947,7 +802,7 @@ def test_east_asian_unicode_true(self): # all df = DataFrame( {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, - index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + index=Index(["あ", "いいい", "うう", "え"], name="お"), ) expected = ( " あああ いいいいい\n" @@ -979,7 +834,7 @@ def test_east_asian_unicode_true(self): # truncate with option_context("display.max_rows", 3, "display.max_columns", 3): - df = pd.DataFrame( + df = DataFrame( { "a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"], @@ -1165,7 +1020,7 @@ def test_datetimelike_frame(self): assert "[6 rows x 1 columns]" in result dts = [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [pd.NaT] * 5 - df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) with option_context("display.max_rows", 5): expected = ( " dt x\n" @@ -1179,7 +1034,7 @@ def test_datetimelike_frame(self): assert repr(df) == expected dts = [pd.NaT] * 5 + [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 - df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) with option_context("display.max_rows", 5): expected = ( " dt x\n" @@ -1195,7 +1050,7 @@ def test_datetimelike_frame(self): dts = [pd.Timestamp("2011-01-01", tz="Asia/Tokyo")] * 5 + [ pd.Timestamp("2011-01-01", tz="US/Eastern") ] * 5 - df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + df = DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) with option_context("display.max_rows", 5): expected = ( " dt x\n" @@ -2146,14 +2001,14 @@ def test_categorical_columns(self): # GH35439 data = [[4, 2], [3, 2], [4, 3]] cols = ["aaaaaaaaa", "b"] - df = pd.DataFrame(data, columns=cols) - df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols)) + df = DataFrame(data, columns=cols) + df_cat_cols = DataFrame(data, columns=pd.CategoricalIndex(cols)) assert df.to_string() == df_cat_cols.to_string() def test_period(self): # GH 12615 - df = pd.DataFrame( + df = DataFrame( { "A": pd.period_range("2013-01", periods=4, freq="M"), "B": [ @@ -2176,9 +2031,9 @@ def test_period(self): def gen_series_formatting(): - s1 = 
pd.Series(["a"] * 100) - s2 = pd.Series(["ab"] * 100) - s3 = pd.Series(["a", "ab", "abc", "abcd", "abcde", "abcdef"]) + s1 = Series(["a"] * 100) + s2 = Series(["ab"] * 100) + s3 = Series(["a", "ab", "abc", "abcd", "abcde", "abcdef"]) s4 = s3[::-1] test_sers = {"onel": s1, "twol": s2, "asc": s3, "desc": s4} return test_sers @@ -2674,7 +2529,7 @@ def test_max_multi_index_display(self): # Make sure #8532 is fixed def test_consistent_format(self): - s = pd.Series([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9999, 1, 1] * 10) + s = Series([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9999, 1, 1] * 10) with option_context("display.max_rows", 10, "display.show_dimensions", False): res = repr(s) exp = ( @@ -2763,12 +2618,12 @@ def test_show_dimensions(self): assert "Length" not in repr(s) def test_repr_min_rows(self): - s = pd.Series(range(20)) + s = Series(range(20)) # default setting no truncation even if above min_rows assert ".." not in repr(s) - s = pd.Series(range(61)) + s = Series(range(61)) # default of max_rows 60 triggers truncation if above assert ".." in repr(s) @@ -2816,19 +2671,19 @@ def test_to_string_length(self): assert res == exp def test_to_string_na_rep(self): - s = pd.Series(index=range(100), dtype=np.float64) + s = Series(index=range(100), dtype=np.float64) res = s.to_string(na_rep="foo", max_rows=2) exp = "0 foo\n ..\n99 foo" assert res == exp def test_to_string_float_format(self): - s = pd.Series(range(10), dtype="float64") + s = Series(range(10), dtype="float64") res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) exp = "0 0.0\n ..\n9 9.0" assert res == exp def test_to_string_header(self): - s = pd.Series(range(10), dtype="int64") + s = Series(range(10), dtype="int64") s.index.name = "foo" res = s.to_string(header=True, max_rows=2) exp = "foo\n0 0\n ..\n9 9" @@ -2839,16 +2694,14 @@ def test_to_string_header(self): def test_to_string_multindex_header(self): # GH 16718 - df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( - ["a", "b"] - ) + df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index(["a", "b"]) res = df.to_string(header=["r1", "r2"]) exp = " r1 r2\na b \n0 1 2 3" assert res == exp def test_to_string_empty_col(self): # GH 13653 - s = pd.Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) + s = Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) res = s.to_string(index=False) exp = " \n Hello\n World\n \n \nMooooo\n \n " assert re.match(exp, res) @@ -2905,7 +2758,7 @@ def __getitem__(self, ix): def dtype(self): return DtypeStub() - series = pd.Series(ExtTypeStub()) + series = Series(ExtTypeStub()) res = repr(series) # This line crashed before #33770 was fixed. 
expected = "0 [False True]\n" + "1 [ True False]\n" + "dtype: DtypeStub" assert res == expected @@ -2932,7 +2785,7 @@ def test_output_display_precision_trailing_zeroes(self): # Happens when display precision is set to zero with pd.option_context("display.precision", 0): - s = pd.Series([840.0, 4200.0]) + s = Series([840.0, 4200.0]) expected_output = "0 840\n1 4200\ndtype: float64" assert str(s) == expected_output @@ -2942,7 +2795,7 @@ def test_output_significant_digits(self): # In case default display precision changes: with pd.option_context("display.precision", 6): # DataFrame example from issue #9764 - d = pd.DataFrame( + d = DataFrame( { "col1": [ 9.999e-8, @@ -3014,11 +2867,11 @@ def test_too_long(self): with pd.option_context("display.precision", 4): # need both a number > 1e6 and something that normally formats to # having length > display.precision + 6 - df = pd.DataFrame(dict(x=[12345.6789])) + df = DataFrame(dict(x=[12345.6789])) assert str(df) == " x\n0 12345.6789" - df = pd.DataFrame(dict(x=[2e6])) + df = DataFrame(dict(x=[2e6])) assert str(df) == " x\n0 2000000.0" - df = pd.DataFrame(dict(x=[12345.6789, 2e6])) + df = DataFrame(dict(x=[12345.6789, 2e6])) assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" @@ -3350,8 +3203,8 @@ def test_format_percentiles_integer_idx(): def test_repr_html_ipython_config(ip): code = textwrap.dedent( """\ - import pandas as pd - df = pd.DataFrame({"A": [1, 2]}) + from pandas import DataFrame + df = DataFrame({"A": [1, 2]}) df._repr_html_() cfg = get_ipython().config @@ -3398,37 +3251,3 @@ def test_filepath_or_buffer_bad_arg_raises(float_frame, method): msg = "buf is not a file name and it has no write method" with pytest.raises(TypeError, match=msg): getattr(float_frame, method)(buf=object()) - - -@pytest.mark.parametrize( - "input_array, expected", - [ - ("a", "a"), - (["a", "b"], "a\nb"), - ([1, "a"], "1\na"), - (1, "1"), - ([0, -1], " 0\n-1"), - (1.0, "1.0"), - ([" a", " b"], " a\n b"), - ([".1", "1"], ".1\n 1"), - (["10", "-10"], " 10\n-10"), - ], -) -def test_format_remove_leading_space_series(input_array, expected): - # GH: 24980 - s = pd.Series(input_array).to_string(index=False) - assert s == expected - - -@pytest.mark.parametrize( - "input_array, expected", - [ - ({"A": ["a"]}, "A\na"), - ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), - ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), - ], -) -def test_format_remove_leading_space_dataframe(input_array, expected): - # GH: 24980 - df = pd.DataFrame(input_array).to_string(index=False) - assert df == expected diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index d98530b5435e7..8c2155aec7248 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import PYPY +from pandas.compat import IS64, PYPY from pandas import ( CategoricalIndex, @@ -51,6 +51,20 @@ def datetime_frame(): return DataFrame(tm.getTimeSeriesData()) +def test_info_empty(): + df = DataFrame() + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + Index: 0 entries + Empty DataFrame""" + ) + assert result == expected + + def test_info_categorical_column(): # make sure it works @@ -459,3 +473,26 @@ def test_info_categorical(): buf = StringIO() df.info(buf=buf) + + +@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") +def test_info_int_columns(): + # GH#37245 + df = DataFrame({1: [1, 2], 2: [2, 3]}, 
index=["A", "B"]) + buf = StringIO() + df.info(null_counts=True, buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + Index: 2 entries, A to B + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 1 2 non-null int64 + 1 2 2 non-null int64 + dtypes: int64(2) + memory usage: 48.0+ bytes + """ + ) + assert result == expected diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 476d75f7d239d..79f9bbace000e 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -28,10 +28,10 @@ def h(x, foo="bar"): self.h = h self.styler = Styler(self.df) - self.attrs = pd.DataFrame({"A": ["color: red", "color: blue"]}) + self.attrs = DataFrame({"A": ["color: red", "color: blue"]}) self.dataframes = [ self.df, - pd.DataFrame( + DataFrame( {"f": [1.0, 2.0], "o": ["a", "b"], "c": pd.Categorical(["a", "b"])} ), ] @@ -110,7 +110,7 @@ def test_clear(self): assert len(s._todo) == 0 def test_render(self): - df = pd.DataFrame({"A": [0, 1]}) + df = DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) s = Styler(df, uuid="AB").apply(style) s.render() @@ -127,7 +127,7 @@ def test_render_empty_dfs(self): # No IndexError raised? def test_render_double(self): - df = pd.DataFrame({"A": [0, 1]}) + df = DataFrame({"A": [0, 1]}) style = lambda x: pd.Series( ["color: red; border: 1px", "color: blue; border: 2px"], name=x.name ) @@ -136,7 +136,7 @@ def test_render_double(self): # it worked? def test_set_properties(self): - df = pd.DataFrame({"A": [0, 1]}) + df = DataFrame({"A": [0, 1]}) result = df.style.set_properties(color="white", size="10px")._compute().ctx # order is deterministic v = ["color: white", "size: 10px"] @@ -146,7 +146,7 @@ def test_set_properties(self): assert sorted(v1) == sorted(v2) def test_set_properties_subset(self): - df = pd.DataFrame({"A": [0, 1]}) + df = DataFrame({"A": [0, 1]}) result = ( df.style.set_properties(subset=pd.IndexSlice[0, "A"], color="white") ._compute() @@ -157,7 +157,7 @@ def test_set_properties_subset(self): def test_empty_index_name_doesnt_display(self): # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.style._translate() expected = [ @@ -197,7 +197,7 @@ def test_empty_index_name_doesnt_display(self): def test_index_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.set_index("A").style._translate() expected = [ @@ -235,7 +235,7 @@ def test_index_name(self): def test_multiindex_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.set_index(["A", "B"]).style._translate() expected = [ @@ -274,11 +274,11 @@ def test_multiindex_name(self): def test_numeric_columns(self): # https://github.com/pandas-dev/pandas/issues/12125 # smoke test for _translate - df = pd.DataFrame({0: [1, 2, 3]}) + df = DataFrame({0: [1, 2, 3]}) df.style._translate() def test_apply_axis(self): - df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) + df = DataFrame({"A": [0, 0], "B": [1, 1]}) f = lambda x: [f"val: {x.max()}" for v in x] result = 
df.style.apply(f, axis=1) assert len(result._todo) == 1 @@ -373,7 +373,7 @@ def color_negative_red(val): } idx = pd.IndexSlice - df = pd.DataFrame(dic, index=[0, 1]) + df = DataFrame(dic, index=[0, 1]) (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) @@ -468,7 +468,7 @@ def g(x): assert result == expected def test_empty(self): - df = pd.DataFrame({"A": [1, 0]}) + df = DataFrame({"A": [1, 0]}) s = df.style s.ctx = {(0, 0): ["color: red"], (1, 0): [""]} @@ -480,7 +480,7 @@ def test_empty(self): assert result == expected def test_duplicate(self): - df = pd.DataFrame({"A": [1, 0]}) + df = DataFrame({"A": [1, 0]}) s = df.style s.ctx = {(0, 0): ["color: red"], (1, 0): ["color: red"]} @@ -491,7 +491,7 @@ def test_duplicate(self): assert result == expected def test_bar_align_left(self): - df = pd.DataFrame({"A": [0, 1, 2]}) + df = DataFrame({"A": [0, 1, 2]}) result = df.style.bar()._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -534,7 +534,7 @@ def test_bar_align_left(self): assert result == expected def test_bar_align_left_0points(self): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) result = df.style.bar()._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -620,7 +620,7 @@ def test_bar_align_left_0points(self): assert result == expected def test_bar_align_mid_pos_and_neg(self): - df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + df = DataFrame({"A": [-10, 0, 20, 90]}) result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx @@ -652,7 +652,7 @@ def test_bar_align_mid_pos_and_neg(self): assert result == expected def test_bar_align_mid_all_pos(self): - df = pd.DataFrame({"A": [10, 20, 50, 100]}) + df = DataFrame({"A": [10, 20, 50, 100]}) result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx @@ -686,7 +686,7 @@ def test_bar_align_mid_all_pos(self): assert result == expected def test_bar_align_mid_all_neg(self): - df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + df = DataFrame({"A": [-100, -60, -30, -20]}) result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx @@ -726,7 +726,7 @@ def test_bar_align_mid_all_neg(self): def test_bar_align_zero_pos_and_neg(self): # See https://github.com/pandas-dev/pandas/pull/14757 - df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + df = DataFrame({"A": [-10, 0, 20, 90]}) result = ( df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90) @@ -760,7 +760,7 @@ def test_bar_align_zero_pos_and_neg(self): assert result == expected def test_bar_align_left_axis_none(self): - df = pd.DataFrame({"A": [0, 1], "B": [2, 4]}) + df = DataFrame({"A": [0, 1], "B": [2, 4]}) result = df.style.bar(axis=None)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -786,7 +786,7 @@ def test_bar_align_left_axis_none(self): assert result == expected def test_bar_align_zero_axis_none(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -815,7 +815,7 @@ def test_bar_align_zero_axis_none(self): assert result == expected def test_bar_align_mid_axis_none(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -843,7 +843,7 @@ def 
test_bar_align_mid_axis_none(self): assert result == expected def test_bar_align_mid_vmin(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -872,7 +872,7 @@ def test_bar_align_mid_vmin(self): assert result == expected def test_bar_align_mid_vmax(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -900,7 +900,7 @@ def test_bar_align_mid_vmax(self): assert result == expected def test_bar_align_mid_vmin_vmax_wide(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -929,7 +929,7 @@ def test_bar_align_mid_vmin_vmax_wide(self): assert result == expected def test_bar_align_mid_vmin_vmax_clipping(self): - df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + df = DataFrame({"A": [0, 1], "B": [-2, 4]}) result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx expected = { (0, 0): ["width: 10em", " height: 80%"], @@ -957,7 +957,7 @@ def test_bar_align_mid_vmin_vmax_clipping(self): assert result == expected def test_bar_align_mid_nans(self): - df = pd.DataFrame({"A": [1, None], "B": [-1, 3]}) + df = DataFrame({"A": [1, None], "B": [-1, 3]}) result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { (0, 0): [ @@ -984,7 +984,7 @@ def test_bar_align_mid_nans(self): assert result == expected def test_bar_align_zero_nans(self): - df = pd.DataFrame({"A": [1, None], "B": [-1, 2]}) + df = DataFrame({"A": [1, None], "B": [-1, 2]}) result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { (0, 0): [ @@ -1012,14 +1012,14 @@ def test_bar_align_zero_nans(self): assert result == expected def test_bar_bad_align_raises(self): - df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + df = DataFrame({"A": [-100, -60, -30, -20]}) msg = "`align` must be one of {'left', 'zero',' mid'}" with pytest.raises(ValueError, match=msg): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) def test_format_with_na_rep(self): # GH 21527 28358 - df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) ctx = df.style.format(None, na_rep="-")._translate() assert ctx["body"][0][1]["display_value"] == "-" @@ -1037,7 +1037,7 @@ def test_format_with_na_rep(self): def test_init_with_na_rep(self): # GH 21527 28358 - df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) ctx = Styler(df, na_rep="NA")._translate() assert ctx["body"][0][1]["display_value"] == "NA" @@ -1045,7 +1045,7 @@ def test_init_with_na_rep(self): def test_set_na_rep(self): # GH 21527 28358 - df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) ctx = df.style.set_na_rep("NA")._translate() assert ctx["body"][0][1]["display_value"] == "NA" @@ -1061,7 +1061,7 @@ def test_set_na_rep(self): def test_format_non_numeric_na(self): # GH 21527 28358 - df = pd.DataFrame( + df = DataFrame( { "object": [None, np.nan, "foo"], "datetime": [None, pd.NaT, pd.Timestamp("20120101")], @@ 
-1082,20 +1082,20 @@ def test_format_non_numeric_na(self): def test_format_with_bad_na_rep(self): # GH 21527 28358 - df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) msg = "Expected a string, got -1 instead" with pytest.raises(TypeError, match=msg): df.style.format(None, na_rep=-1) def test_highlight_null(self, null_color="red"): - df = pd.DataFrame({"A": [0, np.nan]}) + df = DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx expected = {(1, 0): ["background-color: red"]} assert result == expected def test_highlight_null_subset(self): # GH 31345 - df = pd.DataFrame({"A": [0, np.nan], "B": [0, np.nan]}) + df = DataFrame({"A": [0, np.nan], "B": [0, np.nan]}) result = ( df.style.highlight_null(null_color="red", subset=["A"]) .highlight_null(null_color="green", subset=["B"]) @@ -1109,7 +1109,7 @@ def test_highlight_null_subset(self): assert result == expected def test_nonunique_raises(self): - df = pd.DataFrame([[1, 2]], columns=["A", "A"]) + df = DataFrame([[1, 2]], columns=["A", "A"]) msg = "style is not supported for non-unique indices." with pytest.raises(ValueError, match=msg): df.style @@ -1139,7 +1139,7 @@ def test_uuid(self): def test_unique_id(self): # See https://github.com/pandas-dev/pandas/issues/16780 - df = pd.DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) + df = DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) result = df.style.render(uuid="test") assert "test" in result ids = re.findall('id="(.*?)"', result) @@ -1178,13 +1178,13 @@ def test_precision(self): def test_apply_none(self): def f(x): - return pd.DataFrame( + return DataFrame( np.where(x == x.max(), "color: red", ""), index=x.index, columns=x.columns, ) - result = pd.DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx + result = DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx assert result[(1, 1)] == ["color: red"] def test_trim(self): @@ -1195,7 +1195,7 @@ def test_trim(self): assert result.count("#") == len(self.df.columns) def test_highlight_max(self): - df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) # max(df) = min(-df) for max_ in [True, False]: if max_: @@ -1246,7 +1246,7 @@ def test_export(self): style2.render() def test_display_format(self): - df = pd.DataFrame(np.random.random(size=(2, 2))) + df = DataFrame(np.random.random(size=(2, 2))) ctx = df.style.format("{:0.1f}")._translate() assert all(["display_value" in c for c in row] for row in ctx["body"]) @@ -1256,7 +1256,7 @@ def test_display_format(self): assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3 def test_display_format_raises(self): - df = pd.DataFrame(np.random.randn(2, 2)) + df = DataFrame(np.random.randn(2, 2)) msg = "Expected a template string or callable, got 5 instead" with pytest.raises(TypeError, match=msg): df.style.format(5) @@ -1267,7 +1267,7 @@ def test_display_format_raises(self): def test_display_set_precision(self): # Issue #13257 - df = pd.DataFrame(data=[[1.0, 2.0090], [3.2121, 4.566]], columns=["a", "b"]) + df = DataFrame(data=[[1.0, 2.0090], [3.2121, 4.566]], columns=["a", "b"]) s = Styler(df) ctx = s.set_precision(1)._translate() @@ -1293,7 +1293,7 @@ def test_display_set_precision(self): assert ctx["body"][1][2]["display_value"] == "4.566" def test_display_subset(self): - df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + df = DataFrame([[0.1234, 0.1234], [1.1234, 
1.1234]], columns=["a", "b"]) ctx = df.style.format( {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=pd.IndexSlice[0, :] )._translate() @@ -1324,7 +1324,7 @@ def test_display_subset(self): assert ctx["body"][1][2]["display_value"] == raw_11 def test_display_dict(self): - df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + df = DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"})._translate() assert ctx["body"][0][1]["display_value"] == "0.1" assert ctx["body"][0][2]["display_value"] == "12.34%" @@ -1334,7 +1334,7 @@ def test_display_dict(self): assert ctx["body"][0][3]["display_value"] == "AAA" def test_bad_apply_shape(self): - df = pd.DataFrame([[1, 2], [3, 4]]) + df = DataFrame([[1, 2], [3, 4]]) msg = "returned the wrong shape" with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) @@ -1356,16 +1356,16 @@ def test_apply_bad_return(self): def f(x): return "" - df = pd.DataFrame([[1, 2], [3, 4]]) + df = DataFrame([[1, 2], [3, 4]]) msg = "must return a DataFrame when passed to `Styler.apply` with axis=None" with pytest.raises(TypeError, match=msg): df.style._apply(f, axis=None) def test_apply_bad_labels(self): def f(x): - return pd.DataFrame(index=[1, 2], columns=["a", "b"]) + return DataFrame(index=[1, 2], columns=["a", "b"]) - df = pd.DataFrame([[1, 2], [3, 4]]) + df = DataFrame([[1, 2], [3, 4]]) msg = "must have identical index and columns as the input" with pytest.raises(ValueError, match=msg): df.style._apply(f, axis=None) @@ -1400,7 +1400,7 @@ def test_get_level_lengths_un_sorted(self): tm.assert_dict_equal(result, expected) def test_mi_sparse(self): - df = pd.DataFrame( + df = DataFrame( {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) ) @@ -1467,7 +1467,7 @@ def test_mi_sparse(self): def test_mi_sparse_disabled(self): with pd.option_context("display.multi_sparse", False): - df = pd.DataFrame( + df = DataFrame( {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) ) result = df.style._translate() @@ -1476,7 +1476,7 @@ def test_mi_sparse_disabled(self): assert "attributes" not in row[0] def test_mi_sparse_index_names(self): - df = pd.DataFrame( + df = DataFrame( {"A": [1, 2]}, index=pd.MultiIndex.from_arrays( [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] @@ -1493,7 +1493,7 @@ def test_mi_sparse_index_names(self): assert head == expected def test_mi_sparse_column_names(self): - df = pd.DataFrame( + df = DataFrame( np.arange(16).reshape(4, 4), index=pd.MultiIndex.from_arrays( [["a", "a", "b", "a"], [0, 1, 1, 2]], @@ -1574,7 +1574,7 @@ def test_hide_single_index(self): def test_hide_multiindex(self): # GH 14194 - df = pd.DataFrame( + df = DataFrame( {"A": [1, 2]}, index=pd.MultiIndex.from_arrays( [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] @@ -1628,7 +1628,7 @@ def test_hide_columns_mult_levels(self): i2 = pd.MultiIndex.from_arrays( [["b", "b"], [0, 1]], names=["col_level_0", "col_level_1"] ) - df = pd.DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) + df = DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) ctx = df.style._translate() # column headers assert ctx["head"][0][2]["is_visible"] @@ -1685,7 +1685,7 @@ def f(a, b, styler): def test_no_cell_ids(self): # GH 35588 # GH 35663 - df = pd.DataFrame(data=[[0]]) + df = DataFrame(data=[[0]]) styler = Styler(df, uuid="_", cell_ids=False) styler.render() s = styler.render() # render twice to ensure ctx is not updated @@ -1714,14 +1714,14 
@@ def test_set_data_classes(self, classes): def test_colspan_w3(self): # GH 36223 - df = pd.DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]]) + df = DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]]) s = Styler(df, uuid="_", cell_ids=False) assert '' in s.render() @pytest.mark.parametrize("len_", [1, 5, 32, 33, 100]) def test_uuid_len(self, len_): # GH 36345 - df = pd.DataFrame(data=[["A"]]) + df = DataFrame(data=[["A"]]) s = Styler(df, uuid_len=len_, cell_ids=False).render() strt = s.find('id="T_') end = s[strt + 6 :].find('"') @@ -1733,7 +1733,7 @@ def test_uuid_len(self, len_): @pytest.mark.parametrize("len_", [-2, "bad", None]) def test_uuid_len_raises(self, len_): # GH 36345 - df = pd.DataFrame(data=[["A"]]) + df = DataFrame(data=[["A"]]) msg = "``uuid_len`` must be an integer in range \\[0, 32\\]." with pytest.raises(TypeError, match=msg): Styler(df, uuid_len=len_, cell_ids=False).render() @@ -1742,7 +1742,7 @@ def test_uuid_len_raises(self, len_): @td.skip_if_no_mpl class TestStylerMatplotlibDep: def test_background_gradient(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) for c_map in [None, "YlOrRd"]: result = df.style.background_gradient(cmap=c_map)._compute().ctx @@ -1776,13 +1776,13 @@ def test_background_gradient(self): ], ) def test_text_color_threshold(self, c_map, expected): - df = pd.DataFrame([1, 2], columns=["A"]) + df = DataFrame([1, 2], columns=["A"]) result = df.style.background_gradient(cmap=c_map)._compute().ctx assert result == expected @pytest.mark.parametrize("text_color_threshold", [1.1, "1", -1, [2, 2]]) def test_text_color_threshold_raises(self, text_color_threshold): - df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) msg = "`text_color_threshold` must be a value from 0 to 1." 
with pytest.raises(ValueError, match=msg): df.style.background_gradient( @@ -1791,7 +1791,7 @@ def test_text_color_threshold_raises(self, text_color_threshold): @td.skip_if_no_mpl def test_background_gradient_axis(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + df = DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) low = ["background-color: #f7fbff", "color: #000000"] high = ["background-color: #08306b", "color: #f1f1f1"] @@ -1816,7 +1816,7 @@ def test_background_gradient_axis(self): def test_background_gradient_vmin_vmax(self): # GH 12145 - df = pd.DataFrame(range(5)) + df = DataFrame(range(5)) ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx assert ctx[(0, 0)] == ctx[(1, 0)] assert ctx[(4, 0)] == ctx[(3, 0)] @@ -1873,5 +1873,5 @@ def test_from_custom_template(tmpdir): assert issubclass(result, Styler) assert result.env is not Styler.env assert result.template is not Styler.template - styler = result(pd.DataFrame({"A": [1, 2]})) + styler = result(DataFrame({"A": [1, 2]})) assert styler.render() diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index e2ceb95d77053..3584ec047d4d2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -150,7 +150,7 @@ def test_to_csv_decimal(self): ) # see gh-11553: testing if decimal is taken into account for '0.0' - df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) + df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) @@ -165,7 +165,7 @@ def test_to_csv_decimal(self): def test_to_csv_float_format(self): # testing if float_format is taken into account for the index # GH 11553 - df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) + df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) @@ -334,7 +334,7 @@ def test_to_csv_single_level_multi_index(self, ind, expected, klass): def test_to_csv_string_array_ascii(self): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] - df = pd.DataFrame(str_array) + df = DataFrame(str_array) expected_ascii = """\ ,names 0,"['foo', 'bar']" @@ -348,7 +348,7 @@ def test_to_csv_string_array_ascii(self): def test_to_csv_string_array_utf8(self): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] - df = pd.DataFrame(str_array) + df = DataFrame(str_array) expected_utf8 = """\ ,names 0,"['foo', 'bar']" @@ -362,7 +362,7 @@ def test_to_csv_string_array_utf8(self): def test_to_csv_string_with_lf(self): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} - df = pd.DataFrame(data) + df = DataFrame(data) with tm.ensure_clean("lf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") @@ -396,7 +396,7 @@ def test_to_csv_string_with_lf(self): def test_to_csv_string_with_crlf(self): # GH 20353 data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} - df = pd.DataFrame(data) + df = DataFrame(data) with tm.ensure_clean("crlf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") @@ -434,9 +434,7 @@ def test_to_csv_string_with_crlf(self): def test_to_csv_stdout_file(self, capsys): # GH 21561 - df = pd.DataFrame( - [["foo", "bar"], ["baz", "qux"]], columns=["name_1", 
"name_2"] - ) + df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) @@ -456,7 +454,7 @@ def test_to_csv_stdout_file(self, capsys): ) def test_to_csv_write_to_open_file(self): # GH 21696 - df = pd.DataFrame({"a": ["x", "y", "z"]}) + df = DataFrame({"a": ["x", "y", "z"]}) expected = """\ manual header x @@ -473,7 +471,7 @@ def test_to_csv_write_to_open_file(self): def test_to_csv_write_to_open_file_with_newline_py3(self): # see gh-21696 # see gh-20353 - df = pd.DataFrame({"a": ["x", "y", "z"]}) + df = DataFrame({"a": ["x", "y", "z"]}) expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) with tm.ensure_clean("test.txt") as path: @@ -557,7 +555,7 @@ def test_to_csv_zip_arguments(self, compression, archive_name): @pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type): # see gh-25099 - df = pd.DataFrame({"c": [float("nan")] * 3}) + df = DataFrame({"c": [float("nan")] * 3}) df = df.astype(df_new_type) expected_rows = ["c", "mynull", "mynull", "mynull"] expected = tm.convert_rows_list_to_csv_str(expected_rows) @@ -635,7 +633,7 @@ def test_to_csv_encoding_binary_handle(self): # example from GH 13068 with tm.ensure_clean() as path: with open(path, "w+b") as handle: - pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig") + DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig") handle.seek(0) assert handle.read().startswith(b'\xef\xbb\xbf""') diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 7acdbfd462874..f4f963d268aeb 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -787,13 +787,13 @@ def test_html_repr_min_rows_default(datapath): # gh-27991 # default setting no truncation even if above min_rows - df = pd.DataFrame({"a": range(20)}) + df = DataFrame({"a": range(20)}) result = df._repr_html_() expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") assert result == expected # default of max_rows 60 triggers truncation if above - df = pd.DataFrame({"a": range(61)}) + df = DataFrame({"a": range(61)}) result = df._repr_html_() expected = expected_html(datapath, "html_repr_min_rows_default_truncated") assert result == expected @@ -815,8 +815,51 @@ def test_html_repr_min_rows_default(datapath): def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): # gh-27991 - df = pd.DataFrame({"a": range(61)}) + df = DataFrame({"a": range(61)}) expected = expected_html(datapath, expected) with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): result = df._repr_html_() assert result == expected + + +def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + ymd.columns.name = "foo" + ymd.to_html() + ymd.T.to_html() + + +@pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) +def test_to_html_na_rep_and_float_format(na_rep): + # https://github.com/pandas-dev/pandas/issues/13828 + df = DataFrame( + [ + ["A", 1.2225], + ["A", None], + ], + columns=["Group", "Data"], + ) + result = df.to_html(na_rep=na_rep, float_format="{:.2f}".format) + expected = f"""
<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>Group</th>
+      <th>Data</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>A</td>
+      <td>1.22</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>A</td>
+      <td>{na_rep}</td>
+    </tr>
+  </tbody>
+</table>
          """ + assert result == expected diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index d3d865158309c..7cf7ed3f77609 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -157,7 +157,7 @@ def test_to_latex_series(self): def test_to_latex_midrule_location(self): # GH 18326 - df = pd.DataFrame({"a": [1, 2]}) + df = DataFrame({"a": [1, 2]}) df.index.name = "foo" result = df.to_latex(index_names=False) expected = _dedent( @@ -373,7 +373,7 @@ def test_to_latex_decimal(self): class TestToLatexBold: def test_to_latex_bold_rows(self): # GH 16707 - df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) result = df.to_latex(bold_rows=True) expected = _dedent( r""" @@ -391,7 +391,7 @@ def test_to_latex_bold_rows(self): def test_to_latex_no_bold_rows(self): # GH 16707 - df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) result = df.to_latex(bold_rows=False) expected = _dedent( r""" @@ -414,6 +414,11 @@ def caption_table(self): """Caption for table/tabular LaTeX environment.""" return "a table in a \\texttt{table/tabular} environment" + @pytest.fixture + def short_caption(self): + """Short caption for testing \\caption[short_caption]{full_caption}.""" + return "a table" + @pytest.fixture def label_table(self): """Label for table/tabular LaTeX environment.""" @@ -493,6 +498,107 @@ def test_to_latex_caption_and_label(self, df_short, caption_table, label_table): ) assert result == expected + def test_to_latex_caption_and_shortcaption( + self, + df_short, + caption_table, + short_caption, + ): + result = df_short.to_latex(caption=(caption_table, short_caption)) + expected = _dedent( + r""" + \begin{table} + \centering + \caption[a table]{a table in a \texttt{table/tabular} environment} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected + + def test_to_latex_caption_and_shortcaption_list_is_ok(self, df_short): + caption = ("Long-long-caption", "Short") + result_tuple = df_short.to_latex(caption=caption) + result_list = df_short.to_latex(caption=list(caption)) + assert result_tuple == result_list + + def test_to_latex_caption_shortcaption_and_label( + self, + df_short, + caption_table, + short_caption, + label_table, + ): + # test when the short_caption is provided alongside caption and label + result = df_short.to_latex( + caption=(caption_table, short_caption), + label=label_table, + ) + expected = _dedent( + r""" + \begin{table} + \centering + \caption[a table]{a table in a \texttt{table/tabular} environment} + \label{tab:table_tabular} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected + + @pytest.mark.parametrize( + "bad_caption", + [ + ("full_caption", "short_caption", "extra_string"), + ("full_caption", "short_caption", 1), + ("full_caption", "short_caption", None), + ("full_caption",), + (None,), + ], + ) + def test_to_latex_bad_caption_raises(self, bad_caption): + # test that wrong number of params is raised + df = DataFrame({"a": [1]}) + msg = "caption must be either a string or a tuple of two strings" + with pytest.raises(ValueError, match=msg): + df.to_latex(caption=bad_caption) + + def test_to_latex_two_chars_caption(self, df_short): + # test that 
two chars caption is handled correctly + # it must not be unpacked into long_caption, short_caption. + result = df_short.to_latex(caption="xy") + expected = _dedent( + r""" + \begin{table} + \centering + \caption{xy} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected + def test_to_latex_longtable_caption_only(self, df_short, caption_longtable): # GH 25436 # test when no caption and no label is provided @@ -595,6 +701,47 @@ def test_to_latex_longtable_caption_and_label( ) assert result == expected + def test_to_latex_longtable_caption_shortcaption_and_label( + self, + df_short, + caption_longtable, + short_caption, + label_longtable, + ): + # test when the caption, the short_caption and the label are provided + result = df_short.to_latex( + longtable=True, + caption=(caption_longtable, short_caption), + label=label_longtable, + ) + expected = _dedent( + r""" + \begin{longtable}{lrl} + \caption[a table]{a table in a \texttt{longtable} environment} + \label{tab:longtable}\\ + \toprule + {} & a & b \\ + \midrule + \endfirsthead + \caption[]{a table in a \texttt{longtable} environment} \\ + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected + class TestToLatexEscape: @pytest.fixture @@ -832,18 +979,18 @@ def multiindex_frame(self): """Multiindex dataframe for testing multirow LaTeX macros.""" yield DataFrame.from_dict( { - ("c1", 0): pd.Series({x: x for x in range(4)}), - ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c2", 0): pd.Series({x: x for x in range(4)}), - ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c3", 0): pd.Series({x: x for x in range(4)}), + ("c1", 0): Series({x: x for x in range(4)}), + ("c1", 1): Series({x: x + 4 for x in range(4)}), + ("c2", 0): Series({x: x for x in range(4)}), + ("c2", 1): Series({x: x + 4 for x in range(4)}), + ("c3", 0): Series({x: x for x in range(4)}), } ).T @pytest.fixture def multicolumn_frame(self): """Multicolumn dataframe for testing multicolumn LaTeX macros.""" - yield pd.DataFrame( + yield DataFrame( { ("c1", 0): {x: x for x in range(5)}, ("c1", 1): {x: x + 5 for x in range(5)}, @@ -855,7 +1002,7 @@ def multicolumn_frame(self): def test_to_latex_multindex_header(self): # GH 16718 - df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}) + df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}) df = df.set_index(["a", "b"]) observed = df.to_latex(header=["r1", "r2"]) expected = _dedent( @@ -875,7 +1022,7 @@ def test_to_latex_multindex_header(self): def test_to_latex_multiindex_empty_name(self): # GH 18669 mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) - df = pd.DataFrame(-1, index=mi, columns=range(4)) + df = DataFrame(-1, index=mi, columns=range(4)) observed = df.to_latex() expected = _dedent( r""" @@ -968,7 +1115,7 @@ def test_to_latex_multicolumn_tabular(self, multiindex_frame): def test_to_latex_index_has_name_tabular(self): # GH 10660 - df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + df = DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) result = df.set_index(["a", "b"]).to_latex() expected = _dedent( r""" @@ -989,7 +1136,7 @@ def test_to_latex_index_has_name_tabular(self): def test_to_latex_groupby_tabular(self): # GH 10660 - df = 
pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + df = DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) result = df.groupby("a").describe().to_latex() expected = _dedent( r""" @@ -1015,7 +1162,7 @@ def test_to_latex_multiindex_dupe_level(self): # ONLY happen if all higher order indices (to the left) are # equal too. In this test, 'c' has to be printed both times # because the higher order index 'A' != 'B'. - df = pd.DataFrame( + df = DataFrame( index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] ) result = df.to_latex() @@ -1128,7 +1275,7 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): # GH 18667 names = [name0, name1] mi = pd.MultiIndex.from_product([[1, 2], [3, 4]]) - df = pd.DataFrame(-1, index=mi.copy(), columns=mi.copy()) + df = DataFrame(-1, index=mi.copy(), columns=mi.copy()) for idx in axes: df.axes[idx].names = names @@ -1160,7 +1307,7 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): @pytest.mark.parametrize("one_row", [True, False]) def test_to_latex_multiindex_nans(self, one_row): # GH 14249 - df = pd.DataFrame({"a": [None, 1], "b": [2, 3], "c": [4, 5]}) + df = DataFrame({"a": [None, 1], "b": [2, 3], "c": [4, 5]}) if one_row: df = df.iloc[[0]] observed = df.set_index(["a", "b"]).to_latex() @@ -1184,7 +1331,7 @@ def test_to_latex_multiindex_nans(self, one_row): def test_to_latex_non_string_index(self): # GH 19981 - df = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]) + df = DataFrame([[1, 2, 3]] * 2).set_index([0, 1]) result = df.to_latex() expected = _dedent( r""" @@ -1284,3 +1431,24 @@ def test_get_strrow_multindex_multicolumn(self, row_num, expected): ) assert row_string_converter.get_strrow(row_num=row_num) == expected + + @pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) + def test_to_latex_na_rep_and_float_format(self, na_rep): + df = DataFrame( + [ + ["A", 1.2225], + ["A", None], + ], + columns=["Group", "Data"], + ) + result = df.to_latex(na_rep=na_rep, float_format="{:.2f}".format) + expected = f"""\\begin{{tabular}}{{llr}} +\\toprule +{{}} & Group & Data \\\\ +\\midrule +0 & A & 1.22 \\\\ +1 & A & {na_rep} \\\\ +\\bottomrule +\\end{{tabular}} +""" + assert result == expected diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py new file mode 100644 index 0000000000000..a1ed1ba2e8bf5 --- /dev/null +++ b/pandas/tests/io/formats/test_to_string.py @@ -0,0 +1,233 @@ +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, Series, option_context, to_datetime + + +def test_repr_embedded_ndarray(): + arr = np.empty(10, dtype=[("err", object)]) + for i in range(len(arr)): + arr["err"][i] = np.random.randn(i) + + df = DataFrame(arr) + repr(df["err"]) + repr(df) + df.to_string() + + +def test_repr_tuples(): + buf = StringIO() + + df = DataFrame({"tups": list(zip(range(10), range(10)))}) + repr(df) + df.to_string(col_space=10, buf=buf) + + +def test_to_string_truncate(): + # GH 9784 - dont truncate when calling DataFrame.to_string + df = DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "let's make this a very VERY long line that is longer " + "than the default 50 character limit", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " 
stuff 1" + ) + with option_context("max_colwidth", 20): + # the display option has no effect on the to_string method + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + assert df.to_string(max_colwidth=20) == ( + " a b c d\n" + "0 foo bar let's make this ... 1\n" + "1 foo bar stuff 1" + ) + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ("a", "a"), + (["a", "b"], "a\nb"), + ([1, "a"], "1\na"), + (1, "1"), + ([0, -1], " 0\n-1"), + (1.0, "1.0"), + ([" a", " b"], " a\n b"), + ([".1", "1"], ".1\n 1"), + (["10", "-10"], " 10\n-10"), + ], +) +def test_format_remove_leading_space_series(input_array, expected): + # GH: 24980 + s = Series(input_array).to_string(index=False) + assert s == expected + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ({"A": ["a"]}, "A\na"), + ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), + ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), + ], +) +def test_format_remove_leading_space_dataframe(input_array, expected): + # GH: 24980 + df = DataFrame(input_array).to_string(index=False) + assert df == expected + + +def test_to_string_unicode_columns(float_frame): + df = DataFrame({"\u03c3": np.arange(10.0)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + + result = float_frame.to_string() + assert isinstance(result, str) + + +def test_to_string_utf8_columns(): + n = "\u05d0".encode() + + with option_context("display.max_rows", 1): + df = DataFrame([1, 2], columns=[n]) + repr(df) + + +def test_to_string_unicode_two(): + dm = DataFrame({"c/\u03c3": []}) + buf = StringIO() + dm.to_string(buf) + + +def test_to_string_unicode_three(): + dm = DataFrame(["\xc2"]) + buf = StringIO() + dm.to_string(buf) + + +def test_to_string_with_formatters(): + df = DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], + }, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), + ] + result = df.to_string(formatters=dict(formatters)) + result2 = df.to_string(formatters=list(zip(*formatters))[1]) + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" + ) + assert result == result2 + + +def test_to_string_with_datetime64_monthformatter(): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({"months": months}) + + def format_func(x): + return x.strftime("%Y-%m") + + result = x.to_string(formatters={"months": format_func}) + expected = "months\n0 2016-01\n1 2016-02" + assert result.strip() == expected + + +def test_to_string_with_datetime64_hourformatter(): + + x = DataFrame( + {"hod": to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")} + ) + + def format_func(x): + return x.strftime("%H:%M") + + result = x.to_string(formatters={"hod": format_func}) + expected = "hod\n0 10:10\n1 12:12" + assert result.strip() == expected + + +def test_to_string_with_formatters_unicode(): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": str}) + assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" + + +def test_to_string_complex_number_trims_zeros(): + s = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) + result = s.to_string() + expected = "0 1.00+1.00j\n1 
1.00+1.00j\n2 1.05+1.00j" + assert result == expected + + +def test_nullable_float_to_string(float_ea_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = float_ea_dtype + s = Series([0.0, 1.0, None], dtype=dtype) + result = s.to_string() + expected = """0 0.0 +1 1.0 +2 """ + assert result == expected + + +def test_nullable_int_to_string(any_nullable_int_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = any_nullable_int_dtype + s = Series([0, 1, None], dtype=dtype) + result = s.to_string() + expected = """0 0 +1 1 +2 """ + assert result == expected + + +@pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) +def test_to_string_na_rep_and_float_format(na_rep): + # GH 13828 + df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"]) + result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format) + expected = f""" Group Data +0 A 1.22 +1 A {na_rep}""" + assert result == expected diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 8f1ed193b100f..71698a02285f9 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -240,7 +240,7 @@ def test_build_series(self): def test_read_json_from_to_json_results(self): # GH32383 - df = pd.DataFrame( + df = DataFrame( { "_id": {"row_0": 0}, "category": {"row_0": "Goods"}, @@ -616,13 +616,13 @@ def test_set_names_unset(self, idx, nm, prop): ) def test_warns_non_roundtrippable_names(self, idx): # GH 19130 - df = pd.DataFrame(index=idx) + df = DataFrame(index=idx) df.index.name = "index" with tm.assert_produces_warning(): set_default_names(df) def test_timestamp_in_columns(self): - df = pd.DataFrame( + df = DataFrame( [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")] ) result = df.to_json(orient="table") @@ -634,8 +634,8 @@ def test_timestamp_in_columns(self): "case", [ pd.Series([1], index=pd.Index([1], name="a"), name="a"), - pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")), - pd.DataFrame( + DataFrame({"A": [1]}, index=pd.Index([1], name="A")), + DataFrame( {"A": [1]}, index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]), ), @@ -647,7 +647,7 @@ def test_overlapping_names(self, case): def test_mi_falsey_name(self): # GH 16203 - df = pd.DataFrame( + df = DataFrame( np.random.randn(4, 4), index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]), ) @@ -730,7 +730,7 @@ def test_comprehensive(self): ) def test_multiindex(self, index_names): # GH 18912 - df = pd.DataFrame( + df = DataFrame( [["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]], index=[["A", "B"], ["Null", "Eins"]], columns=["Aussprache", "Griechisch", "Args"], @@ -742,7 +742,7 @@ def test_multiindex(self, index_names): def test_empty_frame_roundtrip(self): # GH 21287 - df = pd.DataFrame(columns=["a", "b", "c"]) + df = DataFrame(columns=["a", "b", "c"]) expected = df.copy() out = df.to_json(orient="table") result = pd.read_json(out, orient="table") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e12424888f4af..92cc0f969ec87 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -106,7 +106,7 @@ def test_frame_non_unique_columns(self, orient, data): df.to_json(orient=orient), orient=orient, convert_dates=["x"] ) if orient == "values": - expected = pd.DataFrame(data) + expected = DataFrame(data) if expected.iloc[:, 0].dtype == "datetime64[ns]": # orient == "values" by 
default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, @@ -373,7 +373,7 @@ def test_frame_infinity(self, orient, inf, dtype): ], ) def test_frame_to_json_float_precision(self, value, precision, expected_val): - df = pd.DataFrame([dict(a_float=value)]) + df = DataFrame([dict(a_float=value)]) encoded = df.to_json(double_precision=precision) assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}' @@ -390,7 +390,7 @@ def test_frame_empty(self): read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False ) # GH 7445 - result = pd.DataFrame({"test": []}, index=[]).to_json(orient="columns") + result = DataFrame({"test": []}, index=[]).to_json(orient="columns") expected = '{"test":{}}' assert result == expected @@ -599,7 +599,7 @@ def __str__(self) -> str: def test_label_overflow(self): # GH14256: buffer length not checked when writing label - result = pd.DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json() + result = DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json() expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}' assert result == expected @@ -796,7 +796,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): if as_object: data.append("a") - ser = pd.Series(data, index=data) + ser = Series(data, index=data) result = ser.to_json(date_format=date_format) if date_format == "epoch": @@ -1042,7 +1042,7 @@ def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): if as_object: data.append("a") - ser = pd.Series(data, index=data) + ser = Series(data, index=data) if date_format == "iso": expected = ( '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' @@ -1143,14 +1143,14 @@ def test_datetime_tz(self): def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks - df = pd.DataFrame(np.random.randn(10, 4)) + df = DataFrame(np.random.randn(10, 4)) df.loc[:8] = np.nan sdf = df.astype("Sparse") expected = df.to_json() assert expected == sdf.to_json() - s = pd.Series(np.random.randn(10)) + s = Series(np.random.randn(10)) s.loc[:8] = np.nan ss = s.astype("Sparse") @@ -1366,7 +1366,7 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): def test_from_json_to_json_table_dtypes(self): # GH21345 - expected = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) + expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = expected.to_json(orient="table") result = pd.read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) @@ -1374,7 +1374,7 @@ def test_from_json_to_json_table_dtypes(self): @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 - df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) + df = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = df.to_json(orient="table") msg = "cannot pass both dtype and orient='table'" with pytest.raises(ValueError, match=msg): @@ -1459,7 +1459,7 @@ def test_index_false_error_to_json(self, orient): # GH 17394 # Testing error message from to_json with index=False - df = pd.DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) + df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) msg = "'index=False' is only valid when 'orient' is 'split' or 'table'" with pytest.raises(ValueError, match=msg): @@ -1487,7 +1487,7 @@ def test_read_timezone_information(self): "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) def 
test_timedelta_as_label(self, date_format, key): - df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + df = DataFrame([[1]], columns=[pd.Timedelta("1D")]) expected = f'{{"{key}":{{"0":1}}}}' result = df.to_json(date_format=date_format) @@ -1506,14 +1506,14 @@ def test_timedelta_as_label(self, date_format, key): ) def test_tuple_labels(self, orient, expected): # GH 20500 - df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + df = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) result = df.to_json(orient=orient) assert result == expected @pytest.mark.parametrize("indent", [1, 2, 4]) def test_to_json_indent(self, indent): # GH 12004 - df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) + df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) result = df.to_json(indent=indent) spaces = " " * indent @@ -1649,19 +1649,19 @@ def test_to_json_indent(self, indent): ) def test_json_indent_all_orients(self, orient, expected): # GH 12004 - df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) + df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) result = df.to_json(orient=orient, indent=4) assert result == expected def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): - pd.DataFrame().to_json(indent=-1) + DataFrame().to_json(indent=-1) def test_emca_262_nan_inf_support(self): # GH 12213 data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' result = pd.read_json(data) - expected = pd.DataFrame( + expected = DataFrame( ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] ) tm.assert_frame_equal(result, expected) @@ -1684,7 +1684,7 @@ def test_frame_int_overflow(self): "dataframe,expected", [ ( - pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}), + DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}), '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,' '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}', ) @@ -1719,16 +1719,16 @@ def test_to_s3(self, s3_resource, s3so): def test_json_pandas_na(self): # GH 31615 - result = pd.DataFrame([[pd.NA]]).to_json() + result = DataFrame([[pd.NA]]).to_json() assert result == '{"0":{"0":null}}' def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - result = pd.DataFrame([[nulls_fixture]]).to_json() + result = DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' def test_readjson_bool_series(self): # GH31464 result = read_json("[true, true, false]", typ="series") - expected = pd.Series([True, True, False]) + expected = Series([True, True, False]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index a6ffa7e97d375..933bdc462e3f8 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -12,7 +12,7 @@ @pytest.fixture def lines_json_df(): - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) return df.to_json(lines=True, orient="records") @@ -112,7 +112,7 @@ def test_readjson_each_chunk(lines_json_df): def test_readjson_chunks_from_file(): with tm.ensure_clean("test.json") as path: - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) unchunked = pd.read_json(path, lines=True) @@ -122,7 +122,7 @@ def 
test_readjson_chunks_from_file(): @pytest.mark.parametrize("chunksize", [None, 1]) def test_readjson_chunks_closes(chunksize): with tm.ensure_clean("test.json") as path: - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") reader = JsonReader( path, @@ -173,7 +173,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ - orig = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: test = pd.concat(test) @@ -187,7 +187,7 @@ def test_readjson_unicode(monkeypatch): f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') result = read_json(path) - expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) tm.assert_frame_equal(result, expected) @@ -200,7 +200,7 @@ def test_readjson_nrows(nrows): {"a": 5, "b": 6} {"a": 7, "b": 8}""" result = pd.read_json(jsonl, lines=True, nrows=nrows) - expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(result, expected) @@ -214,7 +214,7 @@ def test_readjson_nrows_chunks(nrows, chunksize): {"a": 7, "b": 8}""" reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) chunked = pd.concat(reader) - expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(chunked, expected) @@ -234,9 +234,9 @@ def test_readjson_lines_chunks_fileurl(datapath): # GH 27135 # Test reading line-format JSON from file url df_list_expected = [ - pd.DataFrame([[1, 2]], columns=["a", "b"], index=[0]), - pd.DataFrame([[3, 4]], columns=["a", "b"], index=[1]), - pd.DataFrame([[5, 6]], columns=["a", "b"], index=[2]), + DataFrame([[1, 2]], columns=["a", "b"], index=[0]), + DataFrame([[3, 4]], columns=["a", "b"], index=[1]), + DataFrame([[5, 6]], columns=["a", "b"], index=[2]), ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index ae63b6af3a8b6..eee111dd4579c 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -13,6 +13,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import ParserError import pandas.util._test_decorators as td @@ -717,7 +718,10 @@ def test_float_precision_options(c_parser_only): df3 = parser.read_csv(StringIO(s), float_precision="legacy") - assert not df.iloc[0, 0] == df3.iloc[0, 0] + if IS64: + assert not df.iloc[0, 0] == df3.iloc[0, 0] + else: + assert df.iloc[0, 0] == df3.iloc[0, 0] msg = "Unrecognized float_precision option: junk" diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index edf1d780ef107..b33289213e258 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -719,7 +719,7 @@ def test_iterator_stop_on_chunksize(all_parsers): "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] ) def test_iterator_skipfooter_errors(all_parsers, kwargs): - msg = "'skipfooter' not supported for 'iteration'" + msg = "'skipfooter' not supported for iteration" parser = all_parsers 
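
For reference, the chunked line-delimited JSON reading exercised by the test_readjson_* tests above can be sketched as follows (illustrative only, not part of the patch; the sample data is made up):

```python
import pandas as pd

# Three records in line-delimited (JSON Lines) form.
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}"""

# nrows caps how many lines are parsed; chunksize returns a JsonReader
# that yields DataFrames of at most `chunksize` rows each.
reader = pd.read_json(jsonl, lines=True, chunksize=1, nrows=2)
result = pd.concat(reader)
assert result["a"].tolist() == [1, 3]
```
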
data = "a\n1\n2" diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 6ac310e3b2227..861aeba60cab7 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -577,7 +577,7 @@ def test_boolean_dtype(all_parsers): ) result = parser.read_csv(StringIO(data), dtype="boolean") - expected = pd.DataFrame( + expected = DataFrame( { "a": pd.array( [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index d45317aaa3458..4796cf0b79fae 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -509,7 +509,7 @@ def test_dtype(dtype): colspecs = [(0, 5), (5, 10), (10, None)] result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) - expected = pd.DataFrame( + expected = DataFrame( {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] ) @@ -625,7 +625,7 @@ def test_binary_mode(): """ data = """aas aas aas bba bab b a""" - df_reference = pd.DataFrame( + df_reference = DataFrame( [["bba", "bab", "b a"]], columns=["aas", "aas.1", "aas.2"], index=[0] ) with tm.ensure_clean() as path: @@ -653,5 +653,5 @@ def test_encoding_mmap(memory_map): memory_map=memory_map, ) data.seek(0) - df_reference = pd.DataFrame([[1, "A", "Ä", 2]]) + df_reference = DataFrame([[1, "A", "Ä", 2]]) tm.assert_frame_equal(df, df_reference) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 42614f1eee8af..2437ba77532fa 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -64,7 +64,7 @@ @pytest.mark.single class TestHDFStore: def test_format_type(self, setup_path): - df = pd.DataFrame({"A": [1, 2]}) + df = DataFrame({"A": [1, 2]}) with ensure_clean_path(setup_path) as path: with HDFStore(path) as store: store.put("a", df, format="fixed") @@ -300,7 +300,7 @@ def checksum(filename, hash_factory=hashlib.md5, chunk_num_blocks=128): def create_h5_and_return_checksum(track_times): with ensure_clean_path(setup_path) as path: - df = pd.DataFrame({"a": [1]}) + df = DataFrame({"a": [1]}) with pd.HDFStore(path, mode="w") as hdf: hdf.put( @@ -657,11 +657,11 @@ def test_get(self, setup_path): def test_walk(self, where, expected, setup_path): # GH10143 objs = { - "df1": pd.DataFrame([1, 2, 3]), - "df2": pd.DataFrame([4, 5, 6]), - "df3": pd.DataFrame([6, 7, 8]), - "df4": pd.DataFrame([9, 10, 11]), - "s1": pd.Series([10, 9, 8]), + "df1": DataFrame([1, 2, 3]), + "df2": DataFrame([4, 5, 6]), + "df3": DataFrame([6, 7, 8]), + "df4": DataFrame([9, 10, 11]), + "s1": Series([10, 9, 8]), # Next 3 items aren't pandas objects and should be ignored "a1": np.array([[1, 2, 3], [4, 5, 6]]), "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), @@ -1113,7 +1113,7 @@ def test_latin_encoding(self, setup_path, dtype, val): key = "data" val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] - ser = pd.Series(val, dtype=dtype) + ser = Series(val, dtype=dtype) with ensure_clean_path(setup_path) as store: ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) @@ -1267,7 +1267,7 @@ def test_append_all_nans(self, setup_path): def test_read_missing_key_close_store(self, setup_path): # GH 25766 with ensure_clean_path(setup_path) as path: - df = pd.DataFrame({"a": range(2), "b": range(2)}) + df = DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") with pytest.raises(KeyError, match="'No object named k2 in the file'"): @@ -1280,7 +1280,7 @@ def 
test_read_missing_key_close_store(self, setup_path): def test_read_missing_key_opened_store(self, setup_path): # GH 28699 with ensure_clean_path(setup_path) as path: - df = pd.DataFrame({"a": range(2), "b": range(2)}) + df = DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") with pd.HDFStore(path, "r") as store: @@ -1509,7 +1509,7 @@ def test_to_hdf_with_min_itemsize(self, setup_path): def test_to_hdf_errors(self, format, setup_path): data = ["\ud800foo"] - ser = pd.Series(data, index=pd.Index(data)) + ser = Series(data, index=Index(data)) with ensure_clean_path(setup_path) as path: # GH 20835 ser.to_hdf(path, "table", format=format, errors="surrogatepass") @@ -1921,7 +1921,7 @@ def test_mi_data_columns(self, setup_path): idx = pd.MultiIndex.from_arrays( [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] ) - df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + df = DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=True) @@ -2533,15 +2533,15 @@ def test_store_index_name(self, setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(self, table_format, setup_path): # GH #13492 - idx = pd.Index( + idx = Index( pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), name="cols\u05d2", ) - idx1 = pd.Index( + idx1 = Index( pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), name="rows\u05d0", ) - df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. with ensure_clean_path(setup_path) as path: @@ -3683,7 +3683,7 @@ def test_append_to_multiple_dropna_false(self, setup_path): def test_append_to_multiple_min_itemsize(self, setup_path): # GH 11238 - df = pd.DataFrame( + df = DataFrame( { "IX": np.arange(1, 21), "Num": np.arange(1, 21), @@ -4136,10 +4136,10 @@ def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" ) as store: result = store.select("df") - expected = pd.DataFrame( + expected = DataFrame( [[1, 2, 3, "D"]], columns=["A", "B", "C", "D"], - index=pd.Index(["ABC"], name="INDEX_NAME"), + index=Index(["ABC"], name="INDEX_NAME"), ) tm.assert_frame_equal(expected, result) @@ -4151,10 +4151,10 @@ def test_legacy_table_fixed_format_read_datetime_py2(self, datapath, setup_path) mode="r", ) as store: result = store.select("df") - expected = pd.DataFrame( + expected = DataFrame( [[pd.Timestamp("2020-02-06T18:00")]], columns=["A"], - index=pd.Index(["date"]), + index=Index(["date"]), ) tm.assert_frame_equal(expected, result) @@ -4166,7 +4166,7 @@ def test_legacy_table_read_py2(self, datapath, setup_path): ) as store: result = store.select("table") - expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) + expected = DataFrame({"a": ["a", "b"], "b": [2, 3]}) tm.assert_frame_equal(expected, result) def test_copy(self, setup_path): @@ -4286,13 +4286,13 @@ def test_unicode_index(self, setup_path): def test_unicode_longer_encoded(self, setup_path): # GH 11234 char = "\u0394" - df = pd.DataFrame({"A": [char]}) + df = DataFrame({"A": [char]}) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table", encoding="utf-8") result = store.get("df") tm.assert_frame_equal(result, df) - df = pd.DataFrame({"A": ["a", char], "B": 
["b", "b"]}) + df = DataFrame({"A": ["a", char], "B": ["b", "b"]}) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table", encoding="utf-8") result = store.get("df") @@ -4497,12 +4497,12 @@ def test_categorical_nan_only_columns(self, setup_path): # GH18413 # Check that read_hdf with categorical columns with NaN-only values can # be read back. - df = pd.DataFrame( + df = DataFrame( { "a": ["a", "b", "c", np.nan], "b": [np.nan, np.nan, np.nan, np.nan], "c": [1, 2, 3, 4], - "d": pd.Series([None] * 4, dtype=object), + "d": Series([None] * 4, dtype=object), } ) df["a"] = df.a.astype("category") @@ -4734,7 +4734,7 @@ def test_read_from_py_localpath(self, setup_path): def test_query_long_float_literal(self, setup_path): # GH 14241 - df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) with ensure_clean_store(setup_path) as store: store.append("test", df, format="table", data_columns=True) @@ -4755,7 +4755,7 @@ def test_query_long_float_literal(self, setup_path): def test_query_compare_column_type(self, setup_path): # GH 15492 - df = pd.DataFrame( + df = DataFrame( { "date": ["2014-01-01", "2014-01-02"], "real_date": date_range("2014-01-01", periods=2), @@ -4824,11 +4824,11 @@ def test_read_py2_hdf_file_in_py3(self, datapath): # the file was generated in Python 2.7 like so: # - # df = pd.DataFrame([1.,2,3], index=pd.PeriodIndex( + # df = DataFrame([1.,2,3], index=pd.PeriodIndex( # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - expected = pd.DataFrame( + expected = DataFrame( [1.0, 2, 3], index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), ) @@ -4850,7 +4850,7 @@ def test_select_empty_where(self, where): # while reading from HDF store raises # "SyntaxError: only a single expression is allowed" - df = pd.DataFrame([1, 2, 3]) + df = DataFrame([1, 2, 3]) with ensure_clean_path("empty_where.h5") as path: with pd.HDFStore(path) as store: store.put("df", df, "t") @@ -4867,7 +4867,7 @@ def test_select_empty_where(self, where): def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): # GH 7775 mi = MultiIndex.from_arrays([idx, idx]) - df = pd.DataFrame(0, index=mi, columns=["a"]) + df = DataFrame(0, index=mi, columns=["a"]) with ensure_clean_path(setup_path) as path: with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): df.to_hdf(path, "df") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index dc28e9543f19e..e137bc2dca48e 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -213,7 +213,7 @@ def test_append_with_timezones_pytz(setup_path): def test_roundtrip_tz_aware_index(setup_path): # GH 17618 time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern") - df = pd.DataFrame(data=[0], index=[time]) + df = DataFrame(data=[0], index=[time]) with ensure_clean_store(setup_path) as store: store.put("frame", df, format="fixed") @@ -224,7 +224,7 @@ def test_roundtrip_tz_aware_index(setup_path): def test_store_index_name_with_tz(setup_path): # GH 13884 - df = pd.DataFrame({"A": [1, 2]}) + df = DataFrame({"A": [1, 2]}) df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) df.index = df.index.tz_localize("UTC") df.index.name = "foo" @@ -268,7 +268,7 @@ def test_tseries_select_index_column(setup_path): assert rng.tz == result.dt.tz -def 
test_timezones_fixed(setup_path): +def test_timezones_fixed_format_frame_non_empty(setup_path): with ensure_clean_store(setup_path) as store: # index @@ -296,6 +296,16 @@ def test_timezones_fixed(setup_path): tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("dtype", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"]) +def test_timezones_fixed_format_frame_empty(setup_path, dtype): + with ensure_clean_store(setup_path) as store: + s = Series(dtype=dtype) + df = DataFrame({"A": s}) + store["df"] = df + result = store["df"] + tm.assert_frame_equal(result, df) + + def test_fixed_offset_tz(setup_path): rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) @@ -377,7 +387,7 @@ def test_read_with_where_tz_aware_index(setup_path): periods = 10 dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) - expected = pd.DataFrame({"MYCOL": 0}, index=mi) + expected = DataFrame({"MYCOL": 0}, index=mi) key = "mykey" with ensure_clean_path(setup_path) as path: diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index b627e0e1cad54..a454d3b855cdf 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -38,11 +38,11 @@ def df(request): data_type = request.param if data_type == "delims": - return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]}) + return DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]}) elif data_type == "utf8": - return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) + return DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) elif data_type == "utf16": - return pd.DataFrame( + return DataFrame( {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]} ) elif data_type == "string": @@ -61,7 +61,7 @@ def df(request): r_idx_names=[None], ) elif data_type == "nonascii": - return pd.DataFrame({"en": "in English".split(), "es": "en español".split()}) + return DataFrame({"en": "in English".split(), "es": "en español".split()}) elif data_type == "colwidth": _cw = get_option("display.max_colwidth") + 1 return tm.makeCustomDataframe( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 67ee9348394dd..285601b37b80f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -512,6 +512,17 @@ def test_basic_subset_columns(self, pa, df_full): read_kwargs={"columns": ["string", "int"]}, ) + def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): + # GH 37105 + + buf_bytes = df_full.to_parquet(engine=pa) + assert isinstance(buf_bytes, bytes) + + buf_stream = BytesIO(buf_bytes) + res = pd.read_parquet(buf_stream) + + tm.assert_frame_equal(df_full, res) + def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() @@ -614,16 +625,20 @@ def test_s3_roundtrip_for_dir( # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 # Previous behaviour was pyarrow partitioned columns become 'category' dtypes # These are added to back of dataframe on read. In new API category dtype is - # only used if partition field is string. 
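
The version-dependent partition dtype described in the comment above can be summarized in a small sketch (illustrative only; it mirrors the LooseVersion gate used in the updated test):

```python
from distutils.version import LooseVersion

import pyarrow

# Partition columns of a pyarrow-partitioned dataset read back as
# "category" on pyarrow < 1.0.0 and again on pyarrow >= 2.0.0; only the
# 1.x series reads them back as "int32".
pa_version = LooseVersion(pyarrow.__version__)
if LooseVersion("1.0.0") <= pa_version < LooseVersion("2.0.0"):
    expected_partition_dtype = "int32"
else:
    expected_partition_dtype = "category"
```
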
- legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") - if partition_col and legacy_read_table: - partition_col_type = "category" - else: - partition_col_type = "int32" - - expected_df[partition_col] = expected_df[partition_col].astype( - partition_col_type + # only used if partition field is string, but this changed again to use + # category dtype for all types (not only strings) in pyarrow 2.0.0 + pa10 = (LooseVersion(pyarrow.__version__) >= LooseVersion("1.0.0")) and ( + LooseVersion(pyarrow.__version__) < LooseVersion("2.0.0") ) + if partition_col: + if pa10: + partition_col_type = "int32" + else: + partition_col_type = "category" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) check_round_trip( df_compat, @@ -765,6 +780,10 @@ def test_timestamp_nanoseconds(self, pa): check_round_trip(df, pa, write_kwargs={"version": "2.0"}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): + if LooseVersion(pyarrow.__version__) >= LooseVersion("2.0.0"): + # temporary skip this test until it is properly resolved + # https://github.com/pandas-dev/pandas/issues/37286 + pytest.skip() idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2241fe7013568..890146f0789ae 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,6 +12,7 @@ """ import bz2 import datetime +import functools import glob import gzip import io @@ -22,13 +23,14 @@ from warnings import catch_warnings, simplefilter import zipfile +import numpy as np import pytest -from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian +from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian import pandas.util._test_decorators as td import pandas as pd -from pandas import Index +from pandas import Index, Series, period_range import pandas._testing as tm from pandas.tseries.offsets import Day, MonthEnd @@ -155,28 +157,43 @@ def test_pickles(current_pickle_data, legacy_pickle): compare(current_pickle_data, legacy_pickle, version) -def test_round_trip_current(current_pickle_data): - def python_pickler(obj, path): - with open(path, "wb") as fh: - pickle.dump(obj, fh, protocol=-1) +def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + +def python_unpickler(path): + with open(path, "rb") as fh: + fh.seek(0) + return pickle.load(fh) - def python_unpickler(path): - with open(path, "rb") as fh: - fh.seek(0) - return pickle.load(fh) +@pytest.mark.parametrize( + "pickle_writer", + [ + pytest.param(python_pickler, id="python"), + pytest.param(pd.to_pickle, id="pandas_proto_default"), + pytest.param( + functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL), + id="pandas_proto_highest", + ), + pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"), + pytest.param( + functools.partial(pd.to_pickle, protocol=5), + id="pandas_proto_5", + marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"), + ), + ], +) +def test_round_trip_current(current_pickle_data, pickle_writer): data = current_pickle_data for typ, dv in data.items(): for dt, expected in dv.items(): for writer in [pd.to_pickle, python_pickler]: - if writer is None: - continue - with tm.ensure_clean() as path: - # test writing with each pickler - writer(expected, path) + pickle_writer(expected, path) # test reading with 
each unpickler result = pd.read_pickle(path) @@ -483,7 +500,7 @@ def __init__(self): def test_read_pickle_with_subclass(): # GH 12163 - expected = pd.Series(dtype=object), MyTz() + expected = Series(dtype=object), MyTz() result = tm.round_trip_pickle(expected) tm.assert_series_equal(result[0], expected[0]) @@ -515,3 +532,42 @@ def test_pickle_binary_object_compression(compression): read_df = pd.read_pickle(buffer, compression=compression) buffer.seek(0) tm.assert_frame_equal(df, read_df) + + +def test_pickle_dataframe_with_multilevel_index( + multiindex_year_month_day_dataframe_random_data, + multiindex_dataframe_random_data, +): + ymd = multiindex_year_month_day_dataframe_random_data + frame = multiindex_dataframe_random_data + + def _test_roundtrip(frame): + unpickled = tm.round_trip_pickle(frame) + tm.assert_frame_equal(frame, unpickled) + + _test_roundtrip(frame) + _test_roundtrip(frame.T) + _test_roundtrip(ymd) + _test_roundtrip(ymd.T) + + +def test_pickle_timeseries_periodindex(): + # GH#2891 + prng = period_range("1/1/2011", "1/1/2012", freq="M") + ts = Series(np.random.randn(len(prng)), prng) + new_ts = tm.round_trip_pickle(ts) + assert new_ts.index.freq == "M" + + +@pytest.mark.parametrize( + "name", [777, 777.0, "name", datetime.datetime(2001, 11, 11), (1, 2)] +) +def test_pickle_preserve_name(name): + def _pickle_roundtrip_name(obj): + with tm.ensure_clean() as path: + obj.to_pickle(path) + unpickled = pd.read_pickle(path) + return unpickled + + unpickled = _pickle_roundtrip_name(tm.makeTimeSeries(name=name)) + assert unpickled.name == name diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 32a15e6201037..d6506d434d6a7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1490,10 +1490,10 @@ def test_datetime_with_timezone_roundtrip(self): def test_out_of_bounds_datetime(self): # GH 26761 - data = pd.DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) + data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) data.to_sql("test_datetime_obb", self.conn, index=False) result = sql.read_sql_table("test_datetime_obb", self.conn) - expected = pd.DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame([pd.NaT], columns=["date"]) tm.assert_frame_equal(result, expected) def test_naive_datetimeindex_roundtrip(self): @@ -1820,7 +1820,7 @@ def main(connectable): def test_to_sql_with_negative_npinf(self, input): # GH 34431 - df = pd.DataFrame(input) + df = DataFrame(input) if self.flavor == "mysql": msg = "inf cannot be used with MySQL" diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 88f61390957a6..fecffd75f9478 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -32,7 +32,7 @@ @pytest.fixture() def mixed_frame(): - return pd.DataFrame( + return DataFrame( { "a": [1, 2, 3, 4], "b": [1.0, 3.0, 27.0, 81.0], @@ -385,7 +385,7 @@ def test_stata_doc_examples(self): def test_write_preserves_original(self): # 9795 np.random.seed(423) - df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + df = DataFrame(np.random.randn(5, 4), columns=list("abcd")) df.loc[2, "a":"c"] = np.nan df_copy = df.copy() with tm.ensure_clean() as path: @@ -636,7 +636,7 @@ def test_105(self): dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") df = pd.read_stata(dpath) df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]] - df0 = pd.DataFrame(df0) + df0 = DataFrame(df0) df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"] df0["clustnum"] = df0["clustnum"].astype(np.int16) df0["pri_schl"] 
= df0["pri_schl"].astype(np.int8) @@ -1059,7 +1059,7 @@ def test_categorical_order(self, file): (col, pd.Categorical.from_codes(codes, labels, ordered=True)) ) else: - cols.append((col, pd.Series(labels, dtype=np.float32))) + cols.append((col, Series(labels, dtype=np.float32))) expected = DataFrame.from_dict(dict(cols)) # Read with and with out categoricals, ensure order is identical @@ -1089,7 +1089,7 @@ def test_categorical_sorting(self, file): cat = pd.Categorical.from_codes( codes=codes, categories=categories, ordered=True ) - expected = pd.Series(cat, name="srh") + expected = Series(cat, name="srh") tm.assert_series_equal(expected, parsed["srh"]) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) @@ -1358,7 +1358,7 @@ def test_default_date_conversion(self): dt.datetime(2012, 12, 21, 12, 21, 12, 21000), dt.datetime(1776, 7, 4, 7, 4, 7, 4000), ] - original = pd.DataFrame( + original = DataFrame( { "nums": [1.0, 2.0, 3.0], "strs": ["apple", "banana", "cherry"], @@ -1381,7 +1381,7 @@ def test_default_date_conversion(self): tm.assert_frame_equal(reread, direct) def test_unsupported_type(self): - original = pd.DataFrame({"a": [1 + 2j, 2 + 4j]}) + original = DataFrame({"a": [1 + 2j, 2 + 4j]}) msg = "Data type complex128 not supported" with pytest.raises(NotImplementedError, match=msg): @@ -1394,7 +1394,7 @@ def test_unsupported_datetype(self): dt.datetime(2012, 12, 21, 12, 21, 12, 21000), dt.datetime(1776, 7, 4, 7, 4, 7, 4000), ] - original = pd.DataFrame( + original = DataFrame( { "nums": [1.0, 2.0, 3.0], "strs": ["apple", "banana", "cherry"], @@ -1408,7 +1408,7 @@ def test_unsupported_datetype(self): original.to_stata(path, convert_dates={"dates": "tC"}) dates = pd.date_range("1-1-1990", periods=3, tz="Asia/Hong_Kong") - original = pd.DataFrame( + original = DataFrame( { "nums": [1.0, 2.0, 3.0], "strs": ["apple", "banana", "cherry"], @@ -1439,7 +1439,7 @@ def test_stata_111(self): # SAS when exporting to Stata format. We do not know of any # on-line documentation for this version. 
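
For reference, the default datetime round trip exercised by test_default_date_conversion above can be sketched like this (illustrative only, not part of the patch; datetime columns are written as Stata %tc values, which carry millisecond precision):

```python
import datetime as dt

import pandas as pd
import pandas._testing as tm

original = pd.DataFrame(
    {
        "nums": [1.0, 2.0],
        "dates": [
            dt.datetime(1960, 1, 1),
            dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
        ],
    }
)
with tm.ensure_clean() as path:
    # No convert_dates needed: datetime64 columns default to %tc.
    original.to_stata(path, write_index=False)
    reread = pd.read_stata(path)
tm.assert_frame_equal(original, reread)
```
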
df = read_stata(self.dta24_111) - original = pd.DataFrame( + original = DataFrame( { "y": [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0], "x": [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6], @@ -1527,7 +1527,7 @@ def test_pickle_path_localpath(self): def test_value_labels_iterator(self, write_index): # GH 16923 d = {"A": ["B", "E", "C", "A", "E"]} - df = pd.DataFrame(data=d) + df = DataFrame(data=d) df["A"] = df["A"].astype("category") with tm.ensure_clean() as path: df.to_stata(path, write_index=write_index) @@ -1658,7 +1658,7 @@ def test_invalid_date_conversion(self): dt.datetime(2012, 12, 21, 12, 21, 12, 21000), dt.datetime(1776, 7, 4, 7, 4, 7, 4000), ] - original = pd.DataFrame( + original = DataFrame( { "nums": [1.0, 2.0, 3.0], "strs": ["apple", "banana", "cherry"], @@ -1709,14 +1709,14 @@ def test_unicode_dta_118(self): ["", "", "s", "", "s"], ["", "", " ", "", " "], ] - expected = pd.DataFrame(values, columns=columns) + expected = DataFrame(values, columns=columns) tm.assert_frame_equal(unicode_df, expected) def test_mixed_string_strl(self): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] - output = pd.DataFrame(output) + output = DataFrame(output) output.number = output.number.astype("int32") with tm.ensure_clean() as path: @@ -1737,7 +1737,7 @@ def test_mixed_string_strl(self): @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_all_none_exception(self, version): output = [{"none": "none", "number": 0}, {"none": None, "number": 1}] - output = pd.DataFrame(output) + output = DataFrame(output) output.loc[:, "none"] = None with tm.ensure_clean() as path: with pytest.raises(ValueError, match="Column `none` cannot be exported"): @@ -1791,7 +1791,7 @@ def test_encoding_latin1_118(self): assert len(w) == 151 assert w[0].message.args[0] == msg - expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"]) + expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"]) tm.assert_frame_equal(encoded, expected) @pytest.mark.slow @@ -1808,7 +1808,7 @@ def test_stata_119(self): @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) - data = pd.DataFrame( + data = DataFrame( [ [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"], [2.0, 2, "ᴮ", ""], @@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath): StataReader(dta_file, chunksize=0) with pytest.raises(ValueError, match="chunksize must be a positive"): StataReader(dta_file, chunksize="apple") - with pytest.raises(ValueError, match="chunksize must be set to a positive"): - with StataReader(dta_file) as reader: - reader.__next__() def test_iterator_value_labels(): @@ -1983,3 +1980,20 @@ def test_iterator_value_labels(): for i in range(2): tm.assert_index_equal(chunk.dtypes[i].categories, expected) tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + + +def test_precision_loss(): + df = DataFrame( + [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]], + columns=["big", "little"], + ) + with tm.ensure_clean() as path: + with tm.assert_produces_warning( + PossiblePrecisionLoss, match="Column converted from int64 to float64" + ): + df.to_stata(path, write_index=False) + reread = read_stata(path) + expected_dt = Series([np.float64, np.float64], index=["big", "little"]) + tm.assert_series_equal(reread.dtypes, expected_dt) + assert reread.loc[0, "little"] == df.loc[0, "little"] + assert reread.loc[0, "big"] == float(df.loc[0, "big"]) diff --git a/pandas/tests/libs/__init__.py 
b/pandas/tests/libs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/test_join.py b/pandas/tests/libs/test_join.py similarity index 62% rename from pandas/tests/test_join.py rename to pandas/tests/libs/test_join.py index 129dc275c4d5a..95d6dcbaf3baf 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from pandas._libs import join as _join +from pandas._libs import join as libjoin +from pandas._libs.join import inner_join, left_outer_join -from pandas import Categorical, DataFrame, Index, merge import pandas._testing as tm @@ -12,7 +12,7 @@ class TestIndexer: "dtype", ["int32", "int64", "float32", "float64", "object"] ) def test_outer_join_indexer(self, dtype): - indexer = _join.outer_join_indexer + indexer = libjoin.outer_join_indexer left = np.arange(3, dtype=dtype) right = np.arange(2, 5, dtype=dtype) @@ -42,12 +42,104 @@ def test_outer_join_indexer(self, dtype): exp = np.array([-1, -1, -1], dtype=np.int64) tm.assert_numpy_array_equal(rindexer, exp) + def test_cython_left_outer_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = np.array( + [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1] + ) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + # 0 1 1 1 + exp_li = np.array( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 3, + 4, + 5, + 3, + 4, + 5, + # 2 2 4 + 6, + 7, + 8, + 6, + 7, + 8, + -1, + ] + ) + exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = inner_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) + exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + def test_left_join_indexer_unique(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - result = _join.left_join_indexer_unique(b, a) + result = 
libjoin.left_join_indexer_unique(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) @@ -162,7 +254,7 @@ def test_left_outer_join_bug(): right = np.array([3, 1], dtype=np.int64) max_groups = 4 - lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) + lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False) exp_lidx = np.arange(len(left), dtype=np.int64) exp_ridx = -np.ones(len(left), dtype=np.int64) @@ -178,7 +270,7 @@ def test_inner_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _join.inner_join_indexer(a, b) + index, ares, bres = libjoin.inner_join_indexer(a, b) index_exp = np.array([3, 5], dtype=np.int64) tm.assert_almost_equal(index, index_exp) @@ -191,7 +283,7 @@ def test_inner_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.inner_join_indexer(a, b) + index, ares, bres = libjoin.inner_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -201,7 +293,7 @@ def test_outer_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _join.outer_join_indexer(a, b) + index, ares, bres = libjoin.outer_join_indexer(a, b) index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(index, index_exp) @@ -214,7 +306,7 @@ def test_outer_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.outer_join_indexer(a, b) + index, ares, bres = libjoin.outer_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -224,7 +316,7 @@ def test_left_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _join.left_join_indexer(a, b) + index, ares, bres = libjoin.left_join_indexer(a, b) tm.assert_almost_equal(index, a) @@ -236,17 +328,17 @@ def test_left_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.left_join_indexer(a, b) + index, ares, bres = libjoin.left_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = _join.left_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.left_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -259,10 +351,10 @@ def test_left_join_indexer2(): def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = _join.outer_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5, 7, 9], 
dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -275,10 +367,10 @@ def test_outer_join_indexer2(): def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = _join.inner_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -288,59 +380,3 @@ def test_inner_join_indexer2(): exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) tm.assert_almost_equal(ridx, exp_ridx) - - -def test_merge_join_categorical_multiindex(): - # From issue 16627 - a = { - "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), - "Int1": [0, 1, 0, 1, 0, 0], - } - a = DataFrame(a) - - b = { - "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), - "Int": [0, 0, 0, 1, 1, 1], - "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], - } - b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] - - expected = merge( - a, - b.reset_index(), - left_on=["Cat1", "Int1"], - right_on=["Cat", "Int"], - how="left", - ) - result = a.join(b, on=["Cat1", "Int1"]) - expected = expected.drop(["Cat", "Int"], axis=1) - tm.assert_frame_equal(expected, result) - - # Same test, but with ordered categorical - a = { - "Cat1": Categorical( - ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True - ), - "Int1": [0, 1, 0, 1, 0, 0], - } - a = DataFrame(a) - - b = { - "Cat": Categorical( - ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True - ), - "Int": [0, 0, 0, 1, 1, 1], - "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], - } - b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] - - expected = merge( - a, - b.reset_index(), - left_on=["Cat1", "Int1"], - right_on=["Cat", "Int"], - how="left", - ) - result = a.join(b, on=["Cat1", "Int1"]) - expected = expected.drop(["Cat", "Int"], axis=1) - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/test_lib.py b/pandas/tests/libs/test_lib.py similarity index 100% rename from pandas/tests/test_lib.py rename to pandas/tests/libs/test_lib.py diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 78aa1887f5611..66463a4a2358a 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -291,9 +291,16 @@ def test_irreg_hf(self): _, ax = self.plt.subplots() df2 = df.copy() df2.index = df.index.astype(object) - df2.plot(ax=ax) - diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() - assert (np.fabs(diffs[1:] - sec) < 1e-8).all() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # This warning will be emitted + # pandas/core/frame.py:3216: + # FutureWarning: Automatically casting object-dtype Index of datetimes + # to DatetimeIndex is deprecated and will be removed in a future version. + # Explicitly cast to DatetimeIndex instead. 
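For reference, tm.assert_produces_warning (relied on throughout the plotting hunks above and below) is a context manager that fails the enclosing test unless the stated warning is actually emitted inside the block. A minimal, self-contained sketch; the emit() helper is hypothetical and stands in for the deprecated code path:

import warnings

import pandas._testing as tm


def emit():
    # stand-in for code that triggers the deprecation being asserted
    warnings.warn("will be removed in a future version", FutureWarning)


with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
    emit()  # passes only because a FutureWarning is raised in the block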
+ # return klass(values, index=self.index, name=name, fastpath=True) + df2.plot(ax=ax) + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + assert (np.fabs(diffs[1:] - sec) < 1e-8).all() def test_irregular_datetime64_repr_bug(self): ser = tm.makeTimeSeries() @@ -1028,9 +1035,16 @@ def test_irreg_dtypes(self): # np.datetime64 idx = date_range("1/1/2000", periods=10) idx = idx[[0, 2, 5, 9]].astype(object) - df = DataFrame(np.random.randn(len(idx), 3), idx) - _, ax = self.plt.subplots() - _check_plot_works(df.plot, ax=ax) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # This warning will be emitted + # pandas/core/frame.py:3216: + # FutureWarning: Automatically casting object-dtype Index of datetimes + # to DatetimeIndex is deprecated and will be removed in a future version. + # Explicitly cast to DatetimeIndex instead. + # return klass(values, index=self.index, name=name, fastpath=True) + df = DataFrame(np.random.randn(len(idx), 3), idx) + _, ax = self.plt.subplots() + _check_plot_works(df.plot, ax=ax) @pytest.mark.slow def test_time(self): diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index d4d2256d209cf..ba59fc1a3cc3f 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -617,7 +617,7 @@ def test_subplots_timeseries_y_axis_not_supported(self): pd.to_datetime("2017-08-02 00:00:00"), ], } - testdata = pd.DataFrame(data) + testdata = DataFrame(data) ax_period = testdata.plot(x="numeric", y="period") assert ( ax_period.get_lines()[0].get_data()[1] == testdata["period"].values @@ -987,7 +987,7 @@ def test_bar_colors(self): tm.close() def test_bar_user_colors(self): - df = pd.DataFrame( + df = DataFrame( {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} ) # This should *only* work when `y` is specified, else @@ -1149,13 +1149,13 @@ def test_bar_nan(self): @pytest.mark.slow def test_bar_categorical(self): # GH 13019 - df1 = pd.DataFrame( + df1 = DataFrame( np.random.randn(6, 5), index=pd.Index(list("ABCDEF")), columns=pd.Index(list("abcde")), ) # categorical index must behave the same - df2 = pd.DataFrame( + df2 = DataFrame( np.random.randn(6, 5), index=pd.CategoricalIndex(list("ABCDEF")), columns=pd.CategoricalIndex(list("abcde")), @@ -1198,7 +1198,7 @@ def test_plot_scatter(self): def test_raise_error_on_datetime_time_data(self): # GH 8113, datetime.time type is not supported by matplotlib in scatter - df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df = DataFrame(np.random.randn(10), columns=["a"]) df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time msg = "must be a string or a number, not 'datetime.time'" @@ -1209,19 +1209,19 @@ def test_scatterplot_datetime_data(self): # GH 30391 dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") vals = np.random.normal(0, 1, len(dates)) - df = pd.DataFrame({"dates": dates, "vals": vals}) + df = DataFrame({"dates": dates, "vals": vals}) _check_plot_works(df.plot.scatter, x="dates", y="vals") _check_plot_works(df.plot.scatter, x=0, y=1) def test_scatterplot_object_data(self): # GH 18755 - df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + df = DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) _check_plot_works(df.plot.scatter, x="a", y="b") _check_plot_works(df.plot.scatter, x=0, y=1) - df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + df = DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) _check_plot_works(df.plot.scatter, x="a", y="b") 
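The scatter hunks here exercise GH 18755: DataFrame.plot.scatter accepts object-dtype x/y columns instead of raising. A minimal sketch of that behaviour, assuming matplotlib is installed; the column names are arbitrary:

import pandas as pd

df = pd.DataFrame({"a": ["A", "B", "C"], "b": [2, 3, 4]})
ax = df.plot.scatter(x="a", y="b")  # object-dtype x values plot as categories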
_check_plot_works(df.plot.scatter, x=0, y=1) @@ -1232,7 +1232,7 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) ax1 = df.plot.scatter(x="A label", y="B label") ax2 = df.plot.scatter(x="A label", y="B label", c="C label") @@ -1255,7 +1255,7 @@ def test_if_hexbin_xaxis_label_is_visible(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) ax = df.plot.hexbin("A label", "B label", gridsize=12) assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) @@ -1267,7 +1267,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + df = DataFrame(random_array, columns=["A label", "B label", "C label"]) fig, axes = plt.subplots(1, 2) df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) @@ -1284,9 +1284,7 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): @pytest.mark.slow def test_plot_scatter_with_categorical_data(self, x, y): # after fixing GH 18755, should be able to plot categorical data - df = pd.DataFrame( - {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} - ) + df = DataFrame({"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])}) _check_plot_works(df.plot.scatter, x=x, y=y) @@ -1345,7 +1343,7 @@ def test_plot_scatter_with_c(self): @pytest.mark.parametrize("cmap", [None, "Greys"]) def test_scatter_with_c_column_name_with_colors(self, cmap): # https://github.com/pandas-dev/pandas/issues/34316 - df = pd.DataFrame( + df = DataFrame( [[5.1, 3.5], [4.9, 3.0], [7.0, 3.2], [6.4, 3.2], [5.9, 3.0]], columns=["length", "width"], ) @@ -1383,7 +1381,7 @@ def test_scatter_colorbar_different_cmap(self): # GH 33389 import matplotlib.pyplot as plt - df = pd.DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) + df = DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) df["x2"] = df["x"] + 1 fig, ax = plt.subplots() @@ -1750,7 +1748,7 @@ def test_hist_df(self): def test_hist_weights(self, weights): # GH 33173 np.random.seed(0) - df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100)))) + df = DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100)))) ax1 = _check_plot_works(df.plot, kind="hist", weights=weights) ax2 = _check_plot_works(df.plot, kind="hist") @@ -1991,9 +1989,7 @@ def test_df_legend_labels(self): def test_missing_marker_multi_plots_on_same_ax(self): # GH 18222 - df = pd.DataFrame( - data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] - ) + df = DataFrame(data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"]) fig, ax = self.plt.subplots(nrows=1, ncols=3) # Left plot df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[0]) @@ -2125,7 +2121,7 @@ def test_line_colors(self): @pytest.mark.slow def test_dont_modify_colors(self): colors = ["r", "g", "b"] - pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) + DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 @pytest.mark.slow @@ 
-3253,7 +3249,7 @@ def test_passed_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] colormap = mpl.colors.ListedColormap(color_tuples) - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) assert color_tuples == [c.get_facecolor() for c in barplot.patches] def test_rcParams_bar_colors(self): @@ -3261,14 +3257,14 @@ def test_rcParams_bar_colors(self): color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): - barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") + barplot = DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 df = ( - pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) + DataFrame(np.random.randn(15, 2), columns=list("AB")) .assign(C=lambda df: df.B.cumsum()) .assign(D=lambda df: df.C * 1.1) ) @@ -3284,7 +3280,7 @@ def test_secondary_axis_font_size(self, method): def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 - df = pd.DataFrame( + df = DataFrame( { "sales": [3, 2, 3], "visits": [20, 42, 28], @@ -3305,7 +3301,7 @@ def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) + df = DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) ax = df.plot() ax.set_xlim(-1, 4) xticklabels = [t.get_text() for t in ax.get_xticklabels()] @@ -3320,7 +3316,7 @@ def test_x_multiindex_values_ticks(self): def test_xlim_plot_line(self, kind): # test if xlim is set correctly in plot.line and plot.area # GH 27686 - df = pd.DataFrame([2, 4], index=[1, 2]) + df = DataFrame([2, 4], index=[1, 2]) ax = df.plot(kind=kind) xlims = ax.get_xlim() assert xlims[0] < 1 @@ -3332,7 +3328,7 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): fig, ax = self.plt.subplots() indexes = ["k1", "k2", "k3", "k4"] - df = pd.DataFrame( + df = DataFrame( { "s1": [1000, 2000, 1500, 2000], "s2": [900, 1400, 2000, 3000], @@ -3355,7 +3351,7 @@ def test_xlim_plot_line_correctly_in_mixed_plot_type(self): def test_subplots_sharex_false(self): # test when sharex is set to False, two plots should have different # labels, GH 25160 - df = pd.DataFrame(np.random.rand(10, 2)) + df = DataFrame(np.random.rand(10, 2)) df.iloc[5:, 1] = np.nan df.iloc[:5, 0] = np.nan @@ -3370,7 +3366,7 @@ def test_subplots_sharex_false(self): def test_plot_no_rows(self): # GH 27758 - df = pd.DataFrame(columns=["foo"], dtype=int) + df = DataFrame(columns=["foo"], dtype=int) assert df.empty ax = df.plot() assert len(ax.get_lines()) == 1 @@ -3379,13 +3375,13 @@ def test_plot_no_rows(self): assert len(line.get_ydata()) == 0 def test_plot_no_numeric_data(self): - df = pd.DataFrame(["a", "b", "c"]) + df = DataFrame(["a", "b", "c"]) with pytest.raises(TypeError): df.plot() def test_missing_markers_legend(self): # 14958 - df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) + df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) ax = df.plot(y=["A"], marker="x", linestyle="solid") df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) 
@@ -3395,7 +3391,7 @@ def test_missing_markers_legend(self): def test_missing_markers_legend_using_style(self): # 14563 - df = pd.DataFrame( + df = DataFrame( { "A": [1, 2, 3, 4, 5, 6], "B": [2, 4, 1, 3, 2, 4], @@ -3414,8 +3410,8 @@ def test_missing_markers_legend_using_style(self): def test_colors_of_columns_with_same_name(self): # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 # Creating a DataFrame with duplicate column labels and testing colors of them. - df = pd.DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) - df1 = pd.DataFrame({"a": [2, 4, 6]}) + df = DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) + df1 = DataFrame({"a": [2, 4, 6]}) df_concat = pd.concat([df, df1], axis=1) result = df_concat.plot() for legend, line in zip(result.get_legend().legendHandles, result.lines): @@ -3436,7 +3432,7 @@ def test_xlabel_ylabel_dataframe_single_plot( self, kind, index_name, old_label, new_label ): # GH 9093 - df = pd.DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) + df = DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) df.index.name = index_name # default is the ylabel is not shown and xlabel is index name @@ -3449,6 +3445,27 @@ def test_xlabel_ylabel_dataframe_single_plot( assert ax.get_ylabel() == str(new_label) assert ax.get_xlabel() == str(new_label) + @pytest.mark.parametrize( + "xlabel, ylabel", + [ + (None, None), + ("X Label", None), + (None, "Y Label"), + ("X Label", "Y Label"), + ], + ) + @pytest.mark.parametrize("kind", ["scatter", "hexbin"]) + def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel): + # GH 37001 + xcol = "Type A" + ycol = "Type B" + df = DataFrame([[1, 2], [2, 5]], columns=[xcol, ycol]) + + # default is the labels are column names + ax = df.plot(kind=kind, x=xcol, y=ycol, xlabel=xlabel, ylabel=ylabel) + assert ax.get_xlabel() == (xcol if xlabel is None else xlabel) + assert ax.get_ylabel() == (ycol if ylabel is None else ylabel) + @pytest.mark.parametrize( "index_name, old_label, new_label", [ @@ -3464,7 +3481,7 @@ def test_xlabel_ylabel_dataframe_subplots( self, kind, index_name, old_label, new_label ): # GH 9093 - df = pd.DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) + df = DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) df.index.name = index_name # default is the ylabel is not shown and xlabel is index name diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index d56c882471a9a..271cf65433afe 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -901,7 +901,7 @@ def test_xticklabels(self): def test_xtick_barPlot(self): # GH28172 - s = pd.Series(range(10), index=[f"P{i:02d}" for i in range(10)]) + s = Series(range(10), index=[f"P{i:02d}" for i in range(10)]) ax = s.plot.bar(xticks=range(0, 11, 2)) exp = np.array(list(range(0, 11, 2))) tm.assert_numpy_array_equal(exp, ax.get_xticks()) @@ -947,7 +947,7 @@ def test_plot_xlim_for_series(self, kind): def test_plot_no_rows(self): # GH 27758 - df = pd.Series(dtype=int) + df = Series(dtype=int) assert df.empty ax = df.plot() assert len(ax.get_lines()) == 1 @@ -956,12 +956,12 @@ def test_plot_no_rows(self): assert len(line.get_ydata()) == 0 def test_plot_no_numeric_data(self): - df = pd.Series(["a", "b", "c"]) + df = Series(["a", "b", "c"]) with pytest.raises(TypeError): df.plot() def test_style_single_ok(self): - s = pd.Series([1, 2]) + s = Series([1, 2]) ax = s.plot(style="s", color="C3") assert ax.lines[0].get_color() == "C3" @@ -972,7 +972,7 @@ def 
test_style_single_ok(self): @pytest.mark.parametrize("kind", ["line", "area", "bar"]) def test_xlabel_ylabel_series(self, kind, index_name, old_label, new_label): # GH 9093 - ser = pd.Series([1, 2, 3, 4]) + ser = Series([1, 2, 3, 4]) ser.index.name = index_name # default is the ylabel is not shown and xlabel is index name diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fe97925c2bb74..cc86436ee8fa9 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -643,40 +643,40 @@ def test_empty(self, method, unit, use_bottleneck, dtype): df = DataFrame(np.empty((10, 0)), dtype=dtype) assert (getattr(df, method)(1) == unit).all() - s = pd.Series([1], dtype=dtype) + s = Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) result = getattr(s, method)(skipna=False, min_count=2) assert pd.isna(result) - s = pd.Series([np.nan], dtype=dtype) + s = Series([np.nan], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan, 1], dtype=dtype) + s = Series([np.nan, 1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty_multi(self, method, unit): - s = pd.Series( + s = Series( [1, np.nan, np.nan, np.nan], index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]), ) # 1 / 0 by default result = getattr(s, method)(level=0) - expected = pd.Series([1, unit], index=["a", "b"]) + expected = Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=0 result = getattr(s, method)(level=0, min_count=0) - expected = pd.Series([1, unit], index=["a", "b"]) + expected = Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=1 result = getattr(s, method)(level=0, min_count=1) - expected = pd.Series([1, np.nan], index=["a", "b"]) + expected = Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["mean", "median", "std", "var"]) @@ -844,13 +844,13 @@ def test_idxmax(self): # Float64Index # GH#5914 - s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) + s = Series([1, 2, 3], [1.1, 2.1, 3.1]) result = s.idxmax() assert result == 3.1 result = s.idxmin() assert result == 1.1 - s = pd.Series(s.index, s.index) + s = Series(s.index, s.index) result = s.idxmax() assert result == 3.1 result = s.idxmin() @@ -876,7 +876,7 @@ def test_all_any_params(self): assert not s2.any(skipna=True) # Check level. 
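For context, the level-based reductions asserted below are equivalent to grouping on that index level. A minimal sketch, assuming a pandas version contemporary with this patch in which the level keyword on Series reductions is still supported:

import pandas as pd

s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
assert s.all(level=0).equals(s.groupby(level=0).all())
assert s.any(level=0).equals(s.groupby(level=0).any())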
- s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) + s = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) tm.assert_series_equal(s.all(level=0), Series([False, True, False])) tm.assert_series_equal(s.any(level=0), Series([False, True, True])) @@ -908,7 +908,7 @@ def test_all_any_boolean(self): assert not s4.any(skipna=False) # Check level TODO(GH-33449) result should also be boolean - s = pd.Series( + s = Series( [False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2], dtype="boolean", @@ -918,9 +918,9 @@ def test_all_any_boolean(self): def test_any_axis1_bool_only(self): # GH#32432 - df = pd.DataFrame({"A": [True, False], "B": [1, 2]}) + df = DataFrame({"A": [True, False], "B": [1, 2]}) result = df.any(axis=1, bool_only=True) - expected = pd.Series([True, False]) + expected = Series([True, False]) tm.assert_series_equal(result, expected) def test_timedelta64_analytics(self): @@ -968,12 +968,12 @@ def test_timedelta64_analytics(self): @pytest.mark.parametrize( "test_input,error_type", [ - (pd.Series([], dtype="float64"), ValueError), + (Series([], dtype="float64"), ValueError), # For strings, or any Series with dtype 'O' - (pd.Series(["foo", "bar", "baz"]), TypeError), - (pd.Series([(1,), (2,)]), TypeError), + (Series(["foo", "bar", "baz"]), TypeError), + (Series([(1,), (2,)]), TypeError), # For mixed data types - (pd.Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), + (Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), ], ) def test_assert_idxminmax_raises(self, test_input, error_type): @@ -991,7 +991,7 @@ def test_assert_idxminmax_raises(self, test_input, error_type): def test_idxminmax_with_inf(self): # For numeric data with NA and Inf (GH #13595) - s = pd.Series([0, -np.inf, np.inf, np.nan]) + s = Series([0, -np.inf, np.inf, np.nan]) assert s.idxmin() == 1 assert np.isnan(s.idxmin(skipna=False)) @@ -1031,9 +1031,9 @@ def test_minmax_nat_series(self, nat_ser): @pytest.mark.parametrize( "nat_df", [ - pd.DataFrame([pd.NaT, pd.NaT]), - pd.DataFrame([pd.NaT, pd.Timedelta("nat")]), - pd.DataFrame([pd.Timedelta("nat"), pd.Timedelta("nat")]), + DataFrame([pd.NaT, pd.NaT]), + DataFrame([pd.NaT, pd.Timedelta("nat")]), + DataFrame([pd.Timedelta("nat"), pd.Timedelta("nat")]), ], ) def test_minmax_nat_dataframe(self, nat_df): diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 28d33ebb23c20..7389fa31109f8 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm from pandas.core.groupby.groupby import DataError @@ -120,7 +119,7 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): index = _asfreq_compat(empty_series_dti.index, freq) - expected = pd.Series([], dtype="int64", index=index, name=empty_series_dti.name) + expected = Series([], dtype="int64", index=index, name=empty_series_dti.name) tm.assert_series_equal(result, expected) @@ -158,7 +157,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): index = _asfreq_compat(empty_frame_dti.index, freq) - expected = pd.DataFrame({"a": []}, dtype="int64", index=index) + expected = DataFrame({"a": []}, dtype="int64", index=index) tm.assert_frame_equal(result, expected) @@ -174,7 +173,7 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): index = _asfreq_compat(empty_frame_dti.index, freq) 
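For context on the empty-resample tests in this hunk: resampling an empty datetime-indexed object yields an empty result whose index already carries the target frequency, and count() comes back int64-typed. A minimal sketch of the behaviour being pinned down; the "D" frequency is an arbitrary choice:

import pandas as pd

s = pd.Series([], dtype="float64", index=pd.DatetimeIndex([]))
out = s.resample("D").count()
assert len(out) == 0 and out.dtype == "int64"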
- expected = pd.Series([], dtype="int64", index=index) + expected = Series([], dtype="int64", index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 9475dcc6981ff..19e5a5dd7f5e7 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -111,7 +111,7 @@ def test_resample_basic(series, closed, expected): def test_resample_integerarray(): # GH 25580, resample on IntegerArray - ts = pd.Series( + ts = Series( range(9), index=pd.date_range("1/1/2000", periods=9, freq="T"), dtype="Int64" ) result = ts.resample("3T").sum() @@ -849,7 +849,7 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(): start, end = "2000-10-01 23:30:00+0500", "2000-12-02 00:30:00+0500" rng = pd.date_range(start, end, freq="7min") random_values = np.random.randn(len(rng)) - ts_1 = pd.Series(random_values, index=rng) + ts_1 = Series(random_values, index=rng) result_1 = ts_1.resample("D", origin="epoch").mean() result_2 = ts_1.resample("24H", origin="epoch").mean() @@ -865,7 +865,7 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(): # check that we have the similar results with two different timezones (+2H and +5H) start, end = "2000-10-01 23:30:00+0200", "2000-12-02 00:30:00+0200" rng = pd.date_range(start, end, freq="7min") - ts_2 = pd.Series(random_values, index=rng) + ts_2 = Series(random_values, index=rng) result_5 = ts_2.resample("D", origin="epoch").mean() result_6 = ts_2.resample("24H", origin="epoch").mean() tm.assert_series_equal(result_1.tz_localize(None), result_5.tz_localize(None)) @@ -877,7 +877,7 @@ def test_resample_origin_with_day_freq_on_dst(): tz = "America/Chicago" def _create_series(values, timestamps, freq="D"): - return pd.Series( + return Series( values, index=pd.DatetimeIndex( [Timestamp(t, tz=tz) for t in timestamps], freq=freq, ambiguous=True @@ -888,7 +888,7 @@ def _create_series(values, timestamps, freq="D"): start = pd.Timestamp("2013-11-02", tz=tz) end = pd.Timestamp("2013-11-03 23:59", tz=tz) rng = pd.date_range(start, end, freq="1h") - ts = pd.Series(np.ones(len(rng)), index=rng) + ts = Series(np.ones(len(rng)), index=rng) expected = _create_series([24.0, 25.0], ["2013-11-02", "2013-11-03"]) for origin in ["epoch", "start", "start_day", start, None]: @@ -899,7 +899,7 @@ def _create_series(values, timestamps, freq="D"): start = pd.Timestamp("2013-11-03", tz=tz) end = pd.Timestamp("2013-11-03 23:59", tz=tz) rng = pd.date_range(start, end, freq="1h") - ts = pd.Series(np.ones(len(rng)), index=rng) + ts = Series(np.ones(len(rng)), index=rng) expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] expected = _create_series([23.0, 2.0], expected_ts) @@ -1442,14 +1442,14 @@ def test_groupby_with_dst_time_change(): [1478064900001000000, 1480037118776792000], tz="UTC" ).tz_convert("America/Chicago") - df = pd.DataFrame([1, 2], index=index) + df = DataFrame([1, 2], index=index) result = df.groupby(pd.Grouper(freq="1d")).last() expected_index_values = pd.date_range( "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" ) index = pd.DatetimeIndex(expected_index_values) - expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + expected = DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) tm.assert_frame_equal(result, expected) @@ -1586,7 +1586,7 @@ def test_downsample_dst_at_midnight(): index = pd.date_range(start, end, freq="1H") index = index.tz_localize("UTC").tz_convert("America/Havana") data = 
list(range(len(index))) - dataframe = pd.DataFrame(data, index=index) + dataframe = DataFrame(data, index=index) result = dataframe.groupby(pd.Grouper(freq="1D")).mean() dti = date_range("2018-11-03", periods=3).tz_localize( @@ -1663,7 +1663,7 @@ def f(data, add_arg): tm.assert_series_equal(result, expected) # Testing dataframe - df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) + df = DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) @@ -1689,9 +1689,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): # GH 24127 n1_ = n1 * k n2_ = n2 * k - s = pd.Series( - 0, index=pd.date_range("19910905 13:00", "19911005 07:00", freq=freq1) - ) + s = Series(0, index=pd.date_range("19910905 13:00", "19911005 07:00", freq=freq1)) s = s + range(len(s)) result1 = s.resample(str(n1_) + freq1).mean() @@ -1781,9 +1779,9 @@ def test_resample_calendar_day_with_dst( first: str, last: str, freq_in: str, freq_out: str, exp_last: str ): # GH 35219 - ts = pd.Series(1.0, pd.date_range(first, last, freq=freq_in, tz="Europe/Amsterdam")) + ts = Series(1.0, pd.date_range(first, last, freq=freq_in, tz="Europe/Amsterdam")) result = ts.resample(freq_out).pad() - expected = pd.Series( + expected = Series( 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam") ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 8b3adbf08d157..6523c53cfd2a1 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -41,7 +41,7 @@ def test_deprecating_on_loffset_and_base(): # GH 31809 idx = pd.date_range("2001-01-01", periods=4, freq="T") - df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"]) + df = DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"]) with tm.assert_produces_warning(FutureWarning): pd.Grouper(freq="10s", base=0) @@ -226,7 +226,7 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): ) def test_resample_with_non_zero_base(start, end, start_freq, end_freq, base, offset): # GH 23882 - s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) with tm.assert_produces_warning(FutureWarning): result = s.resample(end_freq, base=base).mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index fe02eaef8ba82..8bdaad285e3f6 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -566,7 +566,7 @@ def test_resample_with_dst_time_change(self): .tz_localize("UTC") .tz_convert("America/Chicago") ) - df = pd.DataFrame([1, 2], index=index) + df = DataFrame([1, 2], index=index) result = df.resample("12h", closed="right", label="right").last().ffill() expected_index_values = [ @@ -588,7 +588,7 @@ def test_resample_with_dst_time_change(self): "America/Chicago" ) index = pd.DatetimeIndex(index, freq="12h") - expected = pd.DataFrame( + expected = DataFrame( [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], index=index, ) @@ -816,7 +816,7 @@ def test_resample_with_only_nat(self): ) def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): # GH 23882 & 31809 - s = pd.Series(0, 
index=pd.period_range(start, end, freq=start_freq)) + s = Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) result = s.resample(end_freq, offset=offset).mean() result = result.to_timestamp(end_freq) @@ -861,9 +861,9 @@ def test_sum_min_count(self): index = pd.date_range(start="2018", freq="M", periods=6) data = np.ones(6) data[3:6] = np.nan - s = pd.Series(data, index).to_period() + s = Series(data, index).to_period() result = s.resample("Q").sum(min_count=1) - expected = pd.Series( + expected = Series( [3.0, np.nan], index=PeriodIndex(["2018Q1", "2018Q2"], freq="Q-DEC") ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index e4af5d93ff771..29f2aea1648ec 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -571,7 +571,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): # date parser. Some would result in OutOfBoundsError (ValueError) while # others would result in OverflowError when passed into Timestamp. # We catch these errors and move on to the correct branch. - df = pd.DataFrame( + df = DataFrame( list(range(200)), index=pd.date_range( start="2017-01-01", freq="15min", periods=200, tz="Europe/Berlin" @@ -579,7 +579,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - expected = pd.DataFrame( + expected = DataFrame( [47.5, 143.5, 195.5], index=pd.date_range( start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" @@ -595,10 +595,10 @@ def test_resample_agg_readonly(): arr = np.zeros_like(index) arr.setflags(write=False) - ser = pd.Series(arr, index=index) + ser = Series(arr, index=index) rs = ser.resample("1D") - expected = pd.Series([pd.Timestamp(0), pd.Timestamp(0)], index=index[::24]) + expected = Series([pd.Timestamp(0), pd.Timestamp(0)], index=index[::24]) result = rs.agg("last") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 73bf7dafac254..ca31ef684257d 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -126,7 +126,7 @@ def test_getitem_multiple(): def test_groupby_resample_on_api_with_getitem(): # GH 17813 - df = pd.DataFrame( + df = DataFrame( {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} ) exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() @@ -142,7 +142,7 @@ def test_groupby_with_origin(): middle = "1/15/2000 00:00:00" rng = pd.date_range(start, end, freq="1231min") # prime number - ts = pd.Series(np.random.randn(len(rng)), index=rng) + ts = Series(np.random.randn(len(rng)), index=rng) ts2 = ts[middle:end] # proves that grouper without a fixed origin does not work @@ -347,3 +347,18 @@ def test_median_duplicate_columns(): result = df.resample("5s").median() expected.columns = result.columns tm.assert_frame_equal(result, expected) + + +def test_apply_to_one_column_of_df(): + # GH: 36951 + df = DataFrame( + {"col": range(10), "col1": range(10, 20)}, + index=pd.date_range("2012-01-01", periods=10, freq="20min"), + ) + result = df.resample("H").apply(lambda group: group.col.sum()) + expected = Series( + [3, 12, 21, 9], index=pd.date_range("2012-01-01", periods=4, freq="H") + ) + tm.assert_series_equal(result, expected) + result = df.resample("H").apply(lambda group: 
group["col"].sum()) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index f638706207679..0832724110203 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -167,9 +167,9 @@ def test_aggregate_normal(resample_method): ], ) def test_resample_entirely_nat_window(method, method_args, unit): - s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) + s = Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) result = methodcaller(method, **method_args)(s.resample("2d")) - expected = pd.Series( + expected = Series( [0.0, unit], index=pd.DatetimeIndex(["2017-01-01", "2017-01-03"], freq="2D") ) tm.assert_series_equal(result, expected) @@ -278,14 +278,14 @@ def test_repr(): ], ) def test_upsample_sum(method, method_args, expected_values): - s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H")) + s = Series(1, index=pd.date_range("2017", periods=2, freq="H")) resampled = s.resample("30T") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], freq="30T", ) result = methodcaller(method, **method_args)(resampled) - expected = pd.Series(expected_values, index=index) + expected = Series(expected_values, index=index) tm.assert_series_equal(result, expected) @@ -293,7 +293,7 @@ def test_groupby_resample_interpolate(): # GH 35325 d = {"price": [10, 11, 9], "volume": [50, 60, 50]} - df = pd.DataFrame(d) + df = DataFrame(d) df["week_starting"] = pd.date_range("01/01/2018", periods=3, freq="W") @@ -324,7 +324,7 @@ def test_groupby_resample_interpolate(): ], names=["volume", "week_starting"], ) - expected = pd.DataFrame( + expected = DataFrame( data={ "price": [ 10.0, diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 3fa85e62d028c..4783d806f8023 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -144,7 +144,7 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): # GH 33498 # check that the timedelta bins does not contains an extra bin idx = pd.timedelta_range(start=start, end=end, freq=freq) - s = pd.Series(np.arange(len(idx)), index=idx) + s = Series(np.arange(len(idx)), index=idx) result = s.resample(resample_freq).min() expected_index = pd.timedelta_range(freq=resample_freq, start=start, end=end) tm.assert_index_equal(result.index, expected_index) @@ -154,13 +154,13 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): def test_resample_with_timedelta_yields_no_empty_groups(): # GH 10603 - df = pd.DataFrame( + df = DataFrame( np.random.normal(size=(10000, 4)), index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), ) result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) - expected = pd.DataFrame( + expected = DataFrame( [[768.0] * 4] * 12 + [[528.0] * 4], index=pd.timedelta_range(start="1s", periods=13, freq="3s"), ) diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/concat/conftest.py b/pandas/tests/reshape/concat/conftest.py new file mode 100644 index 0000000000000..62b8c59ba8855 --- /dev/null +++ b/pandas/tests/reshape/concat/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat 
and DataFrame.append.""" + return request.param diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py new file mode 100644 index 0000000000000..ffeda703cd890 --- /dev/null +++ b/pandas/tests/reshape/concat/test_append.py @@ -0,0 +1,377 @@ +import datetime as dt +from datetime import datetime +from itertools import combinations + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, concat, isna +import pandas._testing as tm + + +class TestAppend: + def test_append(self, sort, float_frame): + mixed_frame = float_frame.copy() + mixed_frame["foo"] = "bar" + + begin_index = float_frame.index[:5] + end_index = float_frame.index[5:] + + begin_frame = float_frame.reindex(begin_index) + end_frame = float_frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + tm.assert_almost_equal(appended["A"], float_frame["A"]) + + del end_frame["A"] + partial_appended = begin_frame.append(end_frame, sort=sort) + assert "A" in partial_appended + + partial_appended = end_frame.append(begin_frame, sort=sort) + assert "A" in partial_appended + + # mixed type handling + appended = mixed_frame[:5].append(mixed_frame[5:]) + tm.assert_frame_equal(appended, mixed_frame) + + # what to test here + mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) + mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) + + # all equal except 'foo' column + tm.assert_frame_equal( + mixed_appended.reindex(columns=["A", "B", "C", "D"]), + mixed_appended2.reindex(columns=["A", "B", "C", "D"]), + ) + + def test_append_empty(self, float_frame): + empty = DataFrame() + + appended = float_frame.append(empty) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + appended = empty.append(float_frame) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + def test_append_overlap_raises(self, float_frame): + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + float_frame.append(float_frame, verify_integrity=True) + + def test_append_new_columns(self): + # see gh-6129: new columns + df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) + row = Series([5, 6, 7], index=["a", "b", "c"], name="z") + expected = DataFrame( + { + "a": {"x": 1, "y": 2, "z": 5}, + "b": {"x": 3, "y": 4, "z": 6}, + "c": {"z": 7}, + } + ) + result = df.append(row) + tm.assert_frame_equal(result, expected) + + def test_append_length0_frame(self, sort): + df = DataFrame(columns=["A", "B", "C"]) + df3 = DataFrame(index=[0, 1], columns=["A", "B"]) + df5 = df.append(df3, sort=sort) + + expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) + tm.assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] + + arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + tm.assert_frame_equal(result, expected) + + # rewrite sort fixture, since we also want to test default of None + def test_append_sorts(self, sort): + df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) + + with tm.assert_produces_warning(None): + result = 
df1.append(df2, sort=sort) + + # for None / True + expected = DataFrame( + {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, + columns=["a", "b", "c"], + ) + if sort is False: + expected = expected[["b", "a", "c"]] + tm.assert_frame_equal(result, expected) + + def test_append_different_columns(self, sort): + df = DataFrame( + { + "bools": np.random.randn(10) > 0, + "ints": np.random.randint(0, 10, 10), + "floats": np.random.randn(10), + "strings": ["foo", "bar"] * 5, + } + ) + + a = df[:5].loc[:, ["bools", "ints", "floats"]] + b = df[5:].loc[:, ["strings", "ints", "floats"]] + + appended = a.append(b, sort=sort) + assert isna(appended["strings"][0:4]).all() + assert isna(appended["bools"][5:]).all() + + def test_append_many(self, sort, float_frame): + chunks = [ + float_frame[:5], + float_frame[5:10], + float_frame[10:15], + float_frame[15:], + ] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, float_frame) + + chunks[-1] = chunks[-1].copy() + chunks[-1]["foo"] = "bar" + result = chunks[0].append(chunks[1:], sort=sort) + tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) + assert (result["foo"][15:] == "bar").all() + assert result["foo"][:15].isna().all() + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(columns=["A", "B", "C"]) + df1 = df1.set_index(["A"]) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) + df2 = df2.set_index(["A"]) + + result = df1.append(df2) + assert result.index.name == "A" + + indexes_can_append = [ + pd.RangeIndex(3), + Index([4, 5, 6]), + Index([4.5, 5.5, 6.5]), + Index(list("abc")), + pd.CategoricalIndex("A B C".split()), + pd.CategoricalIndex("D E F".split(), ordered=True), + pd.IntervalIndex.from_breaks([7, 8, 9, 10]), + pd.DatetimeIndex( + [ + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12), + ] + ), + ] + + indexes_cannot_append_with_other = [ + pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) + ] + + all_indexes = indexes_can_append + indexes_cannot_append_with_other + + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) + def test_append_same_columns_type(self, index): + # GH18359 + + # df wider than ser + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + ser_index = index[:2] + ser = Series([7, 8], index=ser_index, name=2) + result = df.append(ser) + expected = DataFrame( + [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + ) + tm.assert_frame_equal(result, expected) + + # ser wider than df + ser_index = index + index = index[:2] + df = DataFrame([[1, 2], [4, 5]], columns=index) + ser = Series([7, 8, 9], index=ser_index, name=2) + result = df.append(ser) + expected = DataFrame( + [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types(self, df_columns, series_index): + # GH18359 + # See also test 'test_append_different_columns_types_raises' below + # for errors raised when appending + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) + ser = Series([7, 8, 9], index=series_index, name=2) + + result = df.append(ser) + idx_diff = ser.index.difference(df_columns) + combined_columns = Index(df_columns.tolist()).append(idx_diff) + expected = DataFrame( + [ 
+ [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9], + ], + index=[0, 1, 2], + columns=combined_columns, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "index_cannot_append_with_other", + indexes_cannot_append_with_other, + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types_raises( + self, index_can_append, index_cannot_append_with_other + ): + # GH18359 + # Dataframe.append will raise if MultiIndex appends + # or is appended to a different index type + # + # See also test 'test_append_different_columns_types' above for + # appending without raising. + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) + ser = Series([7, 8, 9], index=index_cannot_append_with_other, name=2) + msg = ( + r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|" + ) + with pytest.raises(TypeError, match=msg): + df.append(ser) + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) + ser = Series([7, 8, 9], index=index_can_append, name=2) + + with pytest.raises(TypeError, match=msg): + df.append(ser) + + def test_append_dtype_coerce(self, sort): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + + df1 = DataFrame( + index=[1, 2], + data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], + columns=["start_time"], + ) + df2 = DataFrame( + index=[4, 5], + data=[ + [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], + ], + columns=["start_time", "end_time"], + ) + + expected = concat( + [ + Series( + [ + pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10), + ], + name="end_time", + ), + Series( + [ + dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0), + ], + name="start_time", + ), + ], + axis=1, + sort=sort, + ) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[["end_time", "start_time"]] + else: + expected = expected[["start_time", "end_time"]] + + tm.assert_frame_equal(result, expected) + + def test_append_missing_column_proper_upcast(self, sort): + df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) + df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) + + appended = df1.append(df2, ignore_index=True, sort=sort) + assert appended["A"].dtype == "f8" + assert appended["B"].dtype == "O" + + def test_append_empty_frame_to_series_with_dateutil_tz(self): + # GH 23682 + date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) + s = Series({"date": date, "a": 1.0, "b": 2.0}) + df = DataFrame(columns=["c", "d"]) + result_a = df.append(s, ignore_index=True) + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] + ) + # These columns get cast to object after append + expected["c"] = expected["c"].astype(object) + expected["d"] = expected["d"].astype(object) + tm.assert_frame_equal(result_a, expected) + + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] + ) + expected["c"] = expected["c"].astype(object) + expected["d"] = 
expected["d"].astype(object) + + result_b = result_a.append(s, ignore_index=True) + tm.assert_frame_equal(result_b, expected) + + # column order is different + expected = expected[["c", "d", "date", "a", "b"]] + result = df.append([s, s], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_append_empty_tz_frame_with_datetime64ns(self): + # https://github.com/pandas-dev/pandas/issues/35460 + df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + + # pd.NaT gets inferred as tz-naive, so append result is tz-naive + result = df.append({"a": pd.NaT}, ignore_index=True) + expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # also test with typed value to append + df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + result = df.append( + Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True + ) + expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py new file mode 100644 index 0000000000000..395673e9a47ab --- /dev/null +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -0,0 +1,750 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series +import pandas._testing as tm + + +class TestConcatAppendCommon: + """ + Test common dtype coercion rules between concat and append. + """ + + def setup_method(self, method): + + dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ] + tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ] + + td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ] + + period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + self.data = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": pd.Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, + } + + def _check_expected_dtype(self, obj, label): + """ + Check whether obj has expected dtype depending on label + considering not-supported dtypes + """ + if isinstance(obj, Index): + if label == "bool": + assert obj.dtype == "object" + else: + assert obj.dtype == label + elif isinstance(obj, Series): + if label.startswith("period"): + assert obj.dtype == "Period[M]" + else: + assert obj.dtype == label + else: + raise ValueError + + def test_dtypes(self): + # to confirm test case covers intended dtypes + for typ, vals in self.data.items(): + self._check_expected_dtype(Index(vals), typ) + self._check_expected_dtype(Series(vals), typ) + + def test_concatlike_same_dtypes(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + + vals2 = vals1 + vals3 = vals1 + + if typ1 == "category": + exp_data = pd.Categorical(list(vals1) + list(vals2)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = 
Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="y") + res = i1.append(i2) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="x") + res = i1.append(i2) + exp = Index(exp_data, name="x") + tm.assert_index_equal(res, exp) + + # cannot append non-index + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append(vals2) + + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append([Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append( + [Series(vals2), Series(vals3)], ignore_index=True + ) + exp = Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="y") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="x") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data, name="x") + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = ( + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + Series(vals1).append(vals2) + + with pytest.raises(TypeError, match=msg): + Series(vals1).append([Series(vals2), vals3]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), vals2]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + for typ2, vals2 in self.data.items(): + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + continue + elif typ1 == "category" or typ2 == "category": + # TODO: suspicious + continue + + # specify expected dtype + if typ1 == "bool" and typ2 in ("int64", "float64"): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == "bool" and typ1 in ("int64", "float64"): + exp_series_dtype = typ1 + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data, dtype=exp_index_dtype) + 
tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append( + [Series(vals2), Series(vals3)], ignore_index=True + ) + exp = Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + def test_concatlike_common_coerce_to_pandas_object(self): + # GH 13626 + # result must be Timestamp/Timedelta, not datetime.datetime/timedelta + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + + exp = Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ] + ) + + res = dti.append(tdi) + tm.assert_index_equal(res, exp) + assert isinstance(res[0], pd.Timestamp) + assert isinstance(res[-1], pd.Timedelta) + + dts = Series(dti) + tds = Series(tdi) + res = dts.append(tds) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + res = pd.concat([dts, tds]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + def test_concatlike_datetimetz(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 7795 + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz + ) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts2 = Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) + def test_concatlike_datetimetz_short(self, tz): + # GH#7795 + ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) + ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) + df1 = DataFrame(0, index=ix1, columns=["A", "B"]) + df2 = DataFrame(0, index=ix2, columns=["A", "B"]) + + exp_idx = pd.DatetimeIndex( + ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], + tz=tz, + ) + exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) + + tm.assert_frame_equal(df1.append(df2), exp) + tm.assert_frame_equal(pd.concat([df1, df2]), exp) + + def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 13660 + + # different tz coerces to object + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) + + exp = Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + ], + dtype=object, + ) + + res = 
dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts2 = Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + # different tz + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + + exp = Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01", tz="US/Pacific"), + pd.Timestamp("2012-01-02", tz="US/Pacific"), + ], + dtype=object, + ) + + res = dti1.append(dti3) + # tm.assert_index_equal(res, exp) + + dts1 = Series(dti1) + dts3 = Series(dti3) + res = dts1.append(dts3) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts3]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period(self): + # GH 13660 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") + + exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + ps2 = Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_diff_freq_to_object(self): + # GH 13221 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") + + exp = Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + dtype=object, + ) + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + ps2 = Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_mixed_dt_to_object(self): + # GH 13221 + # different datetimelike + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + exp = Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ], + dtype=object, + ) + + res = pi1.append(tdi) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + tds = Series(tdi) + res = ps1.append(tds) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, tds]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + # inverse + exp = Index( + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + ], + dtype=object, + ) + + res = tdi.append(pi1) + tm.assert_index_equal(res, exp) + + ps1 = Series(pi1) + tds = Series(tdi) + res = tds.append(ps1) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([tds, ps1]) + tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) + + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2], dtype="category") + + exp = Series([1, 2, np.nan, 2, 1, 2], dtype="category") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + 
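# Illustrative sketch (editorial addition): s1 and s2 share the same
# categories, so the categorical dtype survives the concatenation.
assert pd.concat([s1, s2], ignore_index=True).dtype == "category"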
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = Series([3, 2], dtype="category") + s2 = Series([2, 1], dtype="category") + + exp = Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completely different categories (same dtype) => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series([np.nan, 1, 3, 2], dtype="category") + + exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + a = Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) + b = Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) + result = pd.concat([a, b], ignore_index=True) + expected = Series( + Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) + ) + tm.assert_series_equal(result, expected) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2]) + + exp = Series([1, 2, np.nan, 2, 1, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = Series([2, 1, 2, 1, 2, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all values are not in category => not-category + s1 = Series([3, 2], dtype="category") + s2 = Series([2, 1]) + + exp = Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completely different categories => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series([1, 3, 2]) + + exp = Series([10, 11, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([1, 3, 2, 10, 11, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # different dtype => not-category + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series(["a", "b", "c"]) + + exp = Series([10, 11, np.nan, "a", "b", "c"]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series(["a", "b", "c", 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = Series([10, 11], dtype="category") + s2 = Series([np.nan, np.nan, np.nan]) + + exp = Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + 
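# Editorial note (illustrative): s2 holds only missing-value markers but is
# not categorical, so the categorical dtype is dropped and the result takes
# the inferred float dtype.
assert pd.concat([s1, s2], ignore_index=True).dtype == np.float64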
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = Series([1, 2, np.nan], dtype="category") + s2 = Series([2, 1, 2], dtype="category") + s3 = Series([1, 2, 1, 2, np.nan]) + + exp = Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = Series([4, 5, 6], dtype="category") + s2 = Series([1, 2, 3], dtype="category") + s3 = Series([1, 3, 4]) + + exp = Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = Series([4, 5, 6], dtype="category") + s2 = Series([1, 2, 3], dtype="category") + s3 = Series([10, 11, 12]) + + exp = Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = Series([1, 3], dtype="category") + s2 = Series([3, 4], dtype="category") + s3 = Series([2, 3]) + s4 = Series([2, 2], dtype="category") + s5 = Series([1, np.nan]) + s6 = Series([1, 3, 2], dtype="category") + + # mixed dtype, values are all in categories => not-category + exp = Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = Series( + pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) + ) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 
13524 + + # some edge cases + # category + not-category => not category + s1 = Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") + s2 = Series([np.nan, 1]) + + exp = Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = Series([1, np.nan], dtype="category") + s2 = Series([np.nan, np.nan]) + + exp = Series([1, np.nan, np.nan, np.nan], dtype="float") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = Series([np.nan, np.nan], dtype="category") + s2 = Series([np.nan, np.nan]) + + exp = Series([np.nan, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all category nan-likes => category + s1 = Series([np.nan, np.nan], dtype="category") + s2 = Series([np.nan, np.nan], dtype="category") + + exp = Series([np.nan, np.nan, np.nan, np.nan], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = Series([], dtype="category") + s2 = Series([1, 2], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([], dtype="object") + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = Series([], dtype="category") + s2 = Series([np.nan, np.nan]) + + # empty Series is ignored + exp = Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_categorical_concat_append(self): + cat = Categorical(["a", "b"], categories=["a", "b"]) + vals = [1, 2] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) + vals2 = [1, 2, 1, 2] + exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) + + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) + + # GH 13524 can concat different categories + cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) + vals3 = [1, 2] + df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) + + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = DataFrame({"cats": list("abab"), "vals": 
[1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) + + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py new file mode 100644 index 0000000000000..388575c5a3b86 --- /dev/null +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -0,0 +1,195 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Series +import pandas._testing as tm + + +class TestCategoricalConcat: + def test_categorical_concat(self, sort): + # See GH 10177 + df1 = DataFrame( + np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] + ) + + df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2["h"] = Series(Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) + exp = DataFrame( + { + "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + "b": [ + 1, + 4, + 7, + 10, + 13, + 16, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + "h": [None] * 6 + cat_values, + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_concat_dtypes(self): + + # GH8143 + index = ["cat", "obj", "num"] + cat = Categorical(["a", "b", "c"]) + obj = Series(["a", "b", "c"]) + num = Series([1, 2, 3]) + df = pd.concat([Series(cat), obj, num], axis=1, keys=index) + + result = df.dtypes == "object" + expected = Series([False, True, False], index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == "int64" + expected = Series([False, False, True], index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == "category" + expected = Series([True, False, False], index=index) + tm.assert_series_equal(result, expected) + + def test_concat_categoricalindex(self): + # GH 16111, categories that aren't lexsorted + categories = [9, 0, 1, 2, 3] + + a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) + + result = pd.concat([a, b, c], axis=1) + + exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) + exp = DataFrame( + { + 0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3], + }, + columns=[0, 1, 2], + index=exp_idx, + ) + tm.assert_frame_equal(result, exp) + + def test_categorical_concat_preserve(self): + + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories + s = Series(list("abc"), dtype="category") + s2 = Series(list("abd"), dtype="category") + + exp = Series(list("abcabd")) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), dtype="category") + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) + + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) + res = pd.concat([df2, df2]) + exp = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, 
b]).astype(CategoricalDtype(list("cab"))), + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame( + {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} + ).set_index("B") + result = pd.concat([df2, df2]) + expected = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + tm.assert_frame_equal(result, expected) + + # wrong categories + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") + msg = "categories must match existing categories when appending" + with pytest.raises(TypeError, match=msg): + pd.concat([df2, df3]) + + def test_concat_categorical_tz(self): + # GH-23816 + a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) + b = Series(["a", "b"], dtype="category") + result = pd.concat([a, b], ignore_index=True) + expected = Series( + [ + pd.Timestamp("2017-01-01", tz="US/Pacific"), + pd.Timestamp("2017-01-02", tz="US/Pacific"), + "a", + "b", + ] + ) + tm.assert_series_equal(result, expected) + + def test_concat_categorical_unchanged(self): + # GH-12007 + # test fix for when concat on categorical and float + # coerces dtype categorical -> float + df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A")) + ser = Series([0, 1, 2], index=[0, 1, 3], name="B") + result = pd.concat([df, ser], axis=1) + expected = DataFrame( + { + "A": Series(["a", "b", "c", np.nan], dtype="category"), + "B": Series([0, 1, np.nan, 2], dtype="float"), + } + ) + tm.assert_equal(result, expected) + + def test_categorical_concat_gh7864(self): + # GH 7864 + # make sure ordering is preserved + df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) + df["grade"] = Categorical(df["raw_grade"]) + df["grade"] = df["grade"].cat.set_categories(["e", "a", "b"]) + + df1 = df[0:3] + df2 = df[3:] + + tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) + + dfx = pd.concat([df1, df2]) + tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) + + dfa = df1.append(df2) + tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py new file mode 100644 index 0000000000000..90172abefb8c4 --- /dev/null +++ b/pandas/tests/reshape/concat/test_concat.py @@ -0,0 +1,558 @@ +from collections import abc, deque +from decimal import Decimal +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +import pandas._testing as tm +from pandas.core.arrays import SparseArray +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.tests.extension.decimal import to_decimal + + +class TestConcatenate: + def test_concat_copy(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) + df3 = DataFrame({5: "foo"}, index=range(4)) + + # These are actual copies. + result = concat([df, df2, df3], axis=1, copy=True) + + for b in result._mgr.blocks: + assert b.values.base is None + + # These are the same.
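# Editorial note (a reading of the assertions below, not authoritative):
# with copy=False, blocks that can pass through without consolidation keep
# the input frames' buffers as their .values.base; the object block is
# still rewrapped, and consolidation (the df4 case further down) forces a
# fresh copy again.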
+ result = concat([df, df2, df3], axis=1, copy=False) + + for b in result._mgr.blocks: + if b.is_float: + assert b.values.base is df._mgr.blocks[0].values.base + elif b.is_integer: + assert b.values.base is df2._mgr.blocks[0].values.base + elif b.is_object: + assert b.values.base is not None + + # Float block was consolidated. + df4 = DataFrame(np.random.randn(4, 1)) + result = concat([df, df2, df3, df4], axis=1, copy=False) + for b in result._mgr.blocks: + if b.is_float: + assert b.values.base is None + elif b.is_integer: + assert b.values.base is df2._mgr.blocks[0].values.base + elif b.is_object: + assert b.values.base is not None + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] + ) + expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] + level = ["three", "two", "one", "zero"] + result = concat( + pieces, + axis=1, + keys=["one", "two", "three"], + levels=[level], + names=["group_key"], + ) + + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) + tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) + + assert result.columns.names == ["group_key", None] + + @pytest.mark.parametrize("mapping", ["mapping", "dict"]) + def test_concat_mapping(self, mapping, non_dict_mapping_subclass): + constructor = dict if mapping == "dict" else non_dict_mapping_subclass + frames = constructor( + { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } + ) + + sorted_keys = list(frames.keys()) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) + tm.assert_frame_equal(result, expected) + + keys = ["baz", "foo", "bar"] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_and_levels(self): + df = DataFrame(np.random.randn(1, 3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [["foo", "baz"], ["one", "two"]] + names = ["first", "second"] + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + names=names, 
+ ) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex( + levels=levels + [[0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], + names=names + [None], + ) + expected.index = exp_index + + tm.assert_frame_equal(result, expected) + + # no names + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + ) + assert result.index.names == (None,) * 3 + + # no levels + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + names=["first", "second"], + ) + assert result.index.names == ("first", "second", None) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + + msg = "Values not found in passed level" + with pytest.raises(ValueError, match=msg): + concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) + + msg = "Key one not in level" + with pytest.raises(ValueError, match=msg): + concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) + + def test_crossed_dtypes_weird_corner(self): + columns = ["A", "B", "C", "D"] + df1 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="f8"), + "B": np.array([1, 2, 3, 4], dtype="i8"), + "C": np.array([1, 2, 3, 4], dtype="f8"), + "D": np.array([1, 2, 3, 4], dtype="i8"), + }, + columns=columns, + ) + + df2 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="i8"), + "B": np.array([1, 2, 3, 4], dtype="f8"), + "C": np.array([1, 2, 3, 4], dtype="i8"), + "D": np.array([1, 2, 3, 4], dtype="f8"), + }, + columns=columns, + ) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame( + np.concatenate([df1.values, df2.values], axis=0), columns=columns + ) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) + assert result.index.names == ("first", "second") + + def test_with_mixed_tuples(self, sort): + # 10697 + # columns have mixed tuples, so handle properly + df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) + df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) + + # it works + concat([df1, df2], sort=sort) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index = date_range("01-Jan-2013", periods=10, freq="H") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] + ) + result = concat([df, df], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] + ) + result = concat([s1, s2], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) + result = concat([s1, s2, s1], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] + ) + result = concat([s1, df, s2, s2, s1], axis=1) + tm.assert_frame_equal(result, expected) + + # with names + s1.name = "foo" + expected = DataFrame( + 
np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] + ) + result = concat([s1, df, s2], axis=1) + tm.assert_frame_equal(result, expected) + + s2.name = "bar" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] + ) + result = concat([s1, df, s2], axis=1) + tm.assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) + result = concat([s1, df, s2], axis=1, ignore_index=True) + tm.assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame( + np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] + ) + result = concat([s1, df, s2]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) + result = concat([s1, df, s2], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_dtype_coerceion(self): + + # 12411 + df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) + + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + # 12045 + import datetime + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + # 11594 + df = DataFrame({"text": ["some words"] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=["foo"]) + expected = concat([df, df], keys=["foo", "bar"]) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + with pytest.raises(ValueError, match="All objects passed were None"): + concat([None, None]) + + def test_concat_keys_with_none(self): + # #1649 + df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) + + result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) + expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) + tm.assert_frame_equal(result, expected) + + result = concat( + [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] + ) + expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) + tm.assert_frame_equal(result, expected) + + def test_concat_bug_1719(self): + ts1 = tm.makeTimeSeries() + ts2 = tm.makeTimeSeries()[::2] + + # to join with union + # these two are of different length! 
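# Editorial note: with axis=1 and join="outer", every input is reindexed to
# the union of the row labels, so the shorter ts2 is NaN-padded and both
# argument orders yield frames of equal length.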
+ left = concat([ts1, ts2], join="outer", axis=1) + right = concat([ts2, ts1], join="outer", axis=1) + + assert len(left) == len(right) + + def test_concat_bug_2972(self): + ts0 = Series(np.zeros(5)) + ts1 = Series(np.ones(5)) + ts0.name = ts1.name = "same name" + result = concat([ts0, ts1], axis=1) + + expected = DataFrame({0: ts0, 1: ts1}) + expected.columns = ["same name", "same name"] + tm.assert_frame_equal(result, expected) + + def test_concat_bug_3602(self): + + # GH 3602, duplicate columns + df1 = DataFrame( + { + "firmNo": [0, 0, 0, 0], + "prc": [6, 6, 6, 6], + "stringvar": ["rrr", "rrr", "rrr", "rrr"], + } + ) + df2 = DataFrame( + {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} + ) + expected = DataFrame( + [ + [0, 6, "rrr", 9, 1, 6], + [0, 6, "rrr", 10, 2, 6], + [0, 6, "rrr", 11, 3, 6], + [0, 6, "rrr", 12, 4, 6], + ] + ) + expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] + + result = concat([df1, df2], axis=1) + tm.assert_frame_equal(result, expected) + + def test_concat_iterables(self): + # GH8645 check concat works with tuples, list, generators, and weird + # stuff like deque and custom iterables + df1 = DataFrame([1, 2, 3]) + df2 = DataFrame([4, 5, 6]) + expected = DataFrame([1, 2, 3, 4, 5, 6]) + tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected) + tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected) + tm.assert_frame_equal( + concat((df for df in (df1, df2)), ignore_index=True), expected + ) + tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) + + class CustomIterator1: + def __len__(self) -> int: + return 2 + + def __getitem__(self, index): + try: + return {0: df1, 1: df2}[index] + except KeyError as err: + raise IndexError from err + + tm.assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) + + class CustomIterator2(abc.Iterable): + def __iter__(self): + yield df1 + yield df2 + + tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) + + def test_concat_order(self): + # GH 17344 + dfs = [DataFrame(index=range(3), columns=["a", 1, None])] + dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100)] + + result = pd.concat(dfs, sort=True).columns + expected = dfs[0].columns + tm.assert_index_equal(result, expected) + + def test_concat_different_extension_dtypes_upcasts(self): + a = Series(pd.core.arrays.integer_array([1, 2])) + b = Series(to_decimal([1, 2])) + + result = pd.concat([a, b], ignore_index=True) + expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object) + tm.assert_series_equal(result, expected) + + def test_concat_ordered_dict(self): + # GH 21510 + expected = pd.concat( + [Series(range(3)), Series(range(4))], keys=["First", "Another"] + ) + result = pd.concat( + dict([("First", Series(range(3))), ("Another", Series(range(4)))]) + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pdt", [Series, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["float"]) +def test_concat_no_unnecessary_upcast(dt, pdt): + # GH 13247 + dims = pdt(dtype=object).ndim + + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] + x = pd.concat(dfs) + assert x.values.dtype == dt + + +@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["int"]) +def test_concat_will_upcast(dt, pdt): + with 
catch_warnings(record=True): + dims = pdt().ndim + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] + x = pd.concat(dfs) + assert x.values.dtype == "float64" + + +def test_concat_empty_and_non_empty_frame_regression(): + # GH 18178 regression test + df1 = DataFrame({"foo": [1]}) + df2 = DataFrame({"foo": []}) + expected = DataFrame({"foo": [1.0]}) + result = pd.concat([df1, df2]) + tm.assert_frame_equal(result, expected) + + +def test_concat_sparse(): + # GH 23557 + a = Series(SparseArray([0, 1, 2])) + expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( + pd.SparseDtype(np.int64, 0) + ) + result = pd.concat([a, a], axis=1) + tm.assert_frame_equal(result, expected) + + +def test_concat_dense_sparse(): + # GH 30668 + a = Series(pd.arrays.SparseArray([1, None]), dtype=float) + b = Series([1], dtype=float) + expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype( + pd.SparseDtype(np.float64, None) + ) + result = pd.concat([a, b], axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]]) +def test_duplicate_keys(keys): + # GH 33654 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + s1 = Series([7, 8, 9], name="c") + s2 = Series([10, 11, 12], name="d") + result = concat([df, s1, s2], axis=1, keys=keys) + expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]] + expected_columns = pd.MultiIndex.from_tuples( + [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")] + ) + expected = DataFrame(expected_values, columns=expected_columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A"), + ], +) +def test_concat_preserves_subclass(obj): + # GH28330 -- preserve subclass + + result = concat([obj, obj]) + assert isinstance(result, type(obj)) + + +def test_concat_frame_axis0_extension_dtypes(): + # preserve extension dtype (through common_dtype mechanism) + df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")}) + df2 = DataFrame({"a": np.array([4, 5, 6])}) + + result = pd.concat([df1, df2], ignore_index=True) + expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + result = pd.concat([df2, df1], ignore_index=True) + expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + +def test_concat_preserves_extension_int64_dtype(): + # GH 24768 + df_a = DataFrame({"a": [-1]}, dtype="Int64") + df_b = DataFrame({"b": [1]}, dtype="Int64") + result = pd.concat([df_a, df_b], ignore_index=True) + expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/reshape/concat/test_dataframe.py similarity index 57% rename from pandas/tests/frame/test_combine_concat.py rename to pandas/tests/reshape/concat/test_dataframe.py index 7eba2b873c4f4..295846ee1b264 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -1,17 +1,15 @@ -from datetime import datetime - import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, date_range +from pandas import DataFrame, Index, Series, concat import pandas._testing as tm class TestDataFrameConcat: def 
test_concat_multiple_frames_dtypes(self): - # GH 2759 + # GH#2759 A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) results = pd.concat((A, B), axis=1).dtypes @@ -21,73 +19,12 @@ def test_concat_multiple_frames_dtypes(self): ) tm.assert_series_equal(results, expected) - def test_concat_multiple_tzs(self): - # GH 12467 - # combining datetime tz-aware and naive DataFrames - ts1 = Timestamp("2015-01-01", tz=None) - ts2 = Timestamp("2015-01-01", tz="UTC") - ts3 = Timestamp("2015-01-01", tz="EST") - - df1 = DataFrame(dict(time=[ts1])) - df2 = DataFrame(dict(time=[ts2])) - df3 = DataFrame(dict(time=[ts3])) - - results = pd.concat([df1, df2]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) - tm.assert_frame_equal(results, expected) - - results = pd.concat([df1, df3]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) - tm.assert_frame_equal(results, expected) - - results = pd.concat([df2, df3]).reset_index(drop=True) - expected = DataFrame(dict(time=[ts2, ts3])) - tm.assert_frame_equal(results, expected) - - @pytest.mark.parametrize( - "t1", - [ - "2015-01-01", - pytest.param( - pd.NaT, - marks=pytest.mark.xfail( - reason="GH23037 incorrect dtype when concatenating" - ), - ), - ], - ) - def test_concat_tz_NaT(self, t1): - # GH 22796 - # Concating tz-aware multicolumn DataFrames - ts1 = Timestamp(t1, tz="UTC") - ts2 = Timestamp("2015-01-01", tz="UTC") - ts3 = Timestamp("2015-01-01", tz="UTC") - - df1 = DataFrame([[ts1, ts2]]) - df2 = DataFrame([[ts3]]) - - result = pd.concat([df1, df2]) - expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0]) - - tm.assert_frame_equal(result, expected) - - def test_concat_tz_not_aligned(self): - # GH 22796 - ts = pd.to_datetime([1, 2]).tz_localize("UTC") - a = pd.DataFrame({"A": ts}) - b = pd.DataFrame({"A": ts, "B": ts}) - result = pd.concat([a, b], sort=True, ignore_index=True) - expected = pd.DataFrame( - {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} - ) - tm.assert_frame_equal(result, expected) - def test_concat_tuple_keys(self): - # GH 14438 - df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB")) - df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) + # GH#14438 + df1 = DataFrame(np.ones((2, 2)), columns=list("AB")) + df2 = DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) - expected = pd.DataFrame( + expected = DataFrame( { "A": { ("bee", "bah", 0): 1.0, @@ -108,11 +45,11 @@ def test_concat_tuple_keys(self): tm.assert_frame_equal(results, expected) def test_concat_named_keys(self): - # GH 14252 - df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) + # GH#14252 + df = DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) index = Index(["a", "b"], name="baz") concatted_named_from_keys = pd.concat([df, df], keys=index) - expected_named = pd.DataFrame( + expected_named = DataFrame( {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), ) @@ -125,19 +62,19 @@ def test_concat_named_keys(self): tm.assert_frame_equal(concatted_named_from_names, expected_named) concatted_unnamed = pd.concat([df, df], keys=index_no_name) - expected_unnamed = pd.DataFrame( + expected_unnamed = DataFrame( {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), ) 
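# Editorial note: when the keys Index carries no name, the outer level of
# the resulting MultiIndex is unnamed as well (names=[None, None] above).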
tm.assert_frame_equal(concatted_unnamed, expected_unnamed) def test_concat_axis_parameter(self): - # GH 14369 - df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2)) - df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2)) + # GH#14369 + df1 = DataFrame({"A": [0.1, 0.2]}, index=range(2)) + df2 = DataFrame({"A": [0.3, 0.4]}, index=range(2)) # Index/row/0 DataFrame - expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) + expected_index = DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) concatted_index = pd.concat([df1, df2], axis="index") tm.assert_frame_equal(concatted_index, expected_index) @@ -149,7 +86,7 @@ def test_concat_axis_parameter(self): tm.assert_frame_equal(concatted_0, expected_index) # Columns/1 DataFrame - expected_columns = pd.DataFrame( + expected_columns = DataFrame( [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] ) @@ -159,11 +96,11 @@ def test_concat_axis_parameter(self): concatted_1 = pd.concat([df1, df2], axis=1) tm.assert_frame_equal(concatted_1, expected_columns) - series1 = pd.Series([0.1, 0.2]) - series2 = pd.Series([0.3, 0.4]) + series1 = Series([0.1, 0.2]) + series2 = Series([0.3, 0.4]) # Index/row/0 Series - expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) + expected_index_series = Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) concatted_index_series = pd.concat([series1, series2], axis="index") tm.assert_series_equal(concatted_index_series, expected_index_series) @@ -175,7 +112,7 @@ def test_concat_axis_parameter(self): tm.assert_series_equal(concatted_0_series, expected_index_series) # Columns/1 Series - expected_columns_series = pd.DataFrame( + expected_columns_series = DataFrame( [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] ) @@ -190,8 +127,8 @@ def test_concat_axis_parameter(self): pd.concat([series1, series2], axis="something") def test_concat_numerical_names(self): - # #15262 # #12223 - df = pd.DataFrame( + # GH#15262, GH#12223 + df = DataFrame( {"col": range(9)}, dtype="int32", index=( @@ -201,7 +138,7 @@ def test_concat_numerical_names(self): ), ) result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) - expected = pd.DataFrame( + expected = DataFrame( {"col": [0, 1, 7, 8]}, dtype="int32", index=pd.MultiIndex.from_tuples( @@ -211,26 +148,22 @@ def test_concat_numerical_names(self): tm.assert_frame_equal(result, expected) def test_concat_astype_dup_col(self): - # gh 23049 - df = pd.DataFrame([{"a": "b"}]) + # GH#23049 + df = DataFrame([{"a": "b"}]) df = pd.concat([df, df], axis=1) result = df.astype("category") - expected = pd.DataFrame( + expected = DataFrame( np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] ).astype("category") tm.assert_frame_equal(result, expected) - def test_concat_datetime_datetime64_frame(self): - # #2624 - rows = [] - rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), "hi"]) - - df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) - - ind = date_range(start="2000/1/1", freq="D", periods=10) - df1 = DataFrame({"date": ind, "test": range(10)}) + def test_concat_dataframe_keys_bug(self, sort): + t1 = DataFrame( + {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} + ) + t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) - # it works! 
- pd.concat([df1, df2_obj]) + # it works + result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) + assert list(result.columns) == [("t1", "value"), ("t2", "value")] diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py new file mode 100644 index 0000000000000..8783f539faa65 --- /dev/null +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -0,0 +1,525 @@ +import datetime as dt +from datetime import datetime + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + to_timedelta, +) +import pandas._testing as tm + + +class TestDatetimeConcat: + def test_concat_datetime64_block(self): + from pandas.core.indexes.datetimes import date_range + + rng = date_range("1/1/2000", periods=10) + + df = DataFrame({"time": rng}) + + result = concat([df, df]) + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() + + def test_concat_datetime_datetime64_frame(self): + # GH#2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), "hi"]) + + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({"date": ind, "test": range(10)}) + + # it works! + pd.concat([df1, df2_obj]) + + def test_concat_datetime_timezone(self): + # GH 18523 + idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") + df1 = DataFrame({"a": [1, 2, 3]}, index=idx1) + df2 = DataFrame({"b": [1, 2, 3]}, index=idx2) + result = pd.concat([df1, df2], axis=1) + + exp_idx = ( + DatetimeIndex( + [ + "2011-01-01 00:00:00+01:00", + "2011-01-01 01:00:00+01:00", + "2011-01-01 02:00:00+01:00", + ], + freq="H", + ) + .tz_convert("UTC") + .tz_convert("Europe/Paris") + ) + + expected = DataFrame( + [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] + ) + + tm.assert_frame_equal(result, expected) + + idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) + result = pd.concat([df1, df3], axis=1) + + exp_idx = DatetimeIndex( + [ + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", + ] + ) + + expected = DataFrame( + [ + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], + ], + index=exp_idx, + columns=["a", "b"], + ) + + tm.assert_frame_equal(result, expected) + + # GH 13783: Concat after resample + result = pd.concat( + [df1.resample("H").mean(), df2.resample("H").mean()], sort=True + ) + expected = DataFrame( + {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, + index=idx1.append(idx1), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_datetimeindex_freq(self): + # GH 3232 + # Monotonic index result + dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") + data = list(range(100)) + expected = DataFrame(data, index=dr) + result = pd.concat([expected[:50], expected[50:]]) + tm.assert_frame_equal(result, expected) + + # Non-monotonic index result + result = pd.concat([expected[50:], expected[:50]]) + expected = DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) + expected.index._data.freq = None + 
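# Editorial note: concatenating the pieces out of order produces a
# non-monotonic DatetimeIndex, which cannot carry the original 50ms freq;
# that is why the expected index freq is reset to None above.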
tm.assert_frame_equal(result, expected) + + def test_concat_multiindex_datetime_object_index(self): + # https://github.com/pandas-dev/pandas/issues/11058 + s = Series( + ["a", "b"], + index=MultiIndex.from_arrays( + [ + [1, 2], + Index([dt.date(2013, 1, 1), dt.date(2014, 1, 1)], dtype="object"), + ], + names=["first", "second"], + ), + ) + s2 = Series( + ["a", "b"], + index=MultiIndex.from_arrays( + [ + [1, 2], + Index([dt.date(2013, 1, 1), dt.date(2015, 1, 1)], dtype="object"), + ], + names=["first", "second"], + ), + ) + expected = DataFrame( + [["a", "a"], ["b", np.nan], [np.nan, "b"]], + index=MultiIndex.from_arrays( + [ + [1, 2, 2], + DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01"], + dtype="datetime64[ns]", + freq=None, + ), + ], + names=["first", "second"], + ), + ) + result = concat([s, s2], axis=1) + tm.assert_frame_equal(result, expected) + + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. + x = Series( + date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") + ) + y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) + y = Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) + y[:] = pd.NaT + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns]") + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_concat_NaT_dataframes(self, tz): + # GH 12396 + + first = DataFrame([[pd.NaT], [pd.NaT]]) + first = first.apply(lambda x: x.dt.tz_localize(tz)) + second = DataFrame( + [[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], + index=[2, 3], + ) + expected = DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz), + pd.Timestamp("2016/01/01", tz=tz), + ] + ) + + result = pd.concat([first, second], axis=0) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) + def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): + # GH 12396 + + # tz-naive + first = DataFrame([[pd.NaT], [pd.NaT]]).apply(lambda x: x.dt.tz_localize(tz1)) + second = DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) + + result = pd.concat([first, second], axis=0) + expected = DataFrame(Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) + if tz1 != tz2: + expected = expected.astype(object) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): + # GH 12396 + + first = DataFrame(Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) + second = DataFrame(Series([pd.NaT]).dt.tz_localize(tz2), 
columns=[1]) + expected = DataFrame( + { + 0: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), + 1: Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), + } + ) + result = pd.concat([first, second], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): + # GH 12396 + + # tz-naive + first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) + second = DataFrame( + [ + [pd.Timestamp("2015/01/01", tz=tz2)], + [pd.Timestamp("2016/01/01", tz=tz2)], + ], + index=[2, 3], + ) + + expected = DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz2), + pd.Timestamp("2016/01/01", tz=tz2), + ] + ) + if tz1 != tz2: + expected = expected.astype(object) + + result = pd.concat([first, second]) + tm.assert_frame_equal(result, expected) + + +class TestTimezoneConcat: + def test_concat_tz_series(self): + # gh-11755: tz and no tz + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(date_range("2012-01-01", "2012-01-02")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # gh-11887: concat tz and object + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(["a", "b"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # see gh-12217 and gh-12306 + # Concatenating two UTC times + first = DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize("UTC") + + second = DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize("UTC") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, UTC]" + + # Concatenating two London times + first = DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize("Europe/London") + + second = DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + # Concatenating 2+1 London times + first = DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first[0] = first[0].dt.tz_localize("Europe/London") + + second = DataFrame([[datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + # Concat'ing 1+2 London times + first = DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize("Europe/London") + + second = DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + def test_concat_tz_series_tzlocal(self): + # see gh-13583 + x = [ + pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), + ] + y = [ + pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), + ] + + result = concat([Series(x), Series(y)], ignore_index=True) + tm.assert_series_equal(result, Series(x + y)) + assert result.dtype == "datetime64[ns, tzlocal()]" + + def test_concat_tz_series_with_datetimelike(self): + # see gh-12620: tz and timedelta + x = [ + 
pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-02-01", tz="US/Eastern"), + ] + y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] + result = concat([Series(x), Series(y)], ignore_index=True) + tm.assert_series_equal(result, Series(x + y, dtype="object")) + + # tz and period + y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] + result = concat([Series(x), Series(y)], ignore_index=True) + tm.assert_series_equal(result, Series(x + y, dtype="object")) + + def test_concat_tz_frame(self): + df2 = DataFrame( + dict( + A=pd.Timestamp("20130102", tz="US/Eastern"), + B=pd.Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + tm.assert_frame_equal(df2, df3) + + def test_concat_multiple_tzs(self): + # GH#12467 + # combining datetime tz-aware and naive DataFrames + ts1 = Timestamp("2015-01-01", tz=None) + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="EST") + + df1 = DataFrame(dict(time=[ts1])) + df2 = DataFrame(dict(time=[ts2])) + df3 = DataFrame(dict(time=[ts3])) + + results = pd.concat([df1, df2]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df1, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df2, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts2, ts3])) + tm.assert_frame_equal(results, expected) + + def test_concat_multiindex_with_tz(self): + # GH 6606 + df = DataFrame( + { + "dt": [ + datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3), + ], + "b": ["A", "B", "C"], + "c": [1, 2, 3], + "d": [4, 5, 6], + } + ) + df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + df = df.set_index(["dt", "b"]) + + exp_idx1 = DatetimeIndex( + ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" + ) + exp_idx2 = Index(["A", "B", "C"] * 2, name="b") + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame( + {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] + ) + + result = concat([df, df]) + tm.assert_frame_equal(result, expected) + + def test_concat_tz_not_aligned(self): + # GH#22796 + ts = pd.to_datetime([1, 2]).tz_localize("UTC") + a = DataFrame({"A": ts}) + b = DataFrame({"A": ts, "B": ts}) + result = pd.concat([a, b], sort=True, ignore_index=True) + expected = DataFrame( + {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "t1", + [ + "2015-01-01", + pytest.param( + pd.NaT, + marks=pytest.mark.xfail( + reason="GH23037 incorrect dtype when concatenating" + ), + ), + ], + ) + def test_concat_tz_NaT(self, t1): + # GH#22796 + # Concating tz-aware multicolumn DataFrames + ts1 = Timestamp(t1, tz="UTC") + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="UTC") + + df1 = DataFrame([[ts1, ts2]]) + df2 = DataFrame([[ts3]]) + + result = pd.concat([df1, df2]) + expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0]) + + tm.assert_frame_equal(result, expected) + + +class TestPeriodConcat: + def test_concat_period_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") 
+ result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_period_multiple_freq_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + def test_concat_period_other_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + # non-period + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(["A", "B"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + +def test_concat_timedelta64_block(): + rng = to_timedelta(np.arange(10), unit="s") + + df = DataFrame({"time": rng}) + + result = concat([df, df]) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py new file mode 100644 index 0000000000000..5c540124de8e6 --- /dev/null +++ b/pandas/tests/reshape/concat/test_empty.py @@ -0,0 +1,251 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, concat, date_range +import pandas._testing as tm + + +class TestEmptyConcat: + def test_handle_empty_objects(self, sort): + df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) + + baz = df[:5].copy() + baz["foo"] = "bar" + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0, sort=sort) + + expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) + expected["foo"] = expected["foo"].astype("O") + expected.loc[0:4, "foo"] = "bar" + + tm.assert_frame_equal(concatted, expected) + + # empty as first element with time series + # GH3259 + df = DataFrame( + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") + ) + empty = DataFrame() + result = concat([df, empty], axis=1) + tm.assert_frame_equal(result, df) + result = concat([empty, df], axis=1) + tm.assert_frame_equal(result, df) + + result = concat([df, empty]) + tm.assert_frame_equal(result, df) + result = concat([empty, df]) + tm.assert_frame_equal(result, df) + + def test_concat_empty_series(self): + # GH 11082 + s1 = Series([1, 2, 3], name="x") + s2 = Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, + index=Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + s1 = Series([1, 2, 3], name="x") + s2 = Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with 
no name + s1 = Series([1, 2, 3], name="x") + s2 = Series(name=None, dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = DataFrame( + {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=["x", 0], + index=Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("values", [[], [1, 2, 3]]) + def test_concat_empty_series_timelike(self, tz, values): + # GH 18447 + + first = Series([], dtype="M8[ns]").dt.tz_localize(tz) + dtype = None if values else np.float64 + second = Series(values, dtype=dtype) + + expected = DataFrame( + { + 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), + 1: values, + } + ) + result = concat([first, second], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "left,right,expected", + [ + # booleans + (np.bool_, np.int32, np.int32), + (np.bool_, np.float32, np.object_), + # datetime-like + ("m8[ns]", np.bool_, np.object_), + ("m8[ns]", np.int64, np.object_), + ("M8[ns]", np.bool_, np.object_), + ("M8[ns]", np.int64, np.object_), + # categorical + ("category", "category", "category"), + ("category", "object", "object"), + ], + ) + def test_concat_empty_series_dtypes(self, left, right, expected): + result = pd.concat([Series(dtype=left), Series(dtype=right)]) + assert result.dtype == expected + + @pytest.mark.parametrize( + "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] + ) + def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): + dtype = np.dtype(dtype) + + result = pd.concat([Series(dtype=dtype)]) + assert result.dtype == dtype + + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) + assert result.dtype == dtype + + def test_concat_empty_series_dtypes_roundtrips(self): + + # round-tripping with self & like self + dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) + + def int_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" + return None + + def float_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" + return None + + def get_result_type(dtype, dtype2): + result = float_result_type(dtype, dtype2) + if result is not None: + return result + result = int_result_type(dtype, dtype2) + if result is not None: + return result + return "O" + + for dtype in dtypes: + for dtype2 in dtypes: + if dtype == dtype2: + continue + + expected = get_result_type(dtype, dtype2) + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype + assert result.kind == expected + + def test_concat_empty_series_dtypes_triple(self): + + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] + ).dtype + == np.object_ + ) + + def test_concat_empty_series_dtype_category_with_array(self): + # GH#18515 + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + + def test_concat_empty_series_dtypes_sparse(self): + result = pd.concat( + [ + Series(dtype="float64").astype("Sparse"), + Series(dtype="float64").astype("Sparse"), + ] + ) + assert result.dtype == "Sparse[float64]" + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), 
Series(dtype="float64")]
+        )
+        # TODO: release-note: concat sparse dtype
+        expected = pd.SparseDtype(np.float64)
+        assert result.dtype == expected
+
+        result = pd.concat(
+            [Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
+        )
+        # TODO: release-note: concat sparse dtype
+        expected = pd.SparseDtype("object")
+        assert result.dtype == expected
+
+    def test_concat_empty_df_object_dtype(self):
+        # GH 9149
+        df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
+        df_2 = DataFrame(columns=df_1.columns)
+        result = pd.concat([df_1, df_2], axis=0)
+        expected = df_1.astype(object)
+        tm.assert_frame_equal(result, expected)
+
+    def test_concat_empty_dataframe_dtypes(self):
+        df = DataFrame(columns=list("abc"))
+        df["a"] = df["a"].astype(np.bool_)
+        df["b"] = df["b"].astype(np.int32)
+        df["c"] = df["c"].astype(np.float64)
+
+        result = pd.concat([df, df])
+        assert result["a"].dtype == np.bool_
+        assert result["b"].dtype == np.int32
+        assert result["c"].dtype == np.float64
+
+        result = pd.concat([df, df.astype(np.float64)])
+        assert result["a"].dtype == np.object_
+        assert result["b"].dtype == np.float64
+        assert result["c"].dtype == np.float64
+
+    def test_concat_inner_join_empty(self):
+        # GH 15328
+        df_empty = DataFrame()
+        df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
+        df_expected = DataFrame({"a": []}, index=[], dtype="int64")
+
+        for how, expected in [("inner", df_expected), ("outer", df_a)]:
+            result = pd.concat([df_a, df_empty], axis=1, join=how)
+            tm.assert_frame_equal(result, expected)
+
+    def test_empty_dtype_coerce(self):
+
+        # xref GH#12411, GH#12045, GH#11594
+
+        # GH 10571
+        df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
+        df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
+        result = concat([df1, df2])
+        expected = df1.dtypes
+        tm.assert_series_equal(result.dtypes, expected)
diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py
new file mode 100644
index 0000000000000..e283212b4e60c
--- /dev/null
+++ b/pandas/tests/reshape/concat/test_index.py
@@ -0,0 +1,261 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, MultiIndex, Series, concat
+import pandas._testing as tm
+
+
+class TestIndexConcat:
+    def test_concat_ignore_index(self, sort):
+        frame1 = DataFrame(
+            {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
+        )
+        frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
+        frame1.index = Index(["x", "y", "z"])
+        frame2.index = Index(["x", "y", "q"])
+
+        v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
+
+        nan = np.nan
+        expected = DataFrame(
+            [
+                [nan, nan, nan, 4.3],
+                ["a", 1, 4.5, 5.2],
+                ["b", 2, 3.2, 2.2],
+                ["c", 3, 1.2, nan],
+            ],
+            index=Index(["q", "x", "y", "z"]),
+        )
+        if not sort:
+            expected = expected.loc[["x", "y", "z", "q"]]
+
+        tm.assert_frame_equal(v1, expected)
+
+    @pytest.mark.parametrize(
+        "name_in1,name_in2,name_in3,name_out",
+        [
+            ("idx", "idx", "idx", "idx"),
+            ("idx", "idx", None, None),
+            ("idx", None, None, None),
+            ("idx1", "idx2", None, None),
+            ("idx1", "idx1", "idx2", None),
+            ("idx1", "idx2", "idx3", None),
+            (None, None, None, None),
+        ],
+    )
+    def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
+        # GH13475
+        indices = [
+            Index(["a", "b", "c"], name=name_in1),
+            Index(["b", "c", "d"], name=name_in2),
+            Index(["c", "d", "e"], name=name_in3),
+        ]
+        frames = [
+            DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
+        ]
+        result = pd.concat(frames, axis=1)
+
+        exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
+        expected = DataFrame(
+            {
+                "x": [0, 1, 2, np.nan, np.nan],
+                "y": [np.nan, 0, 1, 2, np.nan],
+                "z": [np.nan, np.nan, 0, 1, 2],
+            },
+            index=exp_ind,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_concat_rename_index(self):
+        a = DataFrame(
+            np.random.rand(3, 3),
+            columns=list("ABC"),
+            index=Index(list("abc"), name="index_a"),
+        )
+        b = DataFrame(
+            np.random.rand(3, 3),
+            columns=list("ABC"),
+            index=Index(list("abc"), name="index_b"),
+        )
+
+        result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
+
+        exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
+        names = list(exp.index.names)
+        names[1] = "lvl1"
+        exp.index.set_names(names, inplace=True)
+
+        tm.assert_frame_equal(result, exp)
+        assert result.index.names == exp.index.names
+
+    @pytest.mark.parametrize("test_series", [True, False])
+    def test_concat_copy_index(self, test_series, axis):
+        # GH 29879
+        if test_series:
+            ser = Series([1, 2])
+            comb = concat([ser, ser], axis=axis, copy=True)
+            assert comb.index is not ser.index
+        else:
+            df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+            comb = concat([df, df], axis=axis, copy=True)
+            assert comb.index is not df.index
+            assert comb.columns is not df.columns
+
+    def test_default_index(self):
+        # is_series and ignore_index
+        s1 = Series([1, 2, 3], name="x")
+        s2 = Series([4, 5, 6], name="y")
+        res = pd.concat([s1, s2], axis=1, ignore_index=True)
+        assert isinstance(res.columns, pd.RangeIndex)
+        exp = DataFrame([[1, 4], [2, 5], [3, 6]])
+        # use check_index_type=True to check that the result has
+        # a RangeIndex (default index)
+        tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
+
+        # is_series and all inputs have no names
+        s1 = Series([1, 2, 3])
+        s2 = Series([4, 5, 6])
+        res = pd.concat([s1, s2], axis=1, ignore_index=False)
+        assert isinstance(res.columns, pd.RangeIndex)
+        exp = DataFrame([[1, 4], [2, 5], [3, 6]])
+        exp.columns = pd.RangeIndex(2)
+        tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
+
+        # is_dataframe and ignore_index
+        df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
+        df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
+
+        res = pd.concat([df1, df2], axis=0, ignore_index=True)
+        exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
+        tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
+
+        res = pd.concat([df1, df2], axis=1, ignore_index=True)
+        exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
+        tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
+
+    def test_dups_index(self):
+        # GH 4771
+
+        # single dtypes
+        df = DataFrame(
+            np.random.randint(0, 10, size=40).reshape(10, 4),
+            columns=["A", "A", "C", "C"],
+        )
+
+        result = concat([df, df], axis=1)
+        tm.assert_frame_equal(result.iloc[:, :4], df)
+        tm.assert_frame_equal(result.iloc[:, 4:], df)
+
+        result = concat([df, df], axis=0)
+        tm.assert_frame_equal(result.iloc[:10], df)
+        tm.assert_frame_equal(result.iloc[10:], df)
+
+        # multi dtypes
+        df = concat(
+            [
+                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
+                DataFrame(
+                    np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
+                ),
+            ],
+            axis=1,
+        )
+
+        result = concat([df, df], axis=1)
+        tm.assert_frame_equal(result.iloc[:, :6], df)
+        tm.assert_frame_equal(result.iloc[:, 6:], df)
+
+        result = concat([df, df], axis=0)
+        tm.assert_frame_equal(result.iloc[:10], df)
+        tm.assert_frame_equal(result.iloc[10:], df)
+
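+        # DataFrame.append is implemented on top of concat, so appending
+        # in pieces should reproduce the same duplicated-label frame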
+        result = df.iloc[0:8, :].append(df.iloc[8:])
+        tm.assert_frame_equal(result, df)
+
+        result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10])
+        tm.assert_frame_equal(result, df)
+
+        expected = concat([df, df], axis=0)
+        result = df.append(df)
+        tm.assert_frame_equal(result, expected)
+
+
+class TestMultiIndexConcat:
+    def test_concat_multiindex_with_keys(self):
+        index = MultiIndex(
+            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
+            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+            names=["first", "second"],
+        )
+        frame = DataFrame(
+            np.random.randn(10, 3),
+            index=index,
+            columns=Index(["A", "B", "C"], name="exp"),
+        )
+        result = concat([frame, frame], keys=[0, 1], names=["iteration"])
+
+        assert result.index.names == ("iteration",) + index.names
+        tm.assert_frame_equal(result.loc[0], frame)
+        tm.assert_frame_equal(result.loc[1], frame)
+        assert result.index.nlevels == 3
+
+    def test_concat_multiindex_with_none_in_index_names(self):
+        # GH 15787
+        index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None])
+        df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
+
+        result = concat([df, df], keys=[1, 2], names=["level2"])
+        index = pd.MultiIndex.from_product(
+            [[1, 2], [1], range(5)], names=["level2", "level1", None]
+        )
+        expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
+        tm.assert_frame_equal(result, expected)
+
+        result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
+        level2 = [1] * 5 + [2] * 2
+        level1 = [1] * 7
+        no_name = list(range(5)) + list(range(2))
+        tuples = list(zip(level2, level1, no_name))
+        index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
+        expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
+        tm.assert_frame_equal(result, expected)
+
+    def test_concat_multiindex_rangeindex(self):
+        # GH13542
+        # when MultiIndex levels are RangeIndex objects, concat had a bug
+        # with objects of length 1
+        df = DataFrame(np.random.randn(9, 2))
+        df.index = MultiIndex(
+            levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
+            codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
+        )
+
+        res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
+        exp = df.iloc[[2, 3, 4, 5], :]
+        tm.assert_frame_equal(res, exp)
+
+    def test_concat_multiindex_dfs_with_deepcopy(self):
+        # GH 9967
+        from copy import deepcopy
+
+        example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]])
+        example_dataframe1 = DataFrame([0], index=example_multiindex1)
+
+        example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]])
+        example_dataframe2 = DataFrame([1], index=example_multiindex2)
+
+        example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
+        expected_index = pd.MultiIndex(
+            levels=[["s1", "s2"], ["a"], ["b", "c"]],
+            codes=[[0, 1], [0, 0], [0, 1]],
+            names=["testname", None, None],
+        )
+        expected = DataFrame([[0], [1]], index=expected_index)
+        result_copy = pd.concat(deepcopy(example_dict), names=["testname"])
+        tm.assert_frame_equal(result_copy, expected)
+        result_no_copy = pd.concat(example_dict, names=["testname"])
+        tm.assert_frame_equal(result_no_copy, expected)
diff --git a/pandas/tests/reshape/concat/test_invalid.py b/pandas/tests/reshape/concat/test_invalid.py
new file mode 100644
index 0000000000000..3a886e0d612c6
--- /dev/null
+++ b/pandas/tests/reshape/concat/test_invalid.py
@@ -0,0 +1,51 @@
+from io import StringIO
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, concat, read_csv
+import pandas._testing as tm
+
+
+class TestInvalidConcat:
+    def test_concat_invalid(self):
+
+        # trying to concat an NDFrame with a non-NDFrame
+        df1 = tm.makeCustomDataframe(10, 2)
+        for obj in [1, dict(), [1, 2], (1, 2)]:
+
+            msg = (
+                f"cannot concatenate object of type '{type(obj)}'; "
+                "only Series and DataFrame objs are valid"
+            )
+            with pytest.raises(TypeError, match=msg):
+                concat([df1, obj])
+
+    def test_concat_invalid_first_argument(self):
+        df1 = tm.makeCustomDataframe(10, 2)
+        df2 = tm.makeCustomDataframe(10, 2)
+        msg = (
+            "first argument must be an iterable of pandas "
+            'objects, you passed an object of type "DataFrame"'
+        )
+        with pytest.raises(TypeError, match=msg):
+            concat(df1, df2)
+
+        # generator ok though
+        concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))
+
+        # text reader ok
+        # GH6583
+        data = """index,A,B,C,D
+    foo,2,3,4,5
+    bar,7,8,9,10
+    baz,12,13,14,15
+    qux,12,13,14,15
+    foo2,12,13,14,15
+    bar2,12,13,14,15
+    """
+
+        reader = read_csv(StringIO(data), chunksize=1)
+        result = concat(reader, ignore_index=True)
+        expected = read_csv(StringIO(data))
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py
new file mode 100644
index 0000000000000..aea2840bb897f
--- /dev/null
+++ b/pandas/tests/reshape/concat/test_series.py
@@ -0,0 +1,146 @@
+import numpy as np
+from numpy.random import randn
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    Series,
+    concat,
+    date_range,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture(params=[True, False])
+def sort(request):
+    """Boolean sort keyword for concat and DataFrame.append."""
+    return request.param
+
+
+class TestSeriesConcat:
+    def test_concat_series(self):
+
+        ts = tm.makeTimeSeries()
+        ts.name = "foo"
+
+        pieces = [ts[:5], ts[5:15], ts[15:]]
+
+        result = concat(pieces)
+        tm.assert_series_equal(result, ts)
+        assert result.name == ts.name
+
+        result = concat(pieces, keys=[0, 1, 2])
+        expected = ts.copy()
+
+        ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]"))
+
+        exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))]
+        exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes)
+        expected.index = exp_index
+        tm.assert_series_equal(result, expected)
+
+    def test_concat_empty_and_non_empty_series_regression(self):
+        # GH 18187 regression test
+        s1 = Series([1])
+        s2 = Series([], dtype=object)
+
+        expected = s1
+        result = pd.concat([s1, s2])
+        tm.assert_series_equal(result, expected)
+
+    def test_concat_series_axis1(self, sort):
+        ts = tm.makeTimeSeries()
+
+        pieces = [ts[:-2], ts[2:], ts[2:-2]]
+
+        result = concat(pieces, axis=1)
+        expected = DataFrame(pieces).T
+        tm.assert_frame_equal(result, expected)
+
+        result = concat(pieces, keys=["A", "B", "C"], axis=1)
+        expected = DataFrame(pieces, index=["A", "B", "C"]).T
+        tm.assert_frame_equal(result, expected)
+
+        # preserve series names, #2489
+        s = Series(randn(5), name="A")
+        s2 = Series(randn(5), name="B")
+
+        result = concat([s, s2], axis=1)
+        expected = DataFrame({"A": s, "B": s2})
+        tm.assert_frame_equal(result, expected)
+
+        s2.name = None
+        result = concat([s, s2], axis=1)
+        tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object"))
+
+        # must reindex, #2603
+        s = Series(randn(3), index=["c", "a", "b"], name="A")
+        s2 = Series(randn(4), index=["d", "a",
"b", "c"], name="B") + result = concat([s, s2], axis=1, sort=sort) + expected = DataFrame({"A": s, "B": s2}) + tm.assert_frame_equal(result, expected) + + def test_concat_series_axis1_names_applied(self): + # ensure names argument is not ignored on axis=1, #23490 + s = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], columns=Index(["a", "b"], name="A") + ) + tm.assert_frame_equal(result, expected) + + result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], + columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_series_axis1_same_names_ignore_index(self): + dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] + s1 = Series(randn(len(dates)), index=dates, name="value") + s2 = Series(randn(len(dates)), index=dates, name="value") + + result = concat([s1, s2], axis=1, ignore_index=True) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) + + @pytest.mark.parametrize( + "s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))] + ) + def test_concat_series_name_npscalar_tuple(self, s1name, s2name): + # GH21015 + s1 = Series({"a": 1, "b": 2}, name=s1name) + s2 = Series({"c": 5, "d": 6}, name=s2name) + result = pd.concat([s1, s2]) + expected = Series({"a": 1, "b": 2, "c": 5, "d": 6}) + tm.assert_series_equal(result, expected) + + def test_concat_series_partial_columns_names(self): + # GH10698 + foo = Series([1, 2], name="foo") + bar = Series([1, 2]) + baz = Series([4, 5]) + + result = concat([foo, bar, baz], axis=1) + expected = DataFrame( + {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] + ) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) + expected = DataFrame( + {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, + columns=["red", "blue", "yellow"], + ) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, ignore_index=True) + expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_sort.py b/pandas/tests/reshape/concat/test_sort.py new file mode 100644 index 0000000000000..865f696b7a73a --- /dev/null +++ b/pandas/tests/reshape/concat/test_sort.py @@ -0,0 +1,83 @@ +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestConcatSort: + def test_concat_sorts_columns(self, sort): + # GH-4588 + df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = DataFrame({"a": [3, 4], "c": [5, 6]}) + + # for sort=True/None + expected = DataFrame( + {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, + columns=["a", "b", "c"], + ) + + if sort is False: + expected = expected[["b", "a", "c"]] + + # default + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], ignore_index=True, sort=sort) + tm.assert_frame_equal(result, expected) + + def test_concat_sorts_index(self, sort): + df1 = DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) + df2 = DataFrame({"b": [1, 2]}, index=["a", "b"]) + + # For True/None + expected = DataFrame( + {"a": [2, 3, 1], "b": [1, 2, None]}, + index=["a", "b", "c"], + columns=["a", "b"], + ) + if sort is False: + expected = expected.loc[["c", "a", "b"]] + + # Warn and sort by default + with 
tm.assert_produces_warning(None): + result = pd.concat([df1, df2], axis=1, sort=sort) + tm.assert_frame_equal(result, expected) + + def test_concat_inner_sort(self, sort): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = DataFrame( + {"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"] + ) + df2 = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) + + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) + + expected = DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) + if sort is True: + expected = expected[["a", "b"]] + tm.assert_frame_equal(result, expected) + + def test_concat_aligned_sort(self): + # GH-4588 + df = DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = DataFrame( + {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + result = pd.concat( + [df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True + ) + expected = expected[["b", "c"]] + tm.assert_frame_equal(result, expected) + + def test_concat_aligned_sort_does_not_raise(self): + # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. + df = DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) + expected = DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index d4d4c4190417e..c1efbb8536d68 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,8 +2,6 @@ from numpy.random import randn import pytest -from pandas._libs.join import inner_join, left_outer_join - import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm @@ -43,96 +41,6 @@ def setup_method(self, method): {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] ) - def test_cython_left_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - ls, rs = left_outer_join(left, right, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_right_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - rs, ls = left_outer_join(right, left, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - # 0 1 1 1 - exp_li = a_( - [ - 0, - 1, - 2, - 3, - 4, - 5, - 3, - 4, - 5, - 3, - 4, - 5, - # 2 2 4 - 6, - 7, - 8, - 6, - 7, - 8, - -1, - ] - ) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == 
-1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_inner_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) - max_group = 5 - - ls, rs = inner_join(left, right, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on="key2") _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") @@ -597,7 +505,7 @@ def test_join_sort(self): # smoke test joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, pd.Index(list(range(4)))) + tm.assert_index_equal(joined.index, Index(list(range(4)))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index @@ -743,7 +651,7 @@ def test_join_multi_to_multi(self, join_type): def test_join_on_tz_aware_datetimeindex(self): # GH 23931, 26335 - df1 = pd.DataFrame( + df1 = DataFrame( { "date": pd.date_range( start="2018-01-01", periods=5, tz="America/Chicago" @@ -752,7 +660,7 @@ def test_join_on_tz_aware_datetimeindex(self): } ) - df2 = pd.DataFrame( + df2 = DataFrame( { "date": pd.date_range( start="2018-01-03", periods=5, tz="America/Chicago" @@ -762,7 +670,7 @@ def test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): @@ -879,3 +787,20 @@ def _join_by_hand(a, b, how="left"): for col, s in b_re.items(): a_re[col] = s return a_re.reindex(columns=result_columns) + + +def test_join_inner_multiindex_deterministic_order(): + # GH: 36910 + left = DataFrame( + data={"e": 5}, + index=pd.MultiIndex.from_tuples([(1, 2, 4)], names=("a", "b", "d")), + ) + right = DataFrame( + data={"f": 6}, index=pd.MultiIndex.from_tuples([(2, 3)], names=("b", "c")) + ) + result = left.join(right, how="inner") + expected = DataFrame( + {"e": [5], "f": [6]}, + index=pd.MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index aee503235d36c..c4c9b0e516192 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -46,22 +46,22 @@ def get_test_data(ngroups=NGROUPS, n=N): def get_series(): return [ - pd.Series([1], dtype="int64"), - pd.Series([1], dtype="Int64"), - pd.Series([1.23]), - pd.Series(["foo"]), - pd.Series([True]), - pd.Series([pd.Timestamp("2018-01-01")]), - pd.Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), + Series([1], dtype="int64"), + Series([1], dtype="Int64"), + Series([1.23]), + Series(["foo"]), + Series([True]), + Series([pd.Timestamp("2018-01-01")]), + 
Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), ] def get_series_na(): return [ - pd.Series([np.nan], dtype="Int64"), - pd.Series([np.nan], dtype="float"), - pd.Series([np.nan], dtype="object"), - pd.Series([pd.NaT]), + Series([np.nan], dtype="Int64"), + Series([np.nan], dtype="float"), + Series([np.nan], dtype="object"), + Series([pd.NaT]), ] @@ -122,10 +122,10 @@ def setup_method(self, method): def test_merge_inner_join_empty(self): # GH 15328 - df_empty = pd.DataFrame() - df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + df_empty = DataFrame() + df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") result = pd.merge(df_empty, df_a, left_index=True, right_index=True) - expected = pd.DataFrame({"a": []}, index=[], dtype="int64") + expected = DataFrame({"a": []}, index=[], dtype="int64") tm.assert_frame_equal(result, expected) def test_merge_common(self): @@ -136,7 +136,7 @@ def test_merge_common(self): def test_merge_non_string_columns(self): # https://github.com/pandas-dev/pandas/issues/17962 # Checks that method runs for non string column names - left = pd.DataFrame( + left = DataFrame( {0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]} ) @@ -253,16 +253,16 @@ def test_merge_different_column_key_names(self): right, left_on="lkey", right_on="rkey", how="outer", sort=True ) - exp = pd.Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey") + exp = Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey") tm.assert_series_equal(merged["lkey"], exp) - exp = pd.Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey") + exp = Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey") tm.assert_series_equal(merged["rkey"], exp) - exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x") + exp = Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x") tm.assert_series_equal(merged["value_x"], exp) - exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y") + exp = Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y") tm.assert_series_equal(merged["value_y"], exp) def test_merge_copy(self): @@ -430,10 +430,10 @@ def test_left_merge_empty_dataframe(self): ) def test_merge_left_empty_right_empty(self, join_type, kwarg): # GH 10824 - left = pd.DataFrame(columns=["a", "b", "c"]) - right = pd.DataFrame(columns=["x", "y", "z"]) + left = DataFrame(columns=["a", "b", "c"]) + right = DataFrame(columns=["x", "y", "z"]) - exp_in = pd.DataFrame( + exp_in = DataFrame( columns=["a", "b", "c", "x", "y", "z"], index=pd.Index([], dtype=object), dtype=object, @@ -444,10 +444,10 @@ def test_merge_left_empty_right_empty(self, join_type, kwarg): def test_merge_left_empty_right_notempty(self): # GH 10824 - left = pd.DataFrame(columns=["a", "b", "c"]) - right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) + left = DataFrame(columns=["a", "b", "c"]) + right = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) - exp_out = pd.DataFrame( + exp_out = DataFrame( { "a": np.array([np.nan] * 3, dtype=object), "b": np.array([np.nan] * 3, dtype=object), @@ -493,10 +493,10 @@ def check2(exp, kwarg): def test_merge_left_notempty_right_empty(self): # GH 10824 - left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) - right = pd.DataFrame(columns=["x", "y", "z"]) + left = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + right = DataFrame(columns=["x", "y", "z"]) - exp_out = pd.DataFrame( + exp_out = DataFrame( { "a": [1, 4, 7], 
"b": [2, 5, 8], @@ -534,16 +534,16 @@ def check2(exp, kwarg): def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): # GH 25183 - df = pd.DataFrame( + df = DataFrame( {"key": series_of_dtype, "value": series_of_dtype2}, columns=["key", "value"], ) df_empty = df[:0] - expected = pd.DataFrame( + expected = DataFrame( { - "value_x": pd.Series(dtype=df.dtypes["value"]), - "key": pd.Series(dtype=df.dtypes["key"]), - "value_y": pd.Series(dtype=df.dtypes["value"]), + "value_x": Series(dtype=df.dtypes["value"]), + "key": Series(dtype=df.dtypes["key"]), + "value_y": Series(dtype=df.dtypes["value"]), }, columns=["value_x", "key", "value_y"], ) @@ -552,15 +552,15 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): # GH 25183 - df_left = pd.DataFrame( + df_left = DataFrame( {"key": series_of_dtype, "value": series_of_dtype_all_na}, columns=["key", "value"], ) - df_right = pd.DataFrame( + df_right = DataFrame( {"key": series_of_dtype, "value": series_of_dtype_all_na}, columns=["key", "value"], ) - expected = pd.DataFrame( + expected = DataFrame( { "key": series_of_dtype, "value_x": series_of_dtype_all_na, @@ -675,8 +675,8 @@ def test_join_append_timedeltas(self): def test_other_datetime_unit(self): # GH 13389 - df1 = pd.DataFrame({"entity_id": [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name="days") + df1 = DataFrame({"entity_id": [101, 102]}) + s = Series([None, None], index=[101, 102], name="days") for dtype in [ "datetime64[D]", @@ -694,7 +694,7 @@ def test_other_datetime_unit(self): result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame( + exp = DataFrame( { "entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype="datetime64[ns]"), @@ -706,8 +706,8 @@ def test_other_datetime_unit(self): @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_other_timedelta_unit(self, unit): # GH 13389 - df1 = pd.DataFrame({"entity_id": [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name="days") + df1 = DataFrame({"entity_id": [101, 102]}) + s = Series([None, None], index=[101, 102], name="days") dtype = f"m8[{unit}]" df2 = s.astype(dtype).to_frame("days") @@ -715,7 +715,7 @@ def test_other_timedelta_unit(self, unit): result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame( + exp = DataFrame( {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, columns=["entity_id", "days"], ) @@ -748,13 +748,13 @@ def test_overlapping_columns_error_message(self): def test_merge_on_datetime64tz(self): # GH11405 - left = pd.DataFrame( + left = DataFrame( { "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), "value": [1, 2], } ) - right = pd.DataFrame( + right = DataFrame( { "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), "value": [1, 2, 3], @@ -771,13 +771,13 @@ def test_merge_on_datetime64tz(self): result = pd.merge(left, right, on="key", how="outer") tm.assert_frame_equal(result, expected) - left = pd.DataFrame( + left = DataFrame( { "key": [1, 2], "value": pd.date_range("20151010", periods=2, tz="US/Eastern"), } ) - right = pd.DataFrame( + right = DataFrame( { "key": [2, 3], "value": pd.date_range("20151011", periods=2, tz="US/Eastern"), @@ -800,7 +800,7 @@ def test_merge_on_datetime64tz(self): def test_merge_on_datetime64tz_empty(self): # https://github.com/pandas-dev/pandas/issues/25014 dtz = pd.DatetimeTZDtype(tz="UTC") - right = pd.DataFrame( 
+ right = DataFrame( { "date": [pd.Timestamp("2018", tz=dtz.tz)], "value": [4.0], @@ -810,13 +810,13 @@ def test_merge_on_datetime64tz_empty(self): ) left = right[:0] result = left.merge(right, on="date") - expected = pd.DataFrame( + expected = DataFrame( { - "value_x": pd.Series(dtype=float), - "date2_x": pd.Series(dtype=dtz), - "date": pd.Series(dtype=dtz), - "value_y": pd.Series(dtype=float), - "date2_y": pd.Series(dtype=dtz), + "value_x": Series(dtype=float), + "date2_x": Series(dtype=dtz), + "date": Series(dtype=dtz), + "value_y": Series(dtype=float), + "date2_y": Series(dtype=dtz), }, columns=["value_x", "date2_x", "date", "value_y", "date2_y"], ) @@ -824,12 +824,12 @@ def test_merge_on_datetime64tz_empty(self): def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 - df1 = pd.DataFrame( + df1 = DataFrame( pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), columns=["date"], ) df1["value"] = 1 - df2 = pd.DataFrame( + df2 = DataFrame( { "date": pd.to_datetime( [ @@ -843,7 +843,7 @@ def test_merge_datetime64tz_with_dst_transition(self): ) df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid") result = pd.merge(df1, df2, how="outer", on="date") - expected = pd.DataFrame( + expected = DataFrame( { "date": pd.date_range( "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" @@ -868,10 +868,10 @@ def test_merge_non_unique_period_index(self): tm.assert_frame_equal(result, expected) def test_merge_on_periods(self): - left = pd.DataFrame( + left = DataFrame( {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]} ) - right = pd.DataFrame( + right = DataFrame( { "key": pd.period_range("20151011", periods=3, freq="D"), "value": [1, 2, 3], @@ -888,10 +888,10 @@ def test_merge_on_periods(self): result = pd.merge(left, right, on="key", how="outer") tm.assert_frame_equal(result, expected) - left = pd.DataFrame( + left = DataFrame( {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")} ) - right = pd.DataFrame( + right = DataFrame( {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")} ) @@ -1132,7 +1132,7 @@ def test_validation(self): tm.assert_frame_equal(result, expected_3) # Dups on right - right_w_dups = right.append(pd.DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])) + right_w_dups = right.append(DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])) merge( left, right_w_dups, @@ -1156,7 +1156,7 @@ def test_validation(self): # Dups on left left_w_dups = left.append( - pd.DataFrame({"a": ["a"], "c": ["cow"]}, index=[3]), sort=True + DataFrame({"a": ["a"], "c": ["cow"]}, index=[3]), sort=True ) merge( left_w_dups, @@ -1242,7 +1242,7 @@ def test_validation(self): def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 - a = pd.DataFrame({"a": [], "b": [], "c": []}) + a = DataFrame({"a": [], "b": [], "c": []}) with np.errstate(divide="raise"): merge(a, a, on=("a", "b")) @@ -1285,10 +1285,10 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) - df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) + df1 = DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) - expected = pd.DataFrame( + expected = DataFrame( [ [0, 0, 
0], [1, 1, 1], @@ -1306,10 +1306,10 @@ def test_merge_right_index_right(self): # Note: the expected output here is probably incorrect. # See https://github.com/pandas-dev/pandas/issues/17257 for more. # We include this as a regression test for GH-24897. - left = pd.DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) - right = pd.DataFrame({"b": [1, 2, 3]}) + left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) + right = DataFrame({"b": [1, 2, 3]}) - expected = pd.DataFrame( + expected = DataFrame( {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, columns=["a", "key", "b"], index=[0, 1, 2, np.nan], @@ -1320,30 +1320,26 @@ def test_merge_right_index_right(self): @pytest.mark.parametrize("how", ["left", "right"]) def test_merge_preserves_row_order(self, how): # GH 27453 - left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) - right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) + left_df = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + right_df = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) if how == "right": - expected = pd.DataFrame( - {"animal": ["quetzal", "pig"], "max_speed": [80, 11]} - ) + expected = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) else: - expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + expected = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) tm.assert_frame_equal(result, expected) def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 - left = pd.DataFrame( + left = DataFrame( { "a": [1, 2, 3], "key": pd.Categorical(["a", "a", "b"], categories=list("abc")), } ) - right = pd.DataFrame( - {"b": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"]) - ) + right = DataFrame({"b": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"])) result = left.merge(right, left_on="key", right_index=True, how="right") - expected = pd.DataFrame( + expected = DataFrame( { "a": [1, 2, 3, None], "key": pd.Categorical(["a", "a", "b", "c"]), @@ -1356,10 +1352,10 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): def test_merge_readonly(self): # https://github.com/pandas-dev/pandas/issues/27943 - data1 = pd.DataFrame( + data1 = DataFrame( np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] ) - data2 = pd.DataFrame( + data2 = DataFrame( np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) @@ -1521,8 +1517,8 @@ def test_merge_incompat_infer_boolean_object(self): ([0, 1, 2], Series(["a", "b", "a"]).astype("category")), ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")), # no not infer - ([0, 1], pd.Series([False, True], dtype=object)), - ([0, 1], pd.Series([False, True], dtype=bool)), + ([0, 1], Series([False, True], dtype=object)), + ([0, 1], Series([False, True], dtype=bool)), ], ) def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): @@ -1743,7 +1739,7 @@ def test_self_join_multiple_categories(self): # GH 16767 # non-duplicates should work with multiple categories m = 5 - df = pd.DataFrame( + df = DataFrame( { "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m, "b": ["t", "w", "x", "y", "z"] * 2 * m, @@ -1783,17 +1779,17 @@ def test_dtype_on_categorical_dates(self): # GH 16900 # dates should not be coerced to ints - df = pd.DataFrame( + df = DataFrame( [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"] ) df["date"] = 
df["date"].astype("category") - df2 = pd.DataFrame( + df2 = DataFrame( [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"] ) df2["date"] = df2["date"].astype("category") - expected_outer = pd.DataFrame( + expected_outer = DataFrame( [ [pd.Timestamp("2001-01-01"), 1.1, 1.3], [pd.Timestamp("2001-01-02"), 1.3, np.nan], @@ -1804,7 +1800,7 @@ def test_dtype_on_categorical_dates(self): result_outer = pd.merge(df, df2, how="outer", on=["date"]) tm.assert_frame_equal(result_outer, expected_outer) - expected_inner = pd.DataFrame( + expected_inner = DataFrame( [[pd.Timestamp("2001-01-01"), 1.1, 1.3]], columns=["date", "num2", "num4"] ) result_inner = pd.merge(df, df2, how="inner", on=["date"]) @@ -1824,22 +1820,20 @@ def test_merging_with_bool_or_int_cateorical_column( ): # GH 17187 # merging with a boolean/int categorical column - df1 = pd.DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) + df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) - df2 = pd.DataFrame({"id": [2, 4], "num": [1, 9]}) + df2 = DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) - expected = pd.DataFrame( - {"id": [2, 4], "cat": expected_categories, "num": [1, 9]} - ) + expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]}) expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) tm.assert_frame_equal(expected, result) def test_merge_on_int_array(self): # GH 23020 - df = pd.DataFrame({"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B": 1}) + df = DataFrame({"A": Series([1, 2, np.nan], dtype="Int64"), "B": 1}) result = pd.merge(df, df, on="A") - expected = pd.DataFrame( - {"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1} + expected = DataFrame( + {"A": Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1} ) tm.assert_frame_equal(result, expected) @@ -1950,20 +1944,20 @@ def test_merge_index_types(index): ) def test_merge_series(on, left_on, right_on, left_index, right_index, nm): # GH 21220 - a = pd.DataFrame( + a = DataFrame( {"A": [1, 2, 3, 4]}, index=pd.MultiIndex.from_product( [["a", "b"], [0, 1]], names=["outer", "inner"] ), ) - b = pd.Series( + b = Series( [1, 2, 3, 4], index=pd.MultiIndex.from_product( [["a", "b"], [1, 2]], names=["outer", "inner"] ), name=nm, ) - expected = pd.DataFrame( + expected = DataFrame( {"A": [2, 4], "B": [1, 3]}, index=pd.MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]), ) @@ -2012,10 +2006,10 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): ) def test_merge_suffix(col1, col2, kwargs, expected_cols): # issue: 24782 - a = pd.DataFrame({col1: [1, 2, 3]}) - b = pd.DataFrame({col2: [4, 5, 6]}) + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [4, 5, 6]}) - expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) + expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) result = a.merge(b, left_index=True, right_index=True, **kwargs) tm.assert_frame_equal(result, expected) @@ -2060,8 +2054,8 @@ def test_merge_duplicate_suffix(how, expected): ) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 - a = pd.DataFrame({col1: [1, 2, 3]}) - b = pd.DataFrame({col2: [3, 4, 5]}) + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [3, 4, 5]}) # TODO: might reconsider current raise behaviour, see issue 24782 msg = "columns overlap but no suffix specified" @@ -2071,8 +2065,8 @@ def test_merge_suffix_error(col1, col2, 
suffixes): @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) def test_merge_suffix_warns(suffixes): - a = pd.DataFrame({"a": [1, 2, 3]}) - b = pd.DataFrame({"b": [3, 4, 5]}) + a = DataFrame({"a": [1, 2, 3]}) + b = DataFrame({"b": [3, 4, 5]}) with tm.assert_produces_warning(FutureWarning): pd.merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"}) @@ -2086,8 +2080,8 @@ def test_merge_suffix_warns(suffixes): ], ) def test_merge_suffix_length_error(col1, col2, suffixes, msg): - a = pd.DataFrame({col1: [1, 2, 3]}) - b = pd.DataFrame({col2: [3, 4, 5]}) + a = DataFrame({col1: [1, 2, 3]}) + b = DataFrame({col2: [3, 4, 5]}) with pytest.raises(ValueError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) @@ -2176,9 +2170,9 @@ def test_merge_multiindex_columns(): numbers = ["1", "2", "3"] index = pd.MultiIndex.from_product((letters, numbers), names=["outer", "inner"]) - frame_x = pd.DataFrame(columns=index) + frame_x = DataFrame(columns=index) frame_x["id"] = "" - frame_y = pd.DataFrame(columns=index) + frame_y = DataFrame(columns=index) frame_y["id"] = "" l_suf = "_x" @@ -2190,7 +2184,7 @@ def test_merge_multiindex_columns(): expected_index = pd.MultiIndex.from_product( [expected_labels, numbers], names=["outer", "inner"] ) - expected = pd.DataFrame(columns=expected_index) + expected = DataFrame(columns=expected_index) expected["id"] = "" tm.assert_frame_equal(result, expected) @@ -2198,12 +2192,12 @@ def test_merge_multiindex_columns(): def test_merge_datetime_upcast_dtype(): # https://github.com/pandas-dev/pandas/issues/31208 - df1 = pd.DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]}) - df2 = pd.DataFrame( + df1 = DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]}) + df2 = DataFrame( {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])} ) result = pd.merge(df1, df2, how="left", on="y") - expected = pd.DataFrame( + expected = DataFrame( { "x": ["a", "b", "c"], "y": ["1", "2", "4"], @@ -2233,3 +2227,59 @@ def test_categorical_non_unique_monotonic(n_categories): index=left_index, ) tm.assert_frame_equal(expected, result) + + +def test_merge_join_categorical_multiindex(): + # From issue 16627 + a = { + "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), + "Int1": [0, 1, 0, 1, 0, 0], + } + a = DataFrame(a) + + b = { + "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + expected = expected.drop(["Cat", "Int"], axis=1) + result = a.join(b, on=["Cat1", "Int1"]) + tm.assert_frame_equal(expected, result) + + # Same test, but with ordered categorical + a = { + "Cat1": Categorical( + ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True + ), + "Int1": [0, 1, 0, 1, 0, 0], + } + a = DataFrame(a) + + b = { + "Cat": Categorical( + ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True + ), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + expected = expected.drop(["Cat", "Int"], axis=1) + result = a.join(b, on=["Cat1", "Int1"]) + tm.assert_frame_equal(expected, result) diff --git 
a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index e0063925a03e1..17f2f44f45fce 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -88,9 +88,9 @@ def test_empty_sequence_concat(self): with pytest.raises(ValueError, match=pattern): pd.concat(df_seq) - pd.concat([pd.DataFrame()]) - pd.concat([None, pd.DataFrame()]) - pd.concat([pd.DataFrame(), None]) + pd.concat([DataFrame()]) + pd.concat([None, DataFrame()]) + pd.concat([DataFrame(), None]) def test_doc_example(self): left = DataFrame( diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 61fdafa0c6db2..68096192c51ea 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -200,11 +200,9 @@ def test_merge_multiple_cols_with_mixed_cols_index(self): pd.MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]), name="Amount", ) - df = pd.DataFrame( - {"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0} - ) + df = DataFrame({"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0}) result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"]) - expected = pd.DataFrame( + expected = DataFrame( { "lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], @@ -801,7 +799,7 @@ def test_single_common_level(self): [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] ) - left = pd.DataFrame( + left = DataFrame( {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left ) @@ -809,7 +807,7 @@ def test_single_common_level(self): [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] ) - right = pd.DataFrame( + right = DataFrame( {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index_right, ) @@ -828,12 +826,12 @@ def test_join_multi_wrong_order(self): midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"]) - left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) - right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) + left = DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) + right = DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) result = left.join(right) - expected = pd.DataFrame( + expected = DataFrame( index=midx1, data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]}, ) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py deleted file mode 100644 index f0eb745041a66..0000000000000 --- a/pandas/tests/reshape/test_concat.py +++ /dev/null @@ -1,2927 +0,0 @@ -from collections import abc, deque -import datetime as dt -from datetime import datetime -from decimal import Decimal -from io import StringIO -from itertools import combinations -from warnings import catch_warnings - -import dateutil -import numpy as np -from numpy.random import randn -import pytest - -from pandas.core.dtypes.dtypes import CategoricalDtype - -import pandas as pd -from pandas import ( - Categorical, - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - Timestamp, - concat, - date_range, - isna, - read_csv, -) -import pandas._testing as tm -from pandas.core.arrays import SparseArray -from pandas.core.construction import create_series_with_explicit_dtype -from pandas.tests.extension.decimal import to_decimal - - -@pytest.fixture(params=[True, False]) -def sort(request): - """Boolean sort keyword for 
concat and DataFrame.append.""" - return request.param - - -class TestConcatAppendCommon: - """ - Test common dtype coercion rules between concat and append. - """ - - def setup_method(self, method): - - dt_data = [ - pd.Timestamp("2011-01-01"), - pd.Timestamp("2011-01-02"), - pd.Timestamp("2011-01-03"), - ] - tz_data = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timestamp("2011-01-03", tz="US/Eastern"), - ] - - td_data = [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ] - - period_data = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - self.data = { - "bool": [True, False, True], - "int64": [1, 2, 3], - "float64": [1.1, np.nan, 3.3], - "category": pd.Categorical(["X", "Y", "Z"]), - "object": ["a", "b", "c"], - "datetime64[ns]": dt_data, - "datetime64[ns, US/Eastern]": tz_data, - "timedelta64[ns]": td_data, - "period[M]": period_data, - } - - def _check_expected_dtype(self, obj, label): - """ - Check whether obj has expected dtype depending on label - considering not-supported dtypes - """ - if isinstance(obj, pd.Index): - if label == "bool": - assert obj.dtype == "object" - else: - assert obj.dtype == label - elif isinstance(obj, pd.Series): - if label.startswith("period"): - assert obj.dtype == "Period[M]" - else: - assert obj.dtype == label - else: - raise ValueError - - def test_dtypes(self): - # to confirm test case covers intended dtypes - for typ, vals in self.data.items(): - self._check_expected_dtype(pd.Index(vals), typ) - self._check_expected_dtype(pd.Series(vals), typ) - - def test_concatlike_same_dtypes(self): - # GH 13660 - for typ1, vals1 in self.data.items(): - - vals2 = vals1 - vals3 = vals1 - - if typ1 == "category": - exp_data = pd.Categorical(list(vals1) + list(vals2)) - exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) - else: - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = pd.Index(vals1).append(pd.Index(vals2)) - exp = pd.Index(exp_data) - tm.assert_index_equal(res, exp) - - # 3 elements - res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) - exp = pd.Index(exp_data3) - tm.assert_index_equal(res, exp) - - # index.append name mismatch - i1 = pd.Index(vals1, name="x") - i2 = pd.Index(vals2, name="y") - res = i1.append(i2) - exp = pd.Index(exp_data) - tm.assert_index_equal(res, exp) - - # index.append name match - i1 = pd.Index(vals1, name="x") - i2 = pd.Index(vals2, name="x") - res = i1.append(i2) - exp = pd.Index(exp_data, name="x") - tm.assert_index_equal(res, exp) - - # cannot append non-index - with pytest.raises(TypeError, match="all inputs must be Index"): - pd.Index(vals1).append(vals2) - - with pytest.raises(TypeError, match="all inputs must be Index"): - pd.Index(vals1).append([pd.Index(vals2), vals3]) - - # ----- Series ----- # - - # series.append - res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) - exp = pd.Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = pd.Series(vals1).append( - [pd.Series(vals2), pd.Series(vals3)], ignore_index=True - ) - exp = pd.Series(exp_data3) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], - 
ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - # name mismatch - s1 = pd.Series(vals1, name="x") - s2 = pd.Series(vals2, name="y") - res = s1.append(s2, ignore_index=True) - exp = pd.Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # name match - s1 = pd.Series(vals1, name="x") - s2 = pd.Series(vals2, name="x") - res = s1.append(s2, ignore_index=True) - exp = pd.Series(exp_data, name="x") - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # cannot append non-index - msg = ( - r"cannot concatenate object of type '.+'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - pd.Series(vals1).append(vals2) - - with pytest.raises(TypeError, match=msg): - pd.Series(vals1).append([pd.Series(vals2), vals3]) - - with pytest.raises(TypeError, match=msg): - pd.concat([pd.Series(vals1), vals2]) - - with pytest.raises(TypeError, match=msg): - pd.concat([pd.Series(vals1), pd.Series(vals2), vals3]) - - def test_concatlike_dtypes_coercion(self): - # GH 13660 - for typ1, vals1 in self.data.items(): - for typ2, vals2 in self.data.items(): - - vals3 = vals2 - - # basically infer - exp_index_dtype = None - exp_series_dtype = None - - if typ1 == typ2: - # same dtype is tested in test_concatlike_same_dtypes - continue - elif typ1 == "category" or typ2 == "category": - # TODO: suspicious - continue - - # specify expected dtype - if typ1 == "bool" and typ2 in ("int64", "float64"): - # series coerces to numeric based on numpy rule - # index doesn't because bool is object dtype - exp_series_dtype = typ2 - elif typ2 == "bool" and typ1 in ("int64", "float64"): - exp_series_dtype = typ1 - elif ( - typ1 == "datetime64[ns, US/Eastern]" - or typ2 == "datetime64[ns, US/Eastern]" - or typ1 == "timedelta64[ns]" - or typ2 == "timedelta64[ns]" - ): - exp_index_dtype = object - exp_series_dtype = object - - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = pd.Index(vals1).append(pd.Index(vals2)) - exp = pd.Index(exp_data, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # 3 elements - res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) - exp = pd.Index(exp_data3, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # ----- Series ----- # - - # series.append - res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) - exp = pd.Series(exp_data, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = pd.Series(vals1).append( - [pd.Series(vals2), pd.Series(vals3)], ignore_index=True - ) - exp = pd.Series(exp_data3, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - def test_concatlike_common_coerce_to_pandas_object(self): - # GH 13626 - # result must be Timestamp/Timedelta, not datetime.datetime/timedelta - dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) - tdi = pd.TimedeltaIndex(["1 days", "2 days"]) - - exp = pd.Index( - [ - pd.Timestamp("2011-01-01"), - 
pd.Timestamp("2011-01-02"), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - ] - ) - - res = dti.append(tdi) - tm.assert_index_equal(res, exp) - assert isinstance(res[0], pd.Timestamp) - assert isinstance(res[-1], pd.Timedelta) - - dts = pd.Series(dti) - tds = pd.Series(tdi) - res = dts.append(tds) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - assert isinstance(res.iloc[0], pd.Timestamp) - assert isinstance(res.iloc[-1], pd.Timedelta) - - res = pd.concat([dts, tds]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - assert isinstance(res.iloc[0], pd.Timestamp) - assert isinstance(res.iloc[-1], pd.Timedelta) - - def test_concatlike_datetimetz(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH 7795 - dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) - dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) - - exp = pd.DatetimeIndex( - ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz - ) - - res = dti1.append(dti2) - tm.assert_index_equal(res, exp) - - dts1 = pd.Series(dti1) - dts2 = pd.Series(dti2) - res = dts1.append(dts2) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts2]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) - def test_concatlike_datetimetz_short(self, tz): - # GH#7795 - ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) - ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) - df1 = pd.DataFrame(0, index=ix1, columns=["A", "B"]) - df2 = pd.DataFrame(0, index=ix2, columns=["A", "B"]) - - exp_idx = pd.DatetimeIndex( - ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], - tz=tz, - ) - exp = pd.DataFrame(0, index=exp_idx, columns=["A", "B"]) - - tm.assert_frame_equal(df1.append(df2), exp) - tm.assert_frame_equal(pd.concat([df1, df2]), exp) - - def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH 13660 - - # different tz coerces to object - dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) - dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) - - exp = pd.Index( - [ - pd.Timestamp("2011-01-01", tz=tz), - pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-02"), - ], - dtype=object, - ) - - res = dti1.append(dti2) - tm.assert_index_equal(res, exp) - - dts1 = pd.Series(dti1) - dts2 = pd.Series(dti2) - res = dts1.append(dts2) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts2]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") - - exp = pd.Index( - [ - pd.Timestamp("2011-01-01", tz=tz), - pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), - ], - dtype=object, - ) - - res = dti1.append(dti3) - # tm.assert_index_equal(res, exp) - - dts1 = pd.Series(dti1) - dts3 = pd.Series(dti3) - res = dts1.append(dts3) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([dts1, dts3]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period(self): - # GH 13660 - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") - - exp = pd.PeriodIndex(["2011-01", "2011-02", 
"2012-01", "2012-02"], freq="M") - - res = pi1.append(pi2) - tm.assert_index_equal(res, exp) - - ps1 = pd.Series(pi1) - ps2 = pd.Series(pi2) - res = ps1.append(ps2) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, ps2]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period_diff_freq_to_object(self): - # GH 13221 - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") - - exp = pd.Index( - [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2012-01-01", freq="D"), - pd.Period("2012-02-01", freq="D"), - ], - dtype=object, - ) - - res = pi1.append(pi2) - tm.assert_index_equal(res, exp) - - ps1 = pd.Series(pi1) - ps2 = pd.Series(pi2) - res = ps1.append(ps2) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, ps2]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - def test_concatlike_common_period_mixed_dt_to_object(self): - # GH 13221 - # different datetimelike - pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") - tdi = pd.TimedeltaIndex(["1 days", "2 days"]) - exp = pd.Index( - [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - ], - dtype=object, - ) - - res = pi1.append(tdi) - tm.assert_index_equal(res, exp) - - ps1 = pd.Series(pi1) - tds = pd.Series(tdi) - res = ps1.append(tds) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([ps1, tds]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - # inverse - exp = pd.Index( - [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - ], - dtype=object, - ) - - res = tdi.append(pi1) - tm.assert_index_equal(res, exp) - - ps1 = pd.Series(pi1) - tds = pd.Series(tdi) - res = tds.append(ps1) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - res = pd.concat([tds, ps1]) - tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - - def test_concat_categorical(self): - # GH 13524 - - # same categories -> category - s1 = pd.Series([1, 2, np.nan], dtype="category") - s2 = pd.Series([2, 1, 2], dtype="category") - - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="category") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # partially different categories => not-category - s1 = pd.Series([3, 2], dtype="category") - s2 = pd.Series([2, 1], dtype="category") - - exp = pd.Series([3, 2, 2, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # completely different categories (same dtype) => not-category - s1 = pd.Series([10, 11, np.nan], dtype="category") - s2 = pd.Series([np.nan, 1, 3, 2], dtype="category") - - exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - def test_union_categorical_same_categories_different_order(self): - # https://github.com/pandas-dev/pandas/issues/19096 - a = pd.Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) - b = pd.Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) - result = pd.concat([a, b], 
ignore_index=True) - expected = pd.Series( - Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) - ) - tm.assert_series_equal(result, expected) - - def test_concat_categorical_coercion(self): - # GH 13524 - - # category + not-category => not-category - s1 = pd.Series([1, 2, np.nan], dtype="category") - s2 = pd.Series([2, 1, 2]) - - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # result shouldn't be affected by 1st elem dtype - exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype="object") - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # all values are not in category => not-category - s1 = pd.Series([3, 2], dtype="category") - s2 = pd.Series([2, 1]) - - exp = pd.Series([3, 2, 2, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = pd.Series([2, 1, 3, 2]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # completely different categories => not-category - s1 = pd.Series([10, 11, np.nan], dtype="category") - s2 = pd.Series([1, 3, 2]) - - exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype="object") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype="object") - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # different dtype => not-category - s1 = pd.Series([10, 11, np.nan], dtype="category") - s2 = pd.Series(["a", "b", "c"]) - - exp = pd.Series([10, 11, np.nan, "a", "b", "c"]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = pd.Series(["a", "b", "c", 10, 11, np.nan]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # if normal series only contains NaN-likes => not-category - s1 = pd.Series([10, 11], dtype="category") - s2 = pd.Series([np.nan, np.nan, np.nan]) - - exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - def test_concat_categorical_3elem_coercion(self): - # GH 13524 - - # mixed dtypes => not-category - s1 = pd.Series([1, 2, np.nan], dtype="category") - s2 = pd.Series([2, 1, 2], dtype="category") - s3 = pd.Series([1, 2, 1, 2, np.nan]) - - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - # values are all in either category => not-category - s1 = 
pd.Series([4, 5, 6], dtype="category") - s2 = pd.Series([1, 2, 3], dtype="category") - s3 = pd.Series([1, 3, 4]) - - exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - # values are all in either category => not-category - s1 = pd.Series([4, 5, 6], dtype="category") - s2 = pd.Series([1, 2, 3], dtype="category") - s3 = pd.Series([10, 11, 12]) - - exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) - tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - - exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) - tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) - - def test_concat_categorical_multi_coercion(self): - # GH 13524 - - s1 = pd.Series([1, 3], dtype="category") - s2 = pd.Series([3, 4], dtype="category") - s3 = pd.Series([2, 3]) - s4 = pd.Series([2, 2], dtype="category") - s5 = pd.Series([1, np.nan]) - s6 = pd.Series([1, 3, 2], dtype="category") - - # mixed dtype, values are all in categories => not-category - exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) - res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) - tm.assert_series_equal(res, exp) - res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) - res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) - tm.assert_series_equal(res, exp) - res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) - tm.assert_series_equal(res, exp) - - def test_concat_categorical_ordered(self): - # GH 13524 - - s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) - s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) - - exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - exp = pd.Series( - pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) - ) - tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) - - def test_concat_categorical_coercion_nan(self): - # GH 13524 - - # some edge cases - # category + not-category => not category - s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") - s2 = pd.Series([np.nan, 1]) - - exp = pd.Series([np.nan, np.nan, np.nan, 1]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - s1 = pd.Series([1, np.nan], dtype="category") - s2 = pd.Series([np.nan, np.nan]) - - exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="float") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - # mixed dtype, all nan-likes => not-category - s1 = pd.Series([np.nan, np.nan], dtype="category") - s2 = pd.Series([np.nan, np.nan]) - - exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - 
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - # all category nan-likes => category - s1 = pd.Series([np.nan, np.nan], dtype="category") - s2 = pd.Series([np.nan, np.nan], dtype="category") - - exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - def test_concat_categorical_empty(self): - # GH 13524 - - s1 = pd.Series([], dtype="category") - s2 = pd.Series([1, 2], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - - s1 = pd.Series([], dtype="category") - s2 = pd.Series([], dtype="category") - - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - - s1 = pd.Series([], dtype="category") - s2 = pd.Series([], dtype="object") - - # different dtype => not-category - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - - s1 = pd.Series([], dtype="category") - s2 = pd.Series([np.nan, np.nan]) - - # empty Series is ignored - exp = pd.Series([np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - - -class TestAppend: - def test_append(self, sort, float_frame): - mixed_frame = float_frame.copy() - mixed_frame["foo"] = "bar" - - begin_index = float_frame.index[:5] - end_index = float_frame.index[5:] - - begin_frame = float_frame.reindex(begin_index) - end_frame = float_frame.reindex(end_index) - - appended = begin_frame.append(end_frame) - tm.assert_almost_equal(appended["A"], float_frame["A"]) - - del end_frame["A"] - partial_appended = begin_frame.append(end_frame, sort=sort) - assert "A" in partial_appended - - partial_appended = end_frame.append(begin_frame, sort=sort) - assert "A" in partial_appended - - # mixed type handling - appended = mixed_frame[:5].append(mixed_frame[5:]) - tm.assert_frame_equal(appended, mixed_frame) - - # what to test here - mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) - mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) - - # all equal except 'foo' column - tm.assert_frame_equal( - mixed_appended.reindex(columns=["A", "B", "C", "D"]), - mixed_appended2.reindex(columns=["A", "B", "C", "D"]), - ) - - def test_append_empty(self, float_frame): - empty = DataFrame() - - appended = float_frame.append(empty) - tm.assert_frame_equal(float_frame, appended) - assert appended is not float_frame - - appended = empty.append(float_frame) - tm.assert_frame_equal(float_frame, appended) - assert appended is not float_frame - - def test_append_overlap_raises(self, float_frame): - msg = "Indexes have overlapping values" - with pytest.raises(ValueError, match=msg): - float_frame.append(float_frame, 
verify_integrity=True) - - def test_append_new_columns(self): - # see gh-6129: new columns - df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) - row = Series([5, 6, 7], index=["a", "b", "c"], name="z") - expected = DataFrame( - { - "a": {"x": 1, "y": 2, "z": 5}, - "b": {"x": 3, "y": 4, "z": 6}, - "c": {"z": 7}, - } - ) - result = df.append(row) - tm.assert_frame_equal(result, expected) - - def test_append_length0_frame(self, sort): - df = DataFrame(columns=["A", "B", "C"]) - df3 = DataFrame(index=[0, 1], columns=["A", "B"]) - df5 = df.append(df3, sort=sort) - - expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) - tm.assert_frame_equal(df5, expected) - - def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) - arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) - arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] - - df1 = DataFrame(arr1) - df2 = DataFrame(arr2) - - result = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate((arr1, arr2))) - tm.assert_frame_equal(result, expected) - - # rewrite sort fixture, since we also want to test default of None - def test_append_sorts(self, sort): - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) - df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) - - with tm.assert_produces_warning(None): - result = df1.append(df2, sort=sort) - - # for None / True - expected = pd.DataFrame( - {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, - columns=["a", "b", "c"], - ) - if sort is False: - expected = expected[["b", "a", "c"]] - tm.assert_frame_equal(result, expected) - - def test_append_different_columns(self, sort): - df = DataFrame( - { - "bools": np.random.randn(10) > 0, - "ints": np.random.randint(0, 10, 10), - "floats": np.random.randn(10), - "strings": ["foo", "bar"] * 5, - } - ) - - a = df[:5].loc[:, ["bools", "ints", "floats"]] - b = df[5:].loc[:, ["strings", "ints", "floats"]] - - appended = a.append(b, sort=sort) - assert isna(appended["strings"][0:4]).all() - assert isna(appended["bools"][5:]).all() - - def test_append_many(self, sort, float_frame): - chunks = [ - float_frame[:5], - float_frame[5:10], - float_frame[10:15], - float_frame[15:], - ] - - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result, float_frame) - - chunks[-1] = chunks[-1].copy() - chunks[-1]["foo"] = "bar" - result = chunks[0].append(chunks[1:], sort=sort) - tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) - assert (result["foo"][15:] == "bar").all() - assert result["foo"][:15].isna().all() - - def test_append_preserve_index_name(self): - # #980 - df1 = DataFrame(columns=["A", "B", "C"]) - df1 = df1.set_index(["A"]) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) - df2 = df2.set_index(["A"]) - - result = df1.append(df2) - assert result.index.name == "A" - - indexes_can_append = [ - pd.RangeIndex(3), - pd.Index([4, 5, 6]), - pd.Index([4.5, 5.5, 6.5]), - pd.Index(list("abc")), - pd.CategoricalIndex("A B C".split()), - pd.CategoricalIndex("D E F".split(), ordered=True), - pd.IntervalIndex.from_breaks([7, 8, 9, 10]), - pd.DatetimeIndex( - [ - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 3, 7, 12), - ] - ), - ] - - indexes_cannot_append_with_other = [ - pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) - ] - - all_indexes = indexes_can_append + indexes_cannot_append_with_other - - 
@pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) - def test_append_same_columns_type(self, index): - # GH18359 - - # df wider than ser - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) - ser_index = index[:2] - ser = pd.Series([7, 8], index=ser_index, name=2) - result = df.append(ser) - expected = pd.DataFrame( - [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index - ) - tm.assert_frame_equal(result, expected) - - # ser wider than df - ser_index = index - index = index[:2] - df = pd.DataFrame([[1, 2], [4, 5]], columns=index) - ser = pd.Series([7, 8, 9], index=ser_index, name=2) - result = df.append(ser) - expected = pd.DataFrame( - [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], - index=[0, 1, 2], - columns=ser_index, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "df_columns, series_index", - combinations(indexes_can_append, r=2), - ids=lambda x: type(x).__name__, - ) - def test_append_different_columns_types(self, df_columns, series_index): - # GH18359 - # See also test 'test_append_different_columns_types_raises' below - # for errors raised when appending - - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) - ser = pd.Series([7, 8, 9], index=series_index, name=2) - - result = df.append(ser) - idx_diff = ser.index.difference(df_columns) - combined_columns = Index(df_columns.tolist()).append(idx_diff) - expected = pd.DataFrame( - [ - [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], - [4, 5, 6, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 7, 8, 9], - ], - index=[0, 1, 2], - columns=combined_columns, - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ - ) - @pytest.mark.parametrize( - "index_cannot_append_with_other", - indexes_cannot_append_with_other, - ids=lambda x: type(x).__name__, - ) - def test_append_different_columns_types_raises( - self, index_can_append, index_cannot_append_with_other - ): - # GH18359 - # Dataframe.append will raise if MultiIndex appends - # or is appended to a different index type - # - # See also test 'test_append_different_columns_types' above for - # appending without raising. 
- - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) - ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) - msg = ( - r"Expected tuple, got (int|long|float|str|" - r"pandas._libs.interval.Interval)|" - r"object of type '(int|float|Timestamp|" - r"pandas._libs.interval.Interval)' has no len\(\)|" - ) - with pytest.raises(TypeError, match=msg): - df.append(ser) - - df = pd.DataFrame( - [[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other - ) - ser = pd.Series([7, 8, 9], index=index_can_append, name=2) - - with pytest.raises(TypeError, match=msg): - df.append(ser) - - def test_append_dtype_coerce(self, sort): - - # GH 4993 - # appending with datetime will incorrectly convert datetime64 - - df1 = DataFrame( - index=[1, 2], - data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], - columns=["start_time"], - ) - df2 = DataFrame( - index=[4, 5], - data=[ - [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], - ], - columns=["start_time", "end_time"], - ) - - expected = concat( - [ - Series( - [ - pd.NaT, - pd.NaT, - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10), - ], - name="end_time", - ), - Series( - [ - dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0), - ], - name="start_time", - ), - ], - axis=1, - sort=sort, - ) - result = df1.append(df2, ignore_index=True, sort=sort) - if sort: - expected = expected[["end_time", "start_time"]] - else: - expected = expected[["start_time", "end_time"]] - - tm.assert_frame_equal(result, expected) - - def test_append_missing_column_proper_upcast(self, sort): - df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) - df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) - - appended = df1.append(df2, ignore_index=True, sort=sort) - assert appended["A"].dtype == "f8" - assert appended["B"].dtype == "O" - - def test_append_empty_frame_to_series_with_dateutil_tz(self): - # GH 23682 - date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) - s = Series({"date": date, "a": 1.0, "b": 2.0}) - df = DataFrame(columns=["c", "d"]) - result_a = df.append(s, ignore_index=True) - expected = DataFrame( - [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] - ) - # These columns get cast to object after append - expected["c"] = expected["c"].astype(object) - expected["d"] = expected["d"].astype(object) - tm.assert_frame_equal(result_a, expected) - - expected = DataFrame( - [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] - ) - expected["c"] = expected["c"].astype(object) - expected["d"] = expected["d"].astype(object) - - result_b = result_a.append(s, ignore_index=True) - tm.assert_frame_equal(result_b, expected) - - # column order is different - expected = expected[["c", "d", "date", "a", "b"]] - result = df.append([s, s], ignore_index=True) - tm.assert_frame_equal(result, expected) - - def test_append_empty_tz_frame_with_datetime64ns(self): - # https://github.com/pandas-dev/pandas/issues/35460 - df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - - # pd.NaT gets inferred as tz-naive, so append result is tz-naive - result = df.append({"a": pd.NaT}, ignore_index=True) - expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - # also test with typed value to append - df = 
pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - result = df.append( - pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True - ) - expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - -class TestConcatenate: - def test_concat_copy(self): - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) - df3 = DataFrame({5: "foo"}, index=range(4)) - - # These are actual copies. - result = concat([df, df2, df3], axis=1, copy=True) - - for b in result._mgr.blocks: - assert b.values.base is None - - # These are the same. - result = concat([df, df2, df3], axis=1, copy=False) - - for b in result._mgr.blocks: - if b.is_float: - assert b.values.base is df._mgr.blocks[0].values.base - elif b.is_integer: - assert b.values.base is df2._mgr.blocks[0].values.base - elif b.is_object: - assert b.values.base is not None - - # Float block was consolidated. - df4 = DataFrame(np.random.randn(4, 1)) - result = concat([df, df2, df3, df4], axis=1, copy=False) - for b in result._mgr.blocks: - if b.is_float: - assert b.values.base is None - elif b.is_integer: - assert b.values.base is df2._mgr.blocks[0].values.base - elif b.is_object: - assert b.values.base is not None - - def test_concat_with_group_keys(self): - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - # axis=0 - df = DataFrame(np.random.randn(3, 4)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1]) - exp_index = MultiIndex.from_arrays( - [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] - ) - expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1]) - exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) - expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) - tm.assert_frame_equal(result, expected) - - # axis=1 - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_specific_levels(self): - df = DataFrame(np.random.randn(10, 4)) - pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] - level = ["three", "two", "one", "zero"] - result = concat( - pieces, - axis=1, - keys=["one", "two", "three"], - levels=[level], - names=["group_key"], - ) - - tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) - tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) - - assert result.columns.names == ["group_key", None] - - def test_concat_dataframe_keys_bug(self, sort): - t1 = DataFrame( - {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} - ) - t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) - - # it works - result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) - assert list(result.columns) == [("t1", "value"), ("t2", "value")] - - def test_concat_series_partial_columns_names(self): - # GH10698 - foo = Series([1, 2], name="foo") - bar = Series([1, 2]) - baz = Series([4, 5]) - - result = concat([foo, bar, baz], axis=1) - expected = DataFrame( - {"foo": [1, 2], 0: [1, 
2], 1: [4, 5]}, columns=["foo", 0, 1] - ) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) - expected = DataFrame( - {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, - columns=["red", "blue", "yellow"], - ) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, ignore_index=True) - expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("mapping", ["mapping", "dict"]) - def test_concat_mapping(self, mapping, non_dict_mapping_subclass): - constructor = dict if mapping == "dict" else non_dict_mapping_subclass - frames = constructor( - { - "foo": DataFrame(np.random.randn(4, 3)), - "bar": DataFrame(np.random.randn(4, 3)), - "baz": DataFrame(np.random.randn(4, 3)), - "qux": DataFrame(np.random.randn(4, 3)), - } - ) - - sorted_keys = list(frames.keys()) - - result = concat(frames) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) - tm.assert_frame_equal(result, expected) - - result = concat(frames, axis=1) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) - tm.assert_frame_equal(result, expected) - - keys = ["baz", "foo", "bar"] - result = concat(frames, keys=keys) - expected = concat([frames[k] for k in keys], keys=keys) - tm.assert_frame_equal(result, expected) - - def test_concat_ignore_index(self, sort): - frame1 = DataFrame( - {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} - ) - frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) - frame1.index = Index(["x", "y", "z"]) - frame2.index = Index(["x", "y", "q"]) - - v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) - - nan = np.nan - expected = DataFrame( - [ - [nan, nan, nan, 4.3], - ["a", 1, 4.5, 5.2], - ["b", 2, 3.2, 2.2], - ["c", 3, 1.2, nan], - ], - index=Index(["q", "x", "y", "z"]), - ) - if not sort: - expected = expected.loc[["x", "y", "z", "q"]] - - tm.assert_frame_equal(v1, expected) - - @pytest.mark.parametrize( - "name_in1,name_in2,name_in3,name_out", - [ - ("idx", "idx", "idx", "idx"), - ("idx", "idx", None, None), - ("idx", None, None, None), - ("idx1", "idx2", None, None), - ("idx1", "idx1", "idx2", None), - ("idx1", "idx2", "idx3", None), - (None, None, None, None), - ], - ) - def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out): - # GH13475 - indices = [ - pd.Index(["a", "b", "c"], name=name_in1), - pd.Index(["b", "c", "d"], name=name_in2), - pd.Index(["c", "d", "e"], name=name_in3), - ] - frames = [ - pd.DataFrame({c: [0, 1, 2]}, index=i) - for i, c in zip(indices, ["x", "y", "z"]) - ] - result = pd.concat(frames, axis=1) - - exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out) - expected = pd.DataFrame( - { - "x": [0, 1, 2, np.nan, np.nan], - "y": [np.nan, 0, 1, 2, np.nan], - "z": [np.nan, np.nan, 0, 1, 2], - }, - index=exp_ind, - ) - - tm.assert_frame_equal(result, expected) - - def test_concat_multiindex_with_keys(self): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - frame = DataFrame( - np.random.randn(10, 3), - index=index, - columns=Index(["A", "B", "C"], name="exp"), - ) - result = concat([frame, frame], keys=[0, 1], names=["iteration"]) - - assert result.index.names == ("iteration",) + index.names - tm.assert_frame_equal(result.loc[0], frame) - 
tm.assert_frame_equal(result.loc[1], frame) - assert result.index.nlevels == 3 - - def test_concat_multiindex_with_tz(self): - # GH 6606 - df = DataFrame( - { - "dt": [ - datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3), - ], - "b": ["A", "B", "C"], - "c": [1, 2, 3], - "d": [4, 5, 6], - } - ) - df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) - df = df.set_index(["dt", "b"]) - - exp_idx1 = DatetimeIndex( - ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" - ) - exp_idx2 = Index(["A", "B", "C"] * 2, name="b") - exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame( - {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] - ) - - result = concat([df, df]) - tm.assert_frame_equal(result, expected) - - def test_concat_multiindex_with_none_in_index_names(self): - # GH 15787 - index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) - df = pd.DataFrame({"col": range(5)}, index=index, dtype=np.int32) - - result = concat([df, df], keys=[1, 2], names=["level2"]) - index = pd.MultiIndex.from_product( - [[1, 2], [1], range(5)], names=["level2", "level1", None] - ) - expected = pd.DataFrame( - {"col": list(range(5)) * 2}, index=index, dtype=np.int32 - ) - tm.assert_frame_equal(result, expected) - - result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) - level2 = [1] * 5 + [2] * 2 - level1 = [1] * 7 - no_name = list(range(5)) + list(range(2)) - tuples = list(zip(level2, level1, no_name)) - index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) - expected = pd.DataFrame({"col": no_name}, index=index, dtype=np.int32) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_and_levels(self): - df = DataFrame(np.random.randn(1, 3)) - df2 = DataFrame(np.random.randn(1, 4)) - - levels = [["foo", "baz"], ["one", "two"]] - names = ["first", "second"] - result = concat( - [df, df2, df, df2], - keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], - levels=levels, - names=names, - ) - expected = concat([df, df2, df, df2]) - exp_index = MultiIndex( - levels=levels + [[0]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], - names=names + [None], - ) - expected.index = exp_index - - tm.assert_frame_equal(result, expected) - - # no names - result = concat( - [df, df2, df, df2], - keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], - levels=levels, - ) - assert result.index.names == (None,) * 3 - - # no levels - result = concat( - [df, df2, df, df2], - keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], - names=["first", "second"], - ) - assert result.index.names == ("first", "second", None) - tm.assert_index_equal( - result.index.levels[0], Index(["baz", "foo"], name="first") - ) - - def test_concat_keys_levels_no_overlap(self): - # GH #1406 - df = DataFrame(np.random.randn(1, 3), index=["a"]) - df2 = DataFrame(np.random.randn(1, 4), index=["b"]) - - msg = "Values not found in passed level" - with pytest.raises(ValueError, match=msg): - concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) - - msg = "Key one not in level" - with pytest.raises(ValueError, match=msg): - concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) - - def test_concat_rename_index(self): - a = DataFrame( - np.random.rand(3, 3), - columns=list("ABC"), - index=Index(list("abc"), name="index_a"), - ) - b = DataFrame( - np.random.rand(3, 3), - columns=list("ABC"), - index=Index(list("abc"), 
name="index_b"), - ) - - result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) - - exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) - names = list(exp.index.names) - names[1] = "lvl1" - exp.index.set_names(names, inplace=True) - - tm.assert_frame_equal(result, exp) - assert result.index.names == exp.index.names - - def test_crossed_dtypes_weird_corner(self): - columns = ["A", "B", "C", "D"] - df1 = DataFrame( - { - "A": np.array([1, 2, 3, 4], dtype="f8"), - "B": np.array([1, 2, 3, 4], dtype="i8"), - "C": np.array([1, 2, 3, 4], dtype="f8"), - "D": np.array([1, 2, 3, 4], dtype="i8"), - }, - columns=columns, - ) - - df2 = DataFrame( - { - "A": np.array([1, 2, 3, 4], dtype="i8"), - "B": np.array([1, 2, 3, 4], dtype="f8"), - "C": np.array([1, 2, 3, 4], dtype="i8"), - "D": np.array([1, 2, 3, 4], dtype="f8"), - }, - columns=columns, - ) - - appended = df1.append(df2, ignore_index=True) - expected = DataFrame( - np.concatenate([df1.values, df2.values], axis=0), columns=columns - ) - tm.assert_frame_equal(appended, expected) - - df = DataFrame(np.random.randn(1, 3), index=["a"]) - df2 = DataFrame(np.random.randn(1, 4), index=["b"]) - result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) - assert result.index.names == ("first", "second") - - def test_dups_index(self): - # GH 4771 - - # single dtypes - df = DataFrame( - np.random.randint(0, 10, size=40).reshape(10, 4), - columns=["A", "A", "C", "C"], - ) - - result = concat([df, df], axis=1) - tm.assert_frame_equal(result.iloc[:, :4], df) - tm.assert_frame_equal(result.iloc[:, 4:], df) - - result = concat([df, df], axis=0) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) - - # multi dtypes - df = concat( - [ - DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), - DataFrame( - np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] - ), - ], - axis=1, - ) - - result = concat([df, df], axis=1) - tm.assert_frame_equal(result.iloc[:, :6], df) - tm.assert_frame_equal(result.iloc[:, 6:], df) - - result = concat([df, df], axis=0) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) - - # append - result = df.iloc[0:8, :].append(df.iloc[8:]) - tm.assert_frame_equal(result, df) - - result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) - tm.assert_frame_equal(result, df) - - expected = concat([df, df], axis=0) - result = df.append(df) - tm.assert_frame_equal(result, expected) - - def test_with_mixed_tuples(self, sort): - # 10697 - # columns have mixed tuples, so handle properly - df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) - df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) - - # it works - concat([df1, df2], sort=sort) - - def test_handle_empty_objects(self, sort): - df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) - - baz = df[:5].copy() - baz["foo"] = "bar" - empty = df[5:5] - - frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0, sort=sort) - - expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) - expected["foo"] = expected["foo"].astype("O") - expected.loc[0:4, "foo"] = "bar" - - tm.assert_frame_equal(concatted, expected) - - # empty as first element with time series - # GH3259 - df = DataFrame( - dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") - ) - empty = DataFrame() - result = concat([df, empty], axis=1) - tm.assert_frame_equal(result, df) - result = concat([empty, df], axis=1) - 
tm.assert_frame_equal(result, df) - - result = concat([df, empty]) - tm.assert_frame_equal(result, df) - result = concat([empty, df]) - tm.assert_frame_equal(result, df) - - def test_concat_mixed_objs(self): - - # concat mixed series/frames - # G2385 - - # axis 1 - index = date_range("01-Jan-2013", periods=10, freq="H") - arr = np.arange(10, dtype="int64") - s1 = Series(arr, index=index) - s2 = Series(arr, index=index) - df = DataFrame(arr.reshape(-1, 1), index=index) - - expected = DataFrame( - np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] - ) - result = concat([df, df], axis=1) - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] - ) - result = concat([s1, s2], axis=1) - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] - ) - result = concat([s1, s2, s1], axis=1) - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] - ) - result = concat([s1, df, s2, s2, s1], axis=1) - tm.assert_frame_equal(result, expected) - - # with names - s1.name = "foo" - expected = DataFrame( - np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] - ) - result = concat([s1, df, s2], axis=1) - tm.assert_frame_equal(result, expected) - - s2.name = "bar" - expected = DataFrame( - np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] - ) - result = concat([s1, df, s2], axis=1) - tm.assert_frame_equal(result, expected) - - # ignore index - expected = DataFrame( - np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] - ) - result = concat([s1, df, s2], axis=1, ignore_index=True) - tm.assert_frame_equal(result, expected) - - # axis 0 - expected = DataFrame( - np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] - ) - result = concat([s1, df, s2]) - tm.assert_frame_equal(result, expected) - - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) - result = concat([s1, df, s2], ignore_index=True) - tm.assert_frame_equal(result, expected) - - def test_empty_dtype_coerce(self): - - # xref to #12411 - # xref to #12045 - # xref to #11594 - # see below - - # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) - df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) - result = concat([df1, df2]) - expected = df1.dtypes - tm.assert_series_equal(result.dtypes, expected) - - def test_dtype_coerceion(self): - - # 12411 - df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) - - result = concat([df.iloc[[0]], df.iloc[[1]]]) - tm.assert_series_equal(result.dtypes, df.dtypes) - - # 12045 - import datetime - - df = DataFrame( - {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} - ) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - tm.assert_series_equal(result.dtypes, df.dtypes) - - # 11594 - df = DataFrame({"text": ["some words"] + [None] * 9}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - tm.assert_series_equal(result.dtypes, df.dtypes) - - def test_concat_series(self): - - ts = tm.makeTimeSeries() - ts.name = "foo" - - pieces = [ts[:5], ts[5:15], ts[15:]] - - result = concat(pieces) - tm.assert_series_equal(result, ts) - assert result.name == ts.name - - result = concat(pieces, keys=[0, 1, 2]) - expected = ts.copy() - - ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) - - exp_codes = [np.repeat([0, 1, 
2], [len(x) for x in pieces]), np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) - expected.index = exp_index - tm.assert_series_equal(result, expected) - - def test_concat_series_axis1(self, sort=sort): - ts = tm.makeTimeSeries() - - pieces = [ts[:-2], ts[2:], ts[2:-2]] - - result = concat(pieces, axis=1) - expected = DataFrame(pieces).T - tm.assert_frame_equal(result, expected) - - result = concat(pieces, keys=["A", "B", "C"], axis=1) - expected = DataFrame(pieces, index=["A", "B", "C"]).T - tm.assert_frame_equal(result, expected) - - # preserve series names, #2489 - s = Series(randn(5), name="A") - s2 = Series(randn(5), name="B") - - result = concat([s, s2], axis=1) - expected = DataFrame({"A": s, "B": s2}) - tm.assert_frame_equal(result, expected) - - s2.name = None - result = concat([s, s2], axis=1) - tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) - - # must reindex, #2603 - s = Series(randn(3), index=["c", "a", "b"], name="A") - s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") - result = concat([s, s2], axis=1, sort=sort) - expected = DataFrame({"A": s, "B": s2}) - tm.assert_frame_equal(result, expected) - - def test_concat_series_axis1_names_applied(self): - # ensure names argument is not ignored on axis=1, #23490 - s = Series([1, 2, 3]) - s2 = Series([4, 5, 6]) - result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) - expected = DataFrame( - [[1, 4], [2, 5], [3, 6]], columns=pd.Index(["a", "b"], name="A") - ) - tm.assert_frame_equal(result, expected) - - result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) - expected = DataFrame( - [[1, 4], [2, 5], [3, 6]], - columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), - ) - tm.assert_frame_equal(result, expected) - - def test_concat_single_with_key(self): - df = DataFrame(np.random.randn(10, 4)) - - result = concat([df], keys=["foo"]) - expected = concat([df, df], keys=["foo", "bar"]) - tm.assert_frame_equal(result, expected[:10]) - - def test_concat_exclude_none(self): - df = DataFrame(np.random.randn(10, 4)) - - pieces = [df[:5], None, None, df[5:]] - result = concat(pieces) - tm.assert_frame_equal(result, df) - with pytest.raises(ValueError, match="All objects passed were None"): - concat([None, None]) - - def test_concat_datetime64_block(self): - from pandas.core.indexes.datetimes import date_range - - rng = date_range("1/1/2000", periods=10) - - df = DataFrame({"time": rng}) - - result = concat([df, df]) - assert (result.iloc[:10]["time"] == rng).all() - assert (result.iloc[10:]["time"] == rng).all() - - def test_concat_timedelta64_block(self): - from pandas import to_timedelta - - rng = to_timedelta(np.arange(10), unit="s") - - df = DataFrame({"time": rng}) - - result = concat([df, df]) - assert (result.iloc[:10]["time"] == rng).all() - assert (result.iloc[10:]["time"] == rng).all() - - def test_concat_keys_with_none(self): - # #1649 - df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) - - result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) - expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) - tm.assert_frame_equal(result, expected) - - result = concat( - [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] - ) - expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) - tm.assert_frame_equal(result, expected) - - def test_concat_bug_1719(self): - ts1 = tm.makeTimeSeries() - ts2 = tm.makeTimeSeries()[::2] - - # to join with union - # these 
two are of different length! - left = concat([ts1, ts2], join="outer", axis=1) - right = concat([ts2, ts1], join="outer", axis=1) - - assert len(left) == len(right) - - def test_concat_bug_2972(self): - ts0 = Series(np.zeros(5)) - ts1 = Series(np.ones(5)) - ts0.name = ts1.name = "same name" - result = concat([ts0, ts1], axis=1) - - expected = DataFrame({0: ts0, 1: ts1}) - expected.columns = ["same name", "same name"] - tm.assert_frame_equal(result, expected) - - def test_concat_bug_3602(self): - - # GH 3602, duplicate columns - df1 = DataFrame( - { - "firmNo": [0, 0, 0, 0], - "prc": [6, 6, 6, 6], - "stringvar": ["rrr", "rrr", "rrr", "rrr"], - } - ) - df2 = DataFrame( - {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} - ) - expected = DataFrame( - [ - [0, 6, "rrr", 9, 1, 6], - [0, 6, "rrr", 10, 2, 6], - [0, 6, "rrr", 11, 3, 6], - [0, 6, "rrr", 12, 4, 6], - ] - ) - expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] - - result = concat([df1, df2], axis=1) - tm.assert_frame_equal(result, expected) - - def test_concat_inner_join_empty(self): - # GH 15328 - df_empty = pd.DataFrame() - df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") - df_expected = pd.DataFrame({"a": []}, index=[], dtype="int64") - - for how, expected in [("inner", df_expected), ("outer", df_a)]: - result = pd.concat([df_a, df_empty], axis=1, join=how) - tm.assert_frame_equal(result, expected) - - def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] - s1 = Series(randn(len(dates)), index=dates, name="value") - s2 = Series(randn(len(dates)), index=dates, name="value") - - result = concat([s1, s2], axis=1, ignore_index=True) - expected = Index([0, 1]) - - tm.assert_index_equal(result.columns, expected) - - def test_concat_iterables(self): - # GH8645 check concat works with tuples, list, generators, and weird - # stuff like deque and custom iterables - df1 = DataFrame([1, 2, 3]) - df2 = DataFrame([4, 5, 6]) - expected = DataFrame([1, 2, 3, 4, 5, 6]) - tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected) - tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected) - tm.assert_frame_equal( - concat((df for df in (df1, df2)), ignore_index=True), expected - ) - tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) - - class CustomIterator1: - def __len__(self) -> int: - return 2 - - def __getitem__(self, index): - try: - return {0: df1, 1: df2}[index] - except KeyError as err: - raise IndexError from err - - tm.assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) - - class CustomIterator2(abc.Iterable): - def __iter__(self): - yield df1 - yield df2 - - tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) - - def test_concat_invalid(self): - - # trying to concat a ndframe with a non-ndframe - df1 = tm.makeCustomDataframe(10, 2) - for obj in [1, dict(), [1, 2], (1, 2)]: - - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - concat([df1, obj]) - - def test_concat_invalid_first_argument(self): - df1 = tm.makeCustomDataframe(10, 2) - df2 = tm.makeCustomDataframe(10, 2) - msg = ( - "first argument must be an iterable of pandas " - 'objects, you passed an object of type "DataFrame"' - ) - with pytest.raises(TypeError, match=msg): - concat(df1, df2) - - # generator ok though - 
concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) - - # text reader ok - # GH6583 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - reader = read_csv(StringIO(data), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - def test_concat_NaT_series(self): - # GH 11693 - # test for merging NaT series with datetime series. - x = Series( - date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") - ) - y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") - expected = Series([x[0], x[1], pd.NaT, pd.NaT]) - - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT with tz - expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") - result = pd.concat([y, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # without tz - x = pd.Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) - y = pd.Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) - y[:] = pd.NaT - expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT without tz - x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), dtype="datetime64[ns]") - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - def test_concat_tz_frame(self): - df2 = DataFrame( - dict( - A=pd.Timestamp("20130102", tz="US/Eastern"), - B=pd.Timestamp("20130603", tz="CET"), - ), - index=range(5), - ) - - # concat - df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) - tm.assert_frame_equal(df2, df3) - - def test_concat_tz_series(self): - # gh-11755: tz and no tz - x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) - y = Series(date_range("2012-01-01", "2012-01-02")) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # gh-11887: concat tz and object - x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) - y = Series(["a", "b"]) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # see gh-12217 and gh-12306 - # Concatenating two UTC times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize("UTC") - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize("UTC") - - result = pd.concat([first, second]) - assert result[0].dtype == "datetime64[ns, UTC]" - - # Concatenating two London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize("Europe/London") - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize("Europe/London") - - result = pd.concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" - - # Concatenating 2+1 London times - first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize("Europe/London") - - second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize("Europe/London") - - result = pd.concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" - - # Concat'ing 1+2 
London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize("Europe/London") - - second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize("Europe/London") - - result = pd.concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" - - def test_concat_tz_series_with_datetimelike(self): - # see gh-12620: tz and timedelta - x = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-02-01", tz="US/Eastern"), - ] - y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) - - # tz and period - y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) - - def test_concat_tz_series_tzlocal(self): - # see gh-13583 - x = [ - pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), - pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), - ] - y = [ - pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), - pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), - ] - - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" - - @pytest.mark.parametrize("tz1", [None, "UTC"]) - @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) - def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): - # GH 12396 - - # tz-naive - first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( - lambda x: x.dt.tz_localize(tz1) - ) - second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) - - result = pd.concat([first, second], axis=0) - expected = pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) - expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) - if tz1 != tz2: - expected = expected.astype(object) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("tz1", [None, "UTC"]) - @pytest.mark.parametrize("tz2", [None, "UTC"]) - def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): - # GH 12396 - - first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) - second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize(tz2), columns=[1]) - expected = pd.DataFrame( - { - 0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), - 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), - } - ) - result = pd.concat([first, second], axis=1) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("tz1", [None, "UTC"]) - @pytest.mark.parametrize("tz2", [None, "UTC"]) - def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): - # GH 12396 - - # tz-naive - first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) - second = pd.DataFrame( - [ - [pd.Timestamp("2015/01/01", tz=tz2)], - [pd.Timestamp("2016/01/01", tz=tz2)], - ], - index=[2, 3], - ) - - expected = pd.DataFrame( - [ - pd.NaT, - pd.NaT, - pd.Timestamp("2015/01/01", tz=tz2), - pd.Timestamp("2016/01/01", tz=tz2), - ] - ) - if tz1 != tz2: - expected = expected.astype(object) - - result = pd.concat([first, second]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_concat_NaT_dataframes(self, tz): - # GH 12396 - - first = pd.DataFrame([[pd.NaT], [pd.NaT]]) - first = first.apply(lambda x: 
x.dt.tz_localize(tz)) - second = pd.DataFrame( - [[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], - index=[2, 3], - ) - expected = pd.DataFrame( - [ - pd.NaT, - pd.NaT, - pd.Timestamp("2015/01/01", tz=tz), - pd.Timestamp("2016/01/01", tz=tz), - ] - ) - - result = pd.concat([first, second], axis=0) - tm.assert_frame_equal(result, expected) - - def test_concat_period_series(self): - x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) - y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) - expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - def test_concat_period_multiple_freq_series(self): - x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) - y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - assert result.dtype == "object" - - def test_concat_period_other_series(self): - x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) - y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - assert result.dtype == "object" - - # non-period - x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) - y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - assert result.dtype == "object" - - x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) - y = Series(["A", "B"]) - expected = Series([x[0], x[1], y[0], y[1]], dtype="object") - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - assert result.dtype == "object" - - def test_concat_empty_series(self): - # GH 11082 - s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name="y", dtype="float64") - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame( - {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, - index=pd.Index([0, 1, 2], dtype="O"), - ) - tm.assert_frame_equal(res, exp) - - s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name="y", dtype="float64") - res = pd.concat([s1, s2], axis=0) - # name will be reset - exp = pd.Series([1, 2, 3]) - tm.assert_series_equal(res, exp) - - # empty Series with no name - s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name=None, dtype="float64") - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame( - {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=["x", 0], - index=pd.Index([0, 1, 2], dtype="O"), - ) - tm.assert_frame_equal(res, exp) - - @pytest.mark.parametrize("tz", [None, "UTC"]) - @pytest.mark.parametrize("values", [[], [1, 2, 3]]) - def test_concat_empty_series_timelike(self, tz, values): - # GH 18447 - - first = Series([], dtype="M8[ns]").dt.tz_localize(tz) - dtype = None if values else np.float64 - second = Series(values, dtype=dtype) - - expected = DataFrame( - { - 0: pd.Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), - 1: values, - } - ) - result = concat([first, second], axis=1) - tm.assert_frame_equal(result, expected) - - def test_default_index(self): - # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series([4, 
5, 6], name="y") - res = pd.concat([s1, s2], axis=1, ignore_index=True) - assert isinstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - # use check_index_type=True to check the result have - # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - # is_series and all inputs have no names - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - res = pd.concat([s1, s2], axis=1, ignore_index=False) - assert isinstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - # is_dataframe and ignore_index - df1 = pd.DataFrame({"A": [1, 2], "B": [5, 6]}) - df2 = pd.DataFrame({"A": [3, 4], "B": [7, 8]}) - - res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - res = pd.concat([df1, df2], axis=1, ignore_index=True) - exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) - - def test_concat_multiindex_rangeindex(self): - # GH13542 - # when multi-index levels are RangeIndex objects - # there is a bug in concat with objects of len 1 - - df = DataFrame(np.random.randn(9, 2)) - df.index = MultiIndex( - levels=[pd.RangeIndex(3), pd.RangeIndex(3)], - codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], - ) - - res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) - exp = df.iloc[[2, 3, 4, 5], :] - tm.assert_frame_equal(res, exp) - - def test_concat_multiindex_dfs_with_deepcopy(self): - # GH 9967 - from copy import deepcopy - - example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) - example_dataframe1 = pd.DataFrame([0], index=example_multiindex1) - - example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) - example_dataframe2 = pd.DataFrame([1], index=example_multiindex2) - - example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} - expected_index = pd.MultiIndex( - levels=[["s1", "s2"], ["a"], ["b", "c"]], - codes=[[0, 1], [0, 0], [0, 1]], - names=["testname", None, None], - ) - expected = pd.DataFrame([[0], [1]], index=expected_index) - result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) - tm.assert_frame_equal(result_copy, expected) - result_no_copy = pd.concat(example_dict, names=["testname"]) - tm.assert_frame_equal(result_no_copy, expected) - - def test_categorical_concat_append(self): - cat = Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = DataFrame({"cats": cat, "vals": vals}) - cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) - - tm.assert_frame_equal(pd.concat([df, df]), exp) - tm.assert_frame_equal(df.append(df), exp) - - # GH 13524 can concat different categories - cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) - - res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) - tm.assert_frame_equal(res, exp) - - res = df.append(df_different_categories, ignore_index=True) - tm.assert_frame_equal(res, exp) - - def test_categorical_concat_dtypes(self): - - # GH8143 - index = ["cat", 
"obj", "num"] - cat = Categorical(["a", "b", "c"]) - obj = Series(["a", "b", "c"]) - num = Series([1, 2, 3]) - df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - - result = df.dtypes == "object" - expected = Series([False, True, False], index=index) - tm.assert_series_equal(result, expected) - - result = df.dtypes == "int64" - expected = Series([False, False, True], index=index) - tm.assert_series_equal(result, expected) - - result = df.dtypes == "category" - expected = Series([True, False, False], index=index) - tm.assert_series_equal(result, expected) - - def test_categorical_concat(self, sort): - # See GH 10177 - df1 = DataFrame( - np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] - ) - - df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) - - cat_values = ["one", "one", "two", "one", "two", "two", "one"] - df2["h"] = Series(Categorical(cat_values)) - - res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) - exp = DataFrame( - { - "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - "b": [ - 1, - 4, - 7, - 10, - 13, - 16, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], - "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], - "h": [None] * 6 + cat_values, - } - ) - tm.assert_frame_equal(res, exp) - - def test_categorical_concat_gh7864(self): - # GH 7864 - # make sure ordering is preserved - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) - df["grade"] = Categorical(df["raw_grade"]) - df["grade"].cat.set_categories(["e", "a", "b"]) - - df1 = df[0:3] - df2 = df[3:] - - tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) - tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) - - dfx = pd.concat([df1, df2]) - tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) - - dfa = df1.append(df2) - tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) - - def test_categorical_concat_preserve(self): - - # GH 8641 series concat not preserving category dtype - # GH 13524 can concat different categories - s = Series(list("abc"), dtype="category") - s2 = Series(list("abd"), dtype="category") - - exp = Series(list("abcabd")) - res = pd.concat([s, s2], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = Series(list("abcabc"), dtype="category") - res = pd.concat([s, s], ignore_index=True) - tm.assert_series_equal(res, exp) - - exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") - res = pd.concat([s, s]) - tm.assert_series_equal(res, exp) - - a = Series(np.arange(6, dtype="int64")) - b = Series(list("aabbca")) - - df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) - res = pd.concat([df2, df2]) - exp = DataFrame( - { - "A": pd.concat([a, a]), - "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), - } - ) - tm.assert_frame_equal(res, exp) - - def test_categorical_index_preserver(self): - - a = Series(np.arange(6, dtype="int64")) - b = Series(list("aabbca")) - - df2 = DataFrame( - {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} - ).set_index("B") - result = pd.concat([df2, df2]) - expected = DataFrame( - { - "A": pd.concat([a, a]), - "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), - } - ).set_index("B") - tm.assert_frame_equal(result, expected) - - # wrong categories - df3 = DataFrame( - {"A": a, "B": Categorical(b, categories=list("abe"))} - ).set_index("B") - msg = "categories must match existing categories when 
appending" - with pytest.raises(TypeError, match=msg): - pd.concat([df2, df3]) - - def test_concat_categoricalindex(self): - # GH 16111, categories that aren't lexsorted - categories = [9, 0, 1, 2, 3] - - a = pd.Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) - b = pd.Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) - c = pd.Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) - - result = pd.concat([a, b, c], axis=1) - - exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) - exp = pd.DataFrame( - { - 0: [1, 1, np.nan, np.nan], - 1: [np.nan, 2, 2, np.nan], - 2: [np.nan, np.nan, 3, 3], - }, - columns=[0, 1, 2], - index=exp_idx, - ) - tm.assert_frame_equal(result, exp) - - def test_concat_order(self): - # GH 17344 - dfs = [pd.DataFrame(index=range(3), columns=["a", 1, None])] - dfs += [ - pd.DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100) - ] - - result = pd.concat(dfs, sort=True).columns - expected = dfs[0].columns - tm.assert_index_equal(result, expected) - - def test_concat_datetime_timezone(self): - # GH 18523 - idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") - idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") - df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1) - df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2) - result = pd.concat([df1, df2], axis=1) - - exp_idx = ( - DatetimeIndex( - [ - "2011-01-01 00:00:00+01:00", - "2011-01-01 01:00:00+01:00", - "2011-01-01 02:00:00+01:00", - ], - freq="H", - ) - .tz_convert("UTC") - .tz_convert("Europe/Paris") - ) - - expected = pd.DataFrame( - [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] - ) - - tm.assert_frame_equal(result, expected) - - idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") - df3 = pd.DataFrame({"b": [1, 2, 3]}, index=idx3) - result = pd.concat([df1, df3], axis=1) - - exp_idx = DatetimeIndex( - [ - "2010-12-31 15:00:00+00:00", - "2010-12-31 16:00:00+00:00", - "2010-12-31 17:00:00+00:00", - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", - ] - ) - - expected = pd.DataFrame( - [ - [np.nan, 1], - [np.nan, 2], - [np.nan, 3], - [1, np.nan], - [2, np.nan], - [3, np.nan], - ], - index=exp_idx, - columns=["a", "b"], - ) - - tm.assert_frame_equal(result, expected) - - # GH 13783: Concat after resample - result = pd.concat( - [df1.resample("H").mean(), df2.resample("H").mean()], sort=True - ) - expected = pd.DataFrame( - {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, - index=idx1.append(idx1), - ) - tm.assert_frame_equal(result, expected) - - def test_concat_different_extension_dtypes_upcasts(self): - a = pd.Series(pd.core.arrays.integer_array([1, 2])) - b = pd.Series(to_decimal([1, 2])) - - result = pd.concat([a, b], ignore_index=True) - expected = pd.Series([1, 2, Decimal(1), Decimal(2)], dtype=object) - tm.assert_series_equal(result, expected) - - def test_concat_odered_dict(self): - # GH 21510 - expected = pd.concat( - [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] - ) - result = pd.concat( - dict([("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))]) - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize("dt", np.sctypes["float"]) -def test_concat_no_unnecessary_upcast(dt, pdt): - # GH 13247 - dims = pdt(dtype=object).ndim - - dfs = [ - pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], 
dtype=dt, ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims)), - ] - x = pd.concat(dfs) - assert x.values.dtype == dt - - -@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame]) -@pytest.mark.parametrize("dt", np.sctypes["int"]) -def test_concat_will_upcast(dt, pdt): - with catch_warnings(record=True): - dims = pdt().ndim - dfs = [ - pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims)), - ] - x = pd.concat(dfs) - assert x.values.dtype == "float64" - - -def test_concat_empty_and_non_empty_frame_regression(): - # GH 18178 regression test - df1 = pd.DataFrame({"foo": [1]}) - df2 = pd.DataFrame({"foo": []}) - expected = pd.DataFrame({"foo": [1.0]}) - result = pd.concat([df1, df2]) - tm.assert_frame_equal(result, expected) - - -def test_concat_empty_and_non_empty_series_regression(): - # GH 18187 regression test - s1 = pd.Series([1]) - s2 = pd.Series([], dtype=object) - - expected = s1 - result = pd.concat([s1, s2]) - tm.assert_series_equal(result, expected) - - -def test_concat_sorts_columns(sort): - # GH-4588 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) - df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) - - # for sort=True/None - expected = pd.DataFrame( - {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, - columns=["a", "b", "c"], - ) - - if sort is False: - expected = expected[["b", "a", "c"]] - - # default - with tm.assert_produces_warning(None): - result = pd.concat([df1, df2], ignore_index=True, sort=sort) - tm.assert_frame_equal(result, expected) - - -def test_concat_sorts_index(sort): - df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) - df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) - - # For True/None - expected = pd.DataFrame( - {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] - ) - if sort is False: - expected = expected.loc[["c", "a", "b"]] - - # Warn and sort by default - with tm.assert_produces_warning(None): - result = pd.concat([df1, df2], axis=1, sort=sort) - tm.assert_frame_equal(result, expected) - - -def test_concat_inner_sort(sort): - # https://github.com/pandas-dev/pandas/pull/20613 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) - df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) - - with tm.assert_produces_warning(None): - # unset sort should *not* warn for inner join - # since that never sorted - result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) - - expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) - if sort is True: - expected = expected[["a", "b"]] - tm.assert_frame_equal(result, expected) - - -def test_concat_aligned_sort(): - # GH-4588 - df = pd.DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) - result = pd.concat([df, df], sort=True, ignore_index=True) - expected = pd.DataFrame( - {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, - columns=["a", "b", "c"], - ) - tm.assert_frame_equal(result, expected) - - result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) - expected = expected[["b", "c"]] - tm.assert_frame_equal(result, expected) - - -def test_concat_aligned_sort_does_not_raise(): - # GH-4588 - # We catch TypeErrors from sorting internally and do not re-raise. 
- df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) - expected = pd.DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) - result = pd.concat([df, df], ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) -def test_concat_series_name_npscalar_tuple(s1name, s2name): - # GH21015 - s1 = pd.Series({"a": 1, "b": 2}, name=s1name) - s2 = pd.Series({"c": 5, "d": 6}, name=s2name) - result = pd.concat([s1, s2]) - expected = pd.Series({"a": 1, "b": 2, "c": 5, "d": 6}) - tm.assert_series_equal(result, expected) - - -def test_concat_categorical_tz(): - # GH-23816 - a = pd.Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) - b = pd.Series(["a", "b"], dtype="category") - result = pd.concat([a, b], ignore_index=True) - expected = pd.Series( - [ - pd.Timestamp("2017-01-01", tz="US/Pacific"), - pd.Timestamp("2017-01-02", tz="US/Pacific"), - "a", - "b", - ] - ) - tm.assert_series_equal(result, expected) - - -def test_concat_categorical_unchanged(): - # GH-12007 - # test fix for when concat on categorical and float - # coerces dtype categorical -> float - df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A")) - ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B") - result = pd.concat([df, ser], axis=1) - expected = pd.DataFrame( - { - "A": pd.Series(["a", "b", "c", np.nan], dtype="category"), - "B": pd.Series([0, 1, np.nan, 2], dtype="float"), - } - ) - tm.assert_equal(result, expected) - - -def test_concat_datetimeindex_freq(): - # GH 3232 - # Monotonic index result - dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") - data = list(range(100)) - expected = pd.DataFrame(data, index=dr) - result = pd.concat([expected[:50], expected[50:]]) - tm.assert_frame_equal(result, expected) - - # Non-monotonic index result - result = pd.concat([expected[50:], expected[:50]]) - expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) - expected.index._data.freq = None - tm.assert_frame_equal(result, expected) - - -def test_concat_empty_df_object_dtype(): - # GH 9149 - df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) - df_2 = pd.DataFrame(columns=df_1.columns) - result = pd.concat([df_1, df_2], axis=0) - expected = df_1.astype(object) - tm.assert_frame_equal(result, expected) - - -def test_concat_sparse(): - # GH 23557 - a = pd.Series(SparseArray([0, 1, 2])) - expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( - pd.SparseDtype(np.int64, 0) - ) - result = pd.concat([a, a], axis=1) - tm.assert_frame_equal(result, expected) - - -def test_concat_dense_sparse(): - # GH 30668 - a = pd.Series(pd.arrays.SparseArray([1, None]), dtype=float) - b = pd.Series([1], dtype=float) - expected = pd.Series(data=[1, None, 1], index=[0, 1, 0]).astype( - pd.SparseDtype(np.float64, None) - ) - result = pd.concat([a, b], axis=0) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("test_series", [True, False]) -def test_concat_copy_index(test_series, axis): - # GH 29879 - if test_series: - ser = Series([1, 2]) - comb = concat([ser, ser], axis=axis, copy=True) - assert comb.index is not ser.index - else: - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - comb = concat([df, df], axis=axis, copy=True) - assert comb.index is not df.index - assert comb.columns is not df.columns - - -def test_concat_multiindex_datetime_object_index(): - # 
https://github.com/pandas-dev/pandas/issues/11058 - s = Series( - ["a", "b"], - index=MultiIndex.from_arrays( - [[1, 2], Index([dt.date(2013, 1, 1), dt.date(2014, 1, 1)], dtype="object")], - names=["first", "second"], - ), - ) - s2 = Series( - ["a", "b"], - index=MultiIndex.from_arrays( - [[1, 2], Index([dt.date(2013, 1, 1), dt.date(2015, 1, 1)], dtype="object")], - names=["first", "second"], - ), - ) - expected = DataFrame( - [["a", "a"], ["b", np.nan], [np.nan, "b"]], - index=MultiIndex.from_arrays( - [ - [1, 2, 2], - DatetimeIndex( - ["2013-01-01", "2014-01-01", "2015-01-01"], - dtype="datetime64[ns]", - freq=None, - ), - ], - names=["first", "second"], - ), - ) - result = concat([s, s2], axis=1) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]]) -def test_duplicate_keys(keys): - # GH 33654 - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - s1 = Series([7, 8, 9], name="c") - s2 = Series([10, 11, 12], name="d") - result = concat([df, s1, s2], axis=1, keys=keys) - expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]] - expected_columns = pd.MultiIndex.from_tuples( - [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")] - ) - expected = DataFrame(expected_values, columns=expected_columns) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "obj", - [ - tm.SubclassedDataFrame({"A": np.arange(0, 10)}), - tm.SubclassedSeries(np.arange(0, 10), name="A"), - ], -) -def test_concat_preserves_subclass(obj): - # GH28330 -- preserve subclass - - result = concat([obj, obj]) - assert isinstance(result, type(obj)) - - -def test_concat_frame_axis0_extension_dtypes(): - # preserve extension dtype (through common_dtype mechanism) - df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")}) - df2 = pd.DataFrame({"a": np.array([4, 5, 6])}) - - result = pd.concat([df1, df2], ignore_index=True) - expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64") - tm.assert_frame_equal(result, expected) - - result = pd.concat([df2, df1], ignore_index=True) - expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") - tm.assert_frame_equal(result, expected) - - -def test_concat_preserves_extension_int64_dtype(): - # GH 24768 - df_a = pd.DataFrame({"a": [-1]}, dtype="Int64") - df_b = pd.DataFrame({"b": [1]}, dtype="Int64") - result = pd.concat([df_a, df_b], ignore_index=True) - expected = pd.DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 4d2195da85a13..8aa4012b3e77c 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -668,9 +668,16 @@ def test_cut_unordered_with_missing_labels_raises_error(): def test_cut_unordered_with_series_labels(): # https://github.com/pandas-dev/pandas/issues/36603 - s = pd.Series([1, 2, 3, 4, 5]) - bins = pd.Series([0, 2, 4, 6]) - labels = pd.Series(["a", "b", "c"]) + s = Series([1, 2, 3, 4, 5]) + bins = Series([0, 2, 4, 6]) + labels = Series(["a", "b", "c"]) result = pd.cut(s, bins=bins, labels=labels, ordered=False) - expected = pd.Series(["a", "a", "b", "b", "c"], dtype="category") + expected = Series(["a", "a", "b", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) + + +def test_cut_no_warnings(): + df = DataFrame({"value": np.random.randint(0, 100, 20)}) + labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] + with 
tm.assert_produces_warning(False): + df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 79879ef346f53..99beff39e8e09 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -15,7 +15,7 @@ def setup_method(self, method): self.var_name = "var" self.value_name = "val" - self.df1 = pd.DataFrame( + self.df1 = DataFrame( [ [1.067683, -1.110463, 0.20867], [-1.321405, 0.368915, -1.055342], @@ -310,7 +310,7 @@ def test_melt_missing_columns_raises(self): # attempted with column names absent from the dataframe # Generate data - df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + df = DataFrame(np.random.randn(5, 4), columns=list("abcd")) # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" @@ -634,7 +634,7 @@ class TestWideToLong: def test_simple(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame( + df = DataFrame( { "A1970": {0: "a", 1: "b", 2: "c"}, "A1980": {0: "d", 1: "e", 2: "f"}, @@ -658,7 +658,7 @@ def test_simple(self): def test_stubs(self): # GH9204 - df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) + df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) df.columns = ["id", "inc1", "inc2", "edu1", "edu2"] stubs = ["inc", "edu"] @@ -671,7 +671,7 @@ def test_separating_character(self): # GH14779 np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame( + df = DataFrame( { "A.1970": {0: "a", 1: "b", 2: "c"}, "A.1980": {0: "d", 1: "e", 2: "f"}, @@ -696,7 +696,7 @@ def test_separating_character(self): def test_escapable_characters(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame( + df = DataFrame( { "A(quarterly)1970": {0: "a", 1: "b", 2: "c"}, "A(quarterly)1980": {0: "d", 1: "e", 2: "f"}, @@ -722,7 +722,7 @@ def test_escapable_characters(self): def test_unbalanced(self): # test that we can have a varying amount of time variables - df = pd.DataFrame( + df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], @@ -738,14 +738,14 @@ def test_unbalanced(self): "id": [0, 0, 1, 1], "year": [2010, 2011, 2010, 2011], } - expected = pd.DataFrame(exp_data) + expected = DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_character_overlap(self): # Test we handle overlapping characters in both id_vars and value_vars - df = pd.DataFrame( + df = DataFrame( { "A11": ["a11", "a22", "a33"], "A12": ["a21", "a22", "a23"], @@ -758,7 +758,7 @@ def test_character_overlap(self): } ) df["id"] = df.index - expected = pd.DataFrame( + expected = DataFrame( { "BBBX": [91, 92, 93, 91, 92, 93], "BBBZ": [91, 92, 93, 91, 92, 93], @@ -776,7 +776,7 @@ def test_character_overlap(self): def test_invalid_separator(self): # if an invalid separator is supplied a empty data frame is returned sep = "nope!" 
- df = pd.DataFrame( + df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], @@ -795,7 +795,7 @@ def test_invalid_separator(self): "A": [], "B": [], } - expected = pd.DataFrame(exp_data).astype({"year": "int"}) + expected = DataFrame(exp_data).astype({"year": "int"}) expected = expected.set_index(["id", "year"])[ ["X", "A2010", "A2011", "B2010", "A", "B"] ] @@ -806,7 +806,7 @@ def test_invalid_separator(self): def test_num_string_disambiguation(self): # Test that we can disambiguate number value_vars from # string value_vars - df = pd.DataFrame( + df = DataFrame( { "A11": ["a11", "a22", "a33"], "A12": ["a21", "a22", "a23"], @@ -819,7 +819,7 @@ def test_num_string_disambiguation(self): } ) df["id"] = df.index - expected = pd.DataFrame( + expected = DataFrame( { "Arating": [91, 92, 93, 91, 92, 93], "Arating_old": [91, 92, 93, 91, 92, 93], @@ -839,7 +839,7 @@ def test_num_string_disambiguation(self): def test_invalid_suffixtype(self): # If all stubs names end with a string, but a numeric suffix is # assumed, an empty data frame is returned - df = pd.DataFrame( + df = DataFrame( { "Aone": [1.0, 2.0], "Atwo": [3.0, 4.0], @@ -858,7 +858,7 @@ def test_invalid_suffixtype(self): "A": [], "B": [], } - expected = pd.DataFrame(exp_data).astype({"year": "int"}) + expected = DataFrame(exp_data).astype({"year": "int"}) expected = expected.set_index(["id", "year"]) expected.index = expected.index.set_levels([0, 1], level=0) @@ -867,7 +867,7 @@ def test_invalid_suffixtype(self): def test_multiple_id_columns(self): # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm - df = pd.DataFrame( + df = DataFrame( { "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], @@ -875,7 +875,7 @@ def test_multiple_id_columns(self): "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], } ) - expected = pd.DataFrame( + expected = DataFrame( { "ht": [ 2.8, @@ -909,7 +909,7 @@ def test_multiple_id_columns(self): def test_non_unique_idvars(self): # GH16382 # Raise an error message if non unique id vars (i) are passed - df = pd.DataFrame( + df = DataFrame( {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]} ) msg = "the id variables need to uniquely identify each row" @@ -917,7 +917,7 @@ def test_non_unique_idvars(self): wide_to_long(df, ["A_A", "B_B"], i="x", j="colname") def test_cast_j_int(self): - df = pd.DataFrame( + df = DataFrame( { "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"], "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"], @@ -927,7 +927,7 @@ def test_cast_j_int(self): } ) - expected = pd.DataFrame( + expected = DataFrame( { "actor": [ "CCH Pounder", @@ -956,7 +956,7 @@ def test_cast_j_int(self): tm.assert_frame_equal(result, expected) def test_identical_stubnames(self): - df = pd.DataFrame( + df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], @@ -969,7 +969,7 @@ def test_identical_stubnames(self): wide_to_long(df, ["A", "B"], i="A", j="colname") def test_nonnumeric_suffix(self): - df = pd.DataFrame( + df = DataFrame( { "treatment_placebo": [1.0, 2.0], "treatment_test": [3.0, 4.0], @@ -977,7 +977,7 @@ def test_nonnumeric_suffix(self): "A": ["X1", "X2"], } ) - expected = pd.DataFrame( + expected = DataFrame( { "A": ["X1", "X1", "X2", "X2"], "colname": ["placebo", "test", "placebo", "test"], @@ -992,7 +992,7 @@ def test_nonnumeric_suffix(self): tm.assert_frame_equal(result, expected) def test_mixed_type_suffix(self): - df = pd.DataFrame( + df = DataFrame( { "A": ["X1", "X2"], "result_1": [0, 9], @@ -1001,7 +1001,7 @@ 
def test_mixed_type_suffix(self): "treatment_foo": [3.0, 4.0], } ) - expected = pd.DataFrame( + expected = DataFrame( { "A": ["X1", "X2", "X1", "X2"], "colname": ["1", "1", "foo", "foo"], @@ -1015,7 +1015,7 @@ def test_mixed_type_suffix(self): tm.assert_frame_equal(result, expected) def test_float_suffix(self): - df = pd.DataFrame( + df = DataFrame( { "treatment_1.1": [1.0, 2.0], "treatment_2.1": [3.0, 4.0], @@ -1024,7 +1024,7 @@ def test_float_suffix(self): "A": ["X1", "X2"], } ) - expected = pd.DataFrame( + expected = DataFrame( { "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], @@ -1060,8 +1060,8 @@ def test_warn_of_column_name_value(self): # GH34731 # raise a warning if the resultant value column name matches # a name in the dataframe already (default name is "value") - df = pd.DataFrame({"col": list("ABC"), "value": range(10, 16, 2)}) - expected = pd.DataFrame( + df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)}) + expected = DataFrame( [["A", "col", "A"], ["B", "col", "B"], ["C", "col", "C"]], columns=["value", "variable", "value"], ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 67b3151b0ff9c..642e6a691463e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import IS64 + import pandas as pd from pandas import ( Categorical, @@ -110,7 +112,7 @@ def test_pivot_table(self, observed): def test_pivot_table_categorical_observed_equal(self, observed): # issue #24923 - df = pd.DataFrame( + df = DataFrame( {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} ) @@ -229,7 +231,7 @@ def test_pivot_table_dropna_categoricals(self, dropna): def test_pivot_with_non_observable_dropna(self, dropna): # gh-21133 - df = pd.DataFrame( + df = DataFrame( { "A": pd.Categorical( [np.nan, "low", "high", "low", "high"], @@ -241,9 +243,9 @@ def test_pivot_with_non_observable_dropna(self, dropna): ) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = pd.DataFrame( + expected = DataFrame( {"B": [2, 3]}, - index=pd.Index( + index=Index( pd.Categorical.from_codes( [0, 1], categories=["low", "high"], ordered=True ), @@ -254,7 +256,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): tm.assert_frame_equal(result, expected) # gh-21378 - df = pd.DataFrame( + df = DataFrame( { "A": pd.Categorical( ["left", "low", "high", "low", "high"], @@ -266,9 +268,9 @@ def test_pivot_with_non_observable_dropna(self, dropna): ) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = pd.DataFrame( + expected = DataFrame( {"B": [2, 3, 0]}, - index=pd.Index( + index=Index( pd.Categorical.from_codes( [0, 1, 2], categories=["low", "high", "left"], ordered=True ), @@ -395,16 +397,14 @@ def test_pivot_no_values(self): idx = pd.DatetimeIndex( ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"] ) - df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) + df = DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) res = df.pivot_table(index=df.index.month, columns=df.index.day) exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)]) - exp = pd.DataFrame( - [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns - ) + exp = DataFrame([[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns) tm.assert_frame_equal(res, exp) - df = pd.DataFrame( + df = DataFrame( { "A": [1, 2, 3, 4, 5], "dt": pd.date_range("2011-01-01", freq="D", 
periods=5), @@ -416,13 +416,13 @@ def test_pivot_no_values(self): ) exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) exp_columns.names = [None, "dt"] - exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) + exp = DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) tm.assert_frame_equal(res, exp) res = df.pivot_table( index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M") ) - exp = pd.DataFrame( + exp = DataFrame( [3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns ) tm.assert_frame_equal(res, exp) @@ -577,7 +577,7 @@ def test_pivot_with_tz(self, method): def test_pivot_tz_in_values(self): # GH 14948 - df = pd.DataFrame( + df = DataFrame( [ { "uid": "aa", @@ -612,14 +612,14 @@ def test_pivot_tz_in_values(self): columns=[mins], aggfunc=np.min, ) - expected = pd.DataFrame( + expected = DataFrame( [ [ pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), ] ], - index=pd.Index(["aa"], name="uid"), + index=Index(["aa"], name="uid"), columns=pd.DatetimeIndex( [ pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"), @@ -693,10 +693,8 @@ def test_pivot_periods_with_margins(self): expected = DataFrame( data=1.0, - index=pd.Index([1, 2, "All"], name="a"), - columns=pd.Index( - [pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b" - ), + index=Index([1, 2, "All"], name="a"), + columns=Index([pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b"), ) result = df.pivot_table(index="a", columns="b", values="x", margins=True) @@ -707,14 +705,14 @@ def test_pivot_periods_with_margins(self): [ ["baz", "zoo"], np.array(["baz", "zoo"]), - pd.Series(["baz", "zoo"]), - pd.Index(["baz", "zoo"]), + Series(["baz", "zoo"]), + Index(["baz", "zoo"]), ], ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values(self, values, method): # issue #17160 - df = pd.DataFrame( + df = DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], @@ -743,14 +741,14 @@ def test_pivot_with_list_like_values(self, values, method): [ ["bar", "baz"], np.array(["bar", "baz"]), - pd.Series(["bar", "baz"]), - pd.Index(["bar", "baz"]), + Series(["bar", "baz"]), + Index(["bar", "baz"]), ], ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values_nans(self, values, method): # issue #17160 - df = pd.DataFrame( + df = DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], @@ -783,9 +781,7 @@ def test_pivot_with_list_like_values_nans(self, values, method): def test_pivot_columns_none_raise_error(self): # GH 30924 - df = pd.DataFrame( - {"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]} - ) + df = DataFrame({"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]}) msg = r"pivot\(\) missing 1 required argument: 'columns'" with pytest.raises(TypeError, match=msg): df.pivot(index="col1", values="col3") @@ -835,7 +831,7 @@ def test_pivot_with_multiindex(self, method): @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tuple_of_values(self, method): # issue #17160 - df = pd.DataFrame( + df = DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], @@ -941,7 +937,7 @@ def test_margin_with_only_columns_defined( self, columns, aggfunc, values, expected_columns ): # GH 31016 - df = pd.DataFrame( + df = DataFrame( { "A": ["foo", "foo", "foo", "foo", "foo", "bar", 
"bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], @@ -962,9 +958,7 @@ def test_margin_with_only_columns_defined( ) result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) - expected = pd.DataFrame( - values, index=Index(["D", "E"]), columns=expected_columns - ) + expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1655,9 +1649,7 @@ def test_monthly(self): rng = date_range("1/1/2000", "12/31/2004", freq="M") ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_table( - pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month - ) + annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) annual.columns = annual.columns.droplevel(0) month = ts.index.month @@ -1690,7 +1682,7 @@ def test_pivot_table_with_iterator_values(self): def test_pivot_table_margins_name_with_aggfunc_list(self): # GH 13354 margins_name = "Weekly" - costs = pd.DataFrame( + costs = DataFrame( { "item": ["bacon", "cheese", "bacon", "cheese"], "cost": [2.5, 4.5, 3.2, 3.3], @@ -1704,7 +1696,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): margins_name=margins_name, aggfunc=[np.mean, max], ) - ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item") + ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item") tups = [ ("mean", "cost", "M"), ("mean", "cost", "T"), @@ -1714,17 +1706,17 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): ("max", "cost", margins_name), ] cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"]) - expected = pd.DataFrame(table.values, index=ix, columns=cols) + expected = DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins(self, observed): # GH 10989 - df = pd.DataFrame( + df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) - expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) expected.index = Index([0, 1, "All"], name="y") expected.columns = Index([0, 1, "All"], name="z") @@ -1733,11 +1725,11 @@ def test_categorical_margins(self, observed): @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins_category(self, observed): - df = pd.DataFrame( + df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) - expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) expected.index = Index([0, 1, "All"], name="y") expected.columns = Index([0, 1, "All"], name="z") @@ -1748,7 +1740,7 @@ def test_categorical_margins_category(self, observed): def test_margins_casted_to_float(self, observed): # GH 24893 - df = pd.DataFrame( + df = DataFrame( { "A": [2, 4, 6, 8], "B": [1, 4, 5, 8], @@ -1758,9 +1750,9 @@ def test_margins_casted_to_float(self, observed): ) result = pd.pivot_table(df, index="D", margins=True) - expected = pd.DataFrame( + expected = DataFrame( {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, - index=pd.Index(["X", "Y", "All"], name="D"), + index=Index(["X", "Y", "All"], name="D"), ) tm.assert_frame_equal(result, expected) @@ -1768,7 +1760,7 @@ def test_pivot_with_categorical(self, observed, ordered): # gh-21370 idx = [np.nan, "low", "high", 
"low", np.nan] col = [np.nan, "A", "B", np.nan, "A"] - df = pd.DataFrame( + df = DataFrame( { "In": pd.Categorical(idx, categories=["low", "high"], ordered=ordered), "Col": pd.Categorical(col, categories=["A", "B"], ordered=ordered), @@ -1782,9 +1774,7 @@ def test_pivot_with_categorical(self, observed, ordered): expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col") - expected = pd.DataFrame( - data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols - ) + expected = DataFrame(data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols) expected.index = Index( pd.Categorical( ["low", "high"], categories=["low", "high"], ordered=ordered @@ -1797,7 +1787,7 @@ def test_pivot_with_categorical(self, observed, ordered): # case with columns/value result = df.pivot_table(columns="Col", values="Val", observed=observed) - expected = pd.DataFrame( + expected = DataFrame( data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"]) ) @@ -1805,7 +1795,7 @@ def test_pivot_with_categorical(self, observed, ordered): def test_categorical_aggfunc(self, observed): # GH 9534 - df = pd.DataFrame( + df = DataFrame( {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} ) df["C1"] = df["C1"].astype("category") @@ -1816,16 +1806,16 @@ def test_categorical_aggfunc(self, observed): expected_index = pd.CategoricalIndex( ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) - expected_columns = pd.Index(["a", "b"], name="C2") + expected_columns = Index(["a", "b"], name="C2") expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64) - expected = pd.DataFrame( + expected = DataFrame( expected_data, index=expected_index, columns=expected_columns ) tm.assert_frame_equal(result, expected) def test_categorical_pivot_index_ordering(self, observed): # GH 8731 - df = pd.DataFrame( + df = DataFrame( { "Sales": [100, 120, 220], "Month": ["January", "January", "January"], @@ -1859,7 +1849,7 @@ def test_categorical_pivot_index_ordering(self, observed): months, categories=months, ordered=False, name="Month" ) expected_data = [[320, 120]] + [[0, 0]] * 11 - expected = pd.DataFrame( + expected = DataFrame( expected_data, index=expected_index, columns=expected_columns ) if observed: @@ -1898,12 +1888,12 @@ def test_pivot_table_not_series(self): def test_pivot_margins_name_unicode(self): # issue #13292 greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" - frame = pd.DataFrame({"foo": [1, 2, 3]}) + frame = DataFrame({"foo": [1, 2, 3]}) table = pd.pivot_table( frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) - index = pd.Index([1, 2, 3, greek], dtype="object", name="foo") - expected = pd.DataFrame(index=index) + index = Index([1, 2, 3, greek], dtype="object", name="foo") + expected = DataFrame(index=index) tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): @@ -2001,7 +1991,7 @@ def test_pivot_number_of_levels_larger_than_int32(self): def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159 - df = pd.DataFrame( + df = DataFrame( { "fruit": ["apple", "peach", "apple"], "size": [1, 1, 2], @@ -2027,7 +2017,7 @@ def ret_none(x): [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]], names=[None, "fruit"], ) - expected = pd.DataFrame(data, index=["size", "taste"], columns=col) + expected = DataFrame(data, index=["size", "taste"], columns=col) if dropna: expected = expected.dropna(axis="columns") @@ -2036,15 +2026,15 @@ def ret_none(x): def test_pivot_table_aggfunc_scalar_dropna(self, dropna): # GH 22159 - df = pd.DataFrame( 
+ df = DataFrame( {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} ) result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] - col = pd.Index(["one", "two"], name="A") - expected = pd.DataFrame(data, index=["x", "y"], columns=col) + col = Index(["one", "two"], name="A") + expected = DataFrame(data, index=["x", "y"], columns=col) if dropna: expected = expected.dropna(axis="columns") @@ -2053,7 +2043,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): def test_pivot_table_empty_aggfunc(self): # GH 9186 - df = pd.DataFrame( + df = DataFrame( { "A": [2, 2, 3, 3, 2], "id": [5, 6, 7, 8, 9], @@ -2062,7 +2052,7 @@ def test_pivot_table_empty_aggfunc(self): } ) result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) - expected = pd.DataFrame() + expected = DataFrame() tm.assert_frame_equal(result, expected) def test_pivot_table_no_column_raises(self): @@ -2070,8 +2060,98 @@ def test_pivot_table_no_column_raises(self): def agg(l): return np.mean(l) - foo = pd.DataFrame( - {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} - ) + foo = DataFrame({"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]}) with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + + +class TestPivot: + def test_pivot(self): + data = { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + + frame = DataFrame(data) + pivoted = frame.pivot(index="index", columns="columns", values="values") + + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) + + expected.index.name, expected.columns.name = "index", "columns" + tm.assert_frame_equal(pivoted, expected) + + # name tracking + assert pivoted.index.name == "index" + assert pivoted.columns.name == "columns" + + # don't specify values + pivoted = frame.pivot(index="index", columns="columns") + assert pivoted.index.name == "index" + assert pivoted.columns.names == (None, "columns") + + def test_pivot_duplicates(self): + data = DataFrame( + { + "a": ["bar", "bar", "foo", "foo", "foo"], + "b": ["one", "two", "one", "one", "two"], + "c": [1.0, 2.0, 3.0, 3.0, 4.0], + } + ) + with pytest.raises(ValueError, match="duplicate entries"): + data.pivot("a", "b", "c") + + @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") + def test_pivot_empty(self): + df = DataFrame(columns=["a", "b", "c"]) + result = df.pivot("a", "b", "c") + expected = DataFrame() + tm.assert_frame_equal(result, expected, check_names=False) + + def test_pivot_integer_bug(self): + df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) + + result = df.pivot(index=1, columns=0, values=2) + repr(result) + tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) + + def test_pivot_index_none(self): + # GH#3962 + data = { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + + frame = DataFrame(data).set_index("index") + result = frame.pivot(columns="columns", values="values") + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) + + expected.index.name, expected.columns.name = "index", "columns" + tm.assert_frame_equal(result, expected) + + # omit values + result = frame.pivot(columns="columns") + + 
expected.columns = pd.MultiIndex.from_tuples( + [("values", "One"), ("values", "Two")], names=[None, "columns"] + ) + expected.index.name = "index" + tm.assert_frame_equal(result, expected, check_names=False) + assert result.index.name == "index" + assert result.columns.names == (None, "columns") + expected.columns = expected.columns.droplevel(0) + result = frame.pivot(columns="columns", values="values") + + expected.columns.name = "columns" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 8918d19e4ba7b..f6a4f8c0cf601 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -331,7 +331,7 @@ def test_union_categoricals_sort_false(self): def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(["a", "b"]) - c2 = pd.Series(["b", "c"], dtype="category") + c2 = Series(["b", "c"], dtype="category") result = union_categoricals([c1, c2]) expected = Categorical(["a", "b", "b", "c"]) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 23fb25b838da6..06bdb8a6cf0a2 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -228,6 +228,14 @@ def test_overflow_on_construction(): ("P0DT0H0M0.001S", Timedelta(milliseconds=1)), ("P0DT0H1M0S", Timedelta(minutes=1)), ("P1DT25H61M61S", Timedelta(days=1, hours=25, minutes=61, seconds=61)), + ("PT1S", Timedelta(seconds=1)), + ("PT0S", Timedelta(seconds=0)), + ("P1WT0S", Timedelta(days=7, seconds=0)), + ("P1D", Timedelta(days=1)), + ("P1DT1H", Timedelta(days=1, hours=1)), + ("P1W", Timedelta(days=7)), + ("PT300S", Timedelta(seconds=300)), + ("P1DT0H0M00000000000S", Timedelta(days=1)), ], ) def test_iso_constructor(fmt, exp): @@ -241,7 +249,6 @@ def test_iso_constructor(fmt, exp): "PDTHMS", "P0DT999H999M999S", "P1DT0H0M0.0000000000000S", - "P1DT0H0M00000000000S", "P1DT0H0M0.S", ], ) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index cee7ac450e411..92675f387fe1e 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -26,6 +26,7 @@ def test_properties_business(self): ts = Timestamp("2017-10-01", freq="B") control = Timestamp("2017-10-01") assert ts.dayofweek == 6 + assert ts.day_of_week == 6 assert not ts.is_month_start # not a weekday assert not ts.is_quarter_start # not a weekday # Control case: non-business is month/qtr start @@ -35,6 +36,7 @@ def test_properties_business(self): ts = Timestamp("2017-09-30", freq="B") control = Timestamp("2017-09-30") assert ts.dayofweek == 5 + assert ts.day_of_week == 5 assert not ts.is_month_end # not a weekday assert not ts.is_quarter_end # not a weekday # Control case: non-business is month/qtr start @@ -61,8 +63,10 @@ def check(value, equal): check(ts.microsecond, 100) check(ts.nanosecond, 1) check(ts.dayofweek, 6) + check(ts.day_of_week, 6) check(ts.quarter, 2) check(ts.dayofyear, 130) + check(ts.day_of_year, 130) check(ts.week, 19) check(ts.daysinmonth, 31) check(ts.daysinmonth, 31) @@ -81,8 +85,10 @@ def check(value, equal): check(ts.microsecond, 0) check(ts.nanosecond, 0) check(ts.dayofweek, 2) + check(ts.day_of_week, 2) check(ts.quarter, 4) check(ts.dayofyear, 365) + check(ts.day_of_year, 365) check(ts.week, 1) check(ts.daysinmonth, 
31)
diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py
index ce8759c4ba76d..9096d2a1033e5 100644
--- a/pandas/tests/series/apply/test_series_apply.py
+++ b/pandas/tests/series/apply/test_series_apply.py
@@ -25,7 +25,7 @@ def test_apply(self, datetime_series):
         )
 
         # empty series
-        s = Series(dtype=object, name="foo", index=pd.Index([], name="bar"))
+        s = Series(dtype=object, name="foo", index=Index([], name="bar"))
         rs = s.apply(lambda x: x)
         tm.assert_series_equal(s, rs)
@@ -92,56 +92,56 @@ def func(x):
     def test_apply_box(self):
         # ufunc will not be boxed. Same test cases as the test_map_box
         vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "datetime64[ns]"
         # boxed value must be Timestamp instance
        res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
-        exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"])
+        exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
         tm.assert_series_equal(res, exp)
 
         vals = [
             pd.Timestamp("2011-01-01", tz="US/Eastern"),
             pd.Timestamp("2011-01-02", tz="US/Eastern"),
         ]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "datetime64[ns, US/Eastern]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
-        exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
+        exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
         tm.assert_series_equal(res, exp)
 
         # timedelta
         vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "timedelta64[ns]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.days}")
-        exp = pd.Series(["Timedelta_1", "Timedelta_2"])
+        exp = Series(["Timedelta_1", "Timedelta_2"])
         tm.assert_series_equal(res, exp)
 
         # period
         vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "Period[M]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}")
-        exp = pd.Series(["Period_M", "Period_M"])
+        exp = Series(["Period_M", "Period_M"])
         tm.assert_series_equal(res, exp)
 
     def test_apply_datetimetz(self):
         values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
             "Asia/Tokyo"
         )
-        s = pd.Series(values, name="XX")
+        s = Series(values, name="XX")
 
         result = s.apply(lambda x: x + pd.offsets.Day())
         exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize(
             "Asia/Tokyo"
         )
-        exp = pd.Series(exp_values, name="XX")
+        exp = Series(exp_values, name="XX")
         tm.assert_series_equal(result, exp)
 
         # change dtype
         # GH 14506 : Returned dtype changed from int32 to int64
         result = s.apply(lambda x: x.hour)
-        exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64)
+        exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64)
         tm.assert_series_equal(result, exp)
 
         # not vectorized
@@ -151,12 +151,12 @@ def f(x):
             return str(x.tz)
 
         result = s.map(f)
-        exp = pd.Series(["Asia/Tokyo"] * 25, name="XX")
+        exp = Series(["Asia/Tokyo"] * 25, name="XX")
         tm.assert_series_equal(result, exp)
 
     def test_apply_dict_depr(self):
 
-        tsdf = pd.DataFrame(
+        tsdf = DataFrame(
             np.random.randn(10, 3),
             columns=["A", "B", "C"],
             index=pd.date_range("1/1/2000", periods=10),
@@ -167,34 +167,34 @@ def test_apply_dict_depr(self):
 
     def test_apply_categorical(self):
         values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
-        ser = pd.Series(values, name="XX", index=list("abcdefg"))
+        ser = Series(values, name="XX", index=list("abcdefg"))
         result = ser.apply(lambda x: x.lower())
 
         # should be categorical dtype when the number of categories are
         # the same
         values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
-        exp = pd.Series(values, name="XX", index=list("abcdefg"))
+        exp = Series(values, name="XX", index=list("abcdefg"))
         tm.assert_series_equal(result, exp)
         tm.assert_categorical_equal(result.values, exp.values)
 
         result = ser.apply(lambda x: "A")
-        exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg"))
+        exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
         tm.assert_series_equal(result, exp)
         assert result.dtype == object
 
     @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]])
     def test_apply_categorical_with_nan_values(self, series):
         # GH 20714 bug fixed in: GH 24275
-        s = pd.Series(series, dtype="category")
+        s = Series(series, dtype="category")
         result = s.apply(lambda x: x.split("-")[0])
         result = result.astype(object)
-        expected = pd.Series(["1", "1", np.NaN], dtype="category")
+        expected = Series(["1", "1", np.NaN], dtype="category")
         expected = expected.astype(object)
         tm.assert_series_equal(result, expected)
 
     def test_apply_empty_integer_series_with_datetime_index(self):
         # GH 21245
-        s = pd.Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int)
+        s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int)
         result = s.apply(lambda x: x)
         tm.assert_series_equal(result, s)
@@ -447,9 +447,9 @@ def test_agg_cython_table_raises(self, series, func, expected):
 
     def test_series_apply_no_suffix_index(self):
         # GH36189
-        s = pd.Series([4] * 3)
+        s = Series([4] * 3)
         result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
-        expected = pd.Series([12, 12, 12], index=["sum", "", ""])
+        expected = Series([12, 12, 12], index=["sum", "", ""])
 
         tm.assert_series_equal(result, expected)
@@ -517,7 +517,7 @@ def test_map_empty(self, index):
         s = Series(index)
         result = s.map({})
 
-        expected = pd.Series(np.nan, index=s.index)
+        expected = Series(np.nan, index=s.index)
         tm.assert_series_equal(result, expected)
 
     def test_map_compat(self):
@@ -566,11 +566,11 @@ def test_map_dict_with_tuple_keys(self):
         from being mapped properly.
         """
         # GH 18496
-        df = pd.DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]})
+        df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]})
         label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"}
 
         df["labels"] = df["a"].map(label_mappings)
-        df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index)
+        df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index)
         # All labels should be filled now
         tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False)
@@ -651,53 +651,53 @@ def __missing__(self, key):
 
     def test_map_box(self):
         vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "datetime64[ns]"
         # boxed value must be Timestamp instance
         res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
-        exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"])
+        exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
         tm.assert_series_equal(res, exp)
 
         vals = [
             pd.Timestamp("2011-01-01", tz="US/Eastern"),
             pd.Timestamp("2011-01-02", tz="US/Eastern"),
         ]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "datetime64[ns, US/Eastern]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
-        exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
+        exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
         tm.assert_series_equal(res, exp)
 
         # timedelta
         vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "timedelta64[ns]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.days}")
-        exp = pd.Series(["Timedelta_1", "Timedelta_2"])
+        exp = Series(["Timedelta_1", "Timedelta_2"])
         tm.assert_series_equal(res, exp)
 
         # period
         vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
-        s = pd.Series(vals)
+        s = Series(vals)
         assert s.dtype == "Period[M]"
         res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}")
-        exp = pd.Series(["Period_M", "Period_M"])
+        exp = Series(["Period_M", "Period_M"])
         tm.assert_series_equal(res, exp)
 
     def test_map_categorical(self):
         values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
-        s = pd.Series(values, name="XX", index=list("abcdefg"))
+        s = Series(values, name="XX", index=list("abcdefg"))
 
         result = s.map(lambda x: x.lower())
         exp_values = pd.Categorical(
             list("abbabcd"), categories=list("dcba"), ordered=True
         )
-        exp = pd.Series(exp_values, name="XX", index=list("abcdefg"))
+        exp = Series(exp_values, name="XX", index=list("abcdefg"))
         tm.assert_series_equal(result, exp)
         tm.assert_categorical_equal(result.values, exp_values)
 
         result = s.map(lambda x: "A")
-        exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg"))
+        exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
         tm.assert_series_equal(result, exp)
         assert result.dtype == object
@@ -708,20 +708,20 @@ def test_map_datetimetz(self):
         values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
             "Asia/Tokyo"
         )
-        s = pd.Series(values, name="XX")
+        s = Series(values, name="XX")
 
         # keep tz
         result = s.map(lambda x: x + pd.offsets.Day())
         exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize(
             "Asia/Tokyo"
         )
-        exp = pd.Series(exp_values, name="XX")
+        exp = Series(exp_values, name="XX")
         tm.assert_series_equal(result, exp)
 
         # change dtype
         # GH 14506 : Returned dtype changed from int32 to int64
         result = s.map(lambda x: x.hour)
-        exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64)
+        exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64)
         tm.assert_series_equal(result, exp)
 
         with pytest.raises(NotImplementedError):
@@ -734,7 +734,7 @@ def f(x):
             return str(x.tz)
 
         result = s.map(f)
-        exp = pd.Series(["Asia/Tokyo"] * 25, name="XX")
+        exp = Series(["Asia/Tokyo"] * 25, name="XX")
         tm.assert_series_equal(result, exp)
 
     @pytest.mark.parametrize(
@@ -747,10 +747,10 @@ def f(x):
     )
     def test_map_missing_mixed(self, vals, mapping, exp):
         # GH20495
-        s = pd.Series(vals + [np.nan])
+        s = Series(vals + [np.nan])
         result = s.map(mapping)
 
-        tm.assert_series_equal(result, pd.Series(exp))
+        tm.assert_series_equal(result, Series(exp))
 
     @pytest.mark.parametrize(
         "dti,exp",
@@ -769,26 +769,26 @@ def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
         # GH 25959
        # Calling apply on a localized time series should not cause an error
         index = dti.tz_localize("UTC").index
-        result = pd.Series(index).apply(lambda x: pd.Series([1, 2]))
+        result = Series(index).apply(lambda x: Series([1, 2]))
         tm.assert_frame_equal(result, exp)
 
     def test_apply_scaler_on_date_time_index_aware_series(self):
         # GH 25959
         # Calling apply on a localized time series should not cause an error
         series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
-        result = pd.Series(series.index).apply(lambda x: 1)
-        tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64"))
+        result = Series(series.index).apply(lambda x: 1)
+        tm.assert_series_equal(result, Series(np.ones(30), dtype="int64"))
 
     def test_map_float_to_string_precision(self):
         # GH 13228
-        ser = pd.Series(1 / 3)
+        ser = Series(1 / 3)
         result = ser.map(lambda val: str(val)).to_dict()
         expected = {0: "0.3333333333333333"}
         assert result == expected
 
     def test_map_with_invalid_na_action_raises(self):
         # https://github.com/pandas-dev/pandas/issues/32815
-        s = pd.Series([1, 2, 3])
+        s = Series([1, 2, 3])
         msg = "na_action must either be 'ignore' or None"
         with pytest.raises(ValueError, match=msg):
             s.map(lambda x: x, na_action="____")
diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py
index 67b271f757cfb..ada27289e795d 100644
--- a/pandas/tests/series/apply/test_series_transform.py
+++ b/pandas/tests/series/apply/test_series_transform.py
@@ -18,7 +18,7 @@ def test_transform_ufunc(string_series):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("op", transformation_kernels)
+@pytest.mark.parametrize("op", sorted(transformation_kernels))
 def test_transform_groupby_kernel(string_series, op):
     # GH 35964
     if op == "cumcount":
@@ -144,7 +144,7 @@ def test_transform_reducer_raises(all_reductions):
 
 # mypy doesn't allow adding lists of different types
 # https://github.com/python/mypy/issues/5492
-@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1])
+@pytest.mark.parametrize("op", [*sorted(transformation_kernels), lambda x: x + 1])
 def test_transform_bad_dtype(op):
     # GH 35964
     s = Series(3 * [object])  # Series that will fail on most transforms
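The two `sorted(transformation_kernels)` changes above exist because `transformation_kernels` is an unordered collection: iterating it directly yields nondeterministic parametrize IDs and collection order. A minimal standalone sketch of the idea, using a made-up kernel set rather than the real pandas one:

import pytest

# Hypothetical stand-in for pandas' transformation_kernels, which is
# set-like; iterating a set gives no guaranteed order, so generated
# test IDs can differ from run to run or between worker processes.
KERNELS = {"cumsum", "cumprod", "shift", "rank"}

# sorted() pins the collection order, keeping test IDs stable
# (important, e.g., for runners that require consistent collection).
@pytest.mark.parametrize("op", sorted(KERNELS))
def test_kernel_name_is_string(op):
    assert isinstance(op, str)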
diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py
deleted file mode 100644
index 3f88f4193e770..0000000000000
--- a/pandas/tests/series/indexing/test_boolean.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas import Index, Series, date_range
-import pandas._testing as tm
-from pandas.core.indexing import IndexingError
-
-from pandas.tseries.offsets import BDay
-
-
-def test_getitem_boolean(string_series):
-    s = string_series
-    mask = s > s.median()
-
-    # passing list is OK
-    result = s[list(mask)]
-    expected = s[mask]
-    tm.assert_series_equal(result, expected)
-    tm.assert_index_equal(result.index, s.index[mask])
-
-
-def test_getitem_boolean_empty():
-    s = Series([], dtype=np.int64)
-    s.index.name = "index_name"
-    s = s[s.isna()]
-    assert s.index.name == "index_name"
-    assert s.dtype == np.int64
-
-    # GH5877
-    # indexing with empty series
-    s = Series(["A", "B"])
-    expected = Series(dtype=object, index=Index([], dtype="int64"))
-    result = s[Series([], dtype=object)]
-    tm.assert_series_equal(result, expected)
-
-    # invalid because of the boolean indexer
-    # that's empty or not-aligned
-    msg = (
-        r"Unalignable boolean Series provided as indexer \(index of "
-        r"the boolean Series and of the indexed object do not match"
-    )
-    with pytest.raises(IndexingError, match=msg):
-        s[Series([], dtype=bool)]
-
-    with pytest.raises(IndexingError, match=msg):
-        s[Series([True], dtype=bool)]
-
-
-def test_getitem_boolean_object(string_series):
-    # using column from DataFrame
-
-    s = string_series
-    mask = s > s.median()
-    omask = mask.astype(object)
-
-    # getitem
-    result = s[omask]
-    expected = s[mask]
-    tm.assert_series_equal(result, expected)
-
-    # setitem
-    s2 = s.copy()
-    cop = s.copy()
-    cop[omask] = 5
-    s2[mask] = 5
-    tm.assert_series_equal(cop, s2)
-
-    # nans raise exception
-    omask[5:10] = np.nan
-    msg = "Cannot mask with non-boolean array containing NA / NaN values"
-    with pytest.raises(ValueError, match=msg):
-        s[omask]
-    with pytest.raises(ValueError, match=msg):
-        s[omask] = 5
-
-
-def test_getitem_setitem_boolean_corner(datetime_series):
-    ts = datetime_series
-    mask_shifted = ts.shift(1, freq=BDay()) > ts.median()
-
-    # these used to raise...??
-
-    msg = (
-        r"Unalignable boolean Series provided as indexer \(index of "
-        r"the boolean Series and of the indexed object do not match"
-    )
-    with pytest.raises(IndexingError, match=msg):
-        ts[mask_shifted]
-    with pytest.raises(IndexingError, match=msg):
-        ts[mask_shifted] = 1
-
-    with pytest.raises(IndexingError, match=msg):
-        ts.loc[mask_shifted]
-    with pytest.raises(IndexingError, match=msg):
-        ts.loc[mask_shifted] = 1
-
-
-def test_setitem_boolean(string_series):
-    mask = string_series > string_series.median()
-
-    # similar indexed series
-    result = string_series.copy()
-    result[mask] = string_series * 2
-    expected = string_series * 2
-    tm.assert_series_equal(result[mask], expected[mask])
-
-    # needs alignment
-    result = string_series.copy()
-    result[mask] = (string_series * 2)[0:5]
-    expected = (string_series * 2)[0:5].reindex_like(string_series)
-    expected[-mask] = string_series[mask]
-    tm.assert_series_equal(result[mask], expected[mask])
-
-
-def test_get_set_boolean_different_order(string_series):
-    ordered = string_series.sort_values()
-
-    # setting
-    copy = string_series.copy()
-    copy[ordered > 0] = 0
-
-    expected = string_series.copy()
-    expected[expected > 0] = 0
-
-    tm.assert_series_equal(copy, expected)
-
-    # getting
-    sel = string_series[ordered > 0]
-    exp = string_series[string_series > 0]
-    tm.assert_series_equal(sel, exp)
-
-
-def test_getitem_boolean_dt64_copies():
-    # GH#36210
-    dti = date_range("2016-01-01", periods=4, tz="US/Pacific")
-    key = np.array([True, True, False, False])
-
-    ser = Series(dti._data)
-
-    res = ser[key]
-    assert res._values._data.base is None
-
-    # compare with numeric case for reference
-    ser2 = Series(range(4))
-    res2 = ser2[key]
-    assert res2._values.base is None
diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py
deleted file mode 100644
index fe575cf146641..0000000000000
--- a/pandas/tests/series/indexing/test_callable.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import pandas as pd
-import pandas._testing as tm
-
-
-def test_getitem_callable():
-    # GH 12533
-    s = pd.Series(4, index=list("ABCD"))
-    result = s[lambda x: "A"]
-    assert result == s.loc["A"]
-
-    result = s[lambda x: ["A", "B"]]
-    tm.assert_series_equal(result, s.loc[["A", "B"]])
-
-    result = s[lambda x: [True, False, True, True]]
-    tm.assert_series_equal(result, s.iloc[[0, 2, 3]])
-
-
-def test_setitem_callable():
-    # GH 12533
-    s = pd.Series([1, 2, 3, 4], index=list("ABCD"))
-    s[lambda x: "A"] = -1
-    tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list("ABCD")))
-
-
-def test_setitem_other_callable():
-    # GH 13299
-    inc = lambda x: x + 1
-
-    s = pd.Series([1, 2, -1, 4])
-    s[s < 0] = inc
-
-    expected = pd.Series([1, 2, inc, 4])
-    tm.assert_series_equal(s, expected)
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index 1801d13e75565..6aee8b3ab34fa 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -65,21 +65,6 @@ def test_dti_reset_index_round_trip():
     assert df.reset_index()["Date"][0] == stamp
 
 
-def test_series_set_value():
-    # #1561
-
-    dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)]
-    index = DatetimeIndex(dates)
-
-    s = Series(dtype=object)
-    s._set_value(dates[0], 1.0)
-    s._set_value(dates[1], np.nan)
-
-    expected = Series([1.0, np.nan], index=index)
-
-    tm.assert_series_equal(s, expected)
-
-
 @pytest.mark.slow
 def test_slice_locs_indexerror():
     times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)]
@@ -109,7 +94,7 @@ def test_slicing_datetimes():
     tm.assert_frame_equal(result, expected)
 
     # duplicates
-    df = pd.DataFrame(
+    df = DataFrame(
         np.arange(5.0, dtype="float64"),
         index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]],
     )
@@ -552,7 +537,7 @@ def test_indexing_over_size_cutoff_period_index(monkeypatch):
     idx = pd.period_range("1/1/2000", freq="T", periods=n)
     assert idx._engine.over_size_threshold
 
-    s = pd.Series(np.random.randn(len(idx)), index=idx)
+    s = Series(np.random.randn(len(idx)), index=idx)
 
     pos = n - 1
     timestamp = idx[pos]
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
index 5b585e8802752..9f6aab823c3ad 100644
--- a/pandas/tests/series/indexing/test_getitem.py
+++ b/pandas/tests/series/indexing/test_getitem.py
@@ -9,11 +9,35 @@
 from pandas._libs.tslibs import conversion, timezones
 
 import pandas as pd
-from pandas import Series, Timestamp, date_range, period_range
+from pandas import Index, Series, Timestamp, date_range, period_range
 import pandas._testing as tm
+from pandas.core.indexing import IndexingError
+
+from pandas.tseries.offsets import BDay
 
 
 class TestSeriesGetitemScalars:
+    def test_getitem_keyerror_with_int64index(self):
+        ser = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2])
+
+        with pytest.raises(KeyError, match=r"^5$"):
+            ser[5]
+
+        with pytest.raises(KeyError, match=r"^'c'$"):
+            ser["c"]
+
+        # not monotonic
+        ser = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1])
+
+        with pytest.raises(KeyError, match=r"^5$"):
+            ser[5]
+
+        with pytest.raises(KeyError, match=r"^'c'$"):
+            ser["c"]
+
+    def test_getitem_int64(self, datetime_series):
+        idx = np.int64(5)
+        assert datetime_series[idx] == datetime_series[5]
+
     # TODO: better name/GH ref?
     def test_getitem_regression(self):
@@ -90,7 +114,7 @@ def test_getitem_intlist_intindex_periodvalues(self):
         ser = Series(period_range("2000-01-01", periods=10, freq="D"))
 
         result = ser[[2, 4]]
-        exp = pd.Series(
+        exp = Series(
             [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")],
             index=[2, 4],
             dtype="Period[D]",
@@ -124,6 +148,123 @@ def test_getitem_intlist_multiindex_numeric_level(self, dtype, box):
             ser[key]
 
 
+class TestGetitemBooleanMask:
+    def test_getitem_boolean(self, string_series):
+        ser = string_series
+        mask = ser > ser.median()
+
+        # passing list is OK
+        result = ser[list(mask)]
+        expected = ser[mask]
+        tm.assert_series_equal(result, expected)
+        tm.assert_index_equal(result.index, ser.index[mask])
+
+    def test_getitem_boolean_empty(self):
+        ser = Series([], dtype=np.int64)
+        ser.index.name = "index_name"
+        ser = ser[ser.isna()]
+        assert ser.index.name == "index_name"
+        assert ser.dtype == np.int64
+
+        # GH#5877
+        # indexing with empty series
+        ser = Series(["A", "B"])
+        expected = Series(dtype=object, index=Index([], dtype="int64"))
+        result = ser[Series([], dtype=object)]
+        tm.assert_series_equal(result, expected)
+
+        # invalid because of the boolean indexer
+        # that's empty or not-aligned
+        msg = (
+            r"Unalignable boolean Series provided as indexer \(index of "
+            r"the boolean Series and of the indexed object do not match"
+        )
+        with pytest.raises(IndexingError, match=msg):
+            ser[Series([], dtype=bool)]
+
+        with pytest.raises(IndexingError, match=msg):
+            ser[Series([True], dtype=bool)]
+
+    def test_getitem_boolean_object(self, string_series):
+        # using column from DataFrame
+
+        ser = string_series
+        mask = ser > ser.median()
+        omask = mask.astype(object)
+
+        # getitem
+        result = ser[omask]
+        expected = ser[mask]
+        tm.assert_series_equal(result, expected)
+
+        # setitem
+        s2 = ser.copy()
+        cop = ser.copy()
+        cop[omask] = 5
+        s2[mask] = 5
+        tm.assert_series_equal(cop, s2)
+
+        # nans raise exception
+        omask[5:10] = np.nan
+        msg = "Cannot mask with non-boolean array containing NA / NaN values"
+        with pytest.raises(ValueError, match=msg):
+            ser[omask]
+        with pytest.raises(ValueError, match=msg):
+            ser[omask] = 5
+
+    def test_getitem_boolean_dt64_copies(self):
+        # GH#36210
+        dti = date_range("2016-01-01", periods=4, tz="US/Pacific")
+        key = np.array([True, True, False, False])
+
+        ser = Series(dti._data)
+
+        res = ser[key]
+        assert res._values._data.base is None
+
+        # compare with numeric case for reference
+        ser2 = Series(range(4))
+        res2 = ser2[key]
+        assert res2._values.base is None
+
+    def test_getitem_boolean_corner(self, datetime_series):
+        ts = datetime_series
+        mask_shifted = ts.shift(1, freq=BDay()) > ts.median()
+
+        msg = (
+            r"Unalignable boolean Series provided as indexer \(index of "
+            r"the boolean Series and of the indexed object do not match"
+        )
+        with pytest.raises(IndexingError, match=msg):
+            ts[mask_shifted]
+
+        with pytest.raises(IndexingError, match=msg):
+            ts.loc[mask_shifted]
+
+    def test_getitem_boolean_different_order(self, string_series):
+        ordered = string_series.sort_values()
+
+        sel = string_series[ordered > 0]
+        exp = string_series[string_series > 0]
+        tm.assert_series_equal(sel, exp)
+
+
+class TestGetitemCallable:
+    def test_getitem_callable(self):
+        # GH#12533
+        ser = Series(4, index=list("ABCD"))
+        result = ser[lambda x: "A"]
+        assert result == ser.loc["A"]
+
+        result = ser[lambda x: ["A", "B"]]
+        expected = ser.loc[["A", "B"]]
+        tm.assert_series_equal(result, expected)
+
+        result = ser[lambda x: [True, False, True, True]]
+        expected = ser.iloc[[0, 2, 3]]
+        tm.assert_series_equal(result, expected)
+
+
 def test_getitem_generator(string_series):
     gen = (x > 0 for x in string_series)
     result = string_series[gen]
@@ -134,6 +275,18 @@ def test_getitem_generator(string_series):
 
 
 def test_getitem_ndim_deprecated():
-    s = pd.Series([0, 1])
+    s = Series([0, 1])
     with tm.assert_produces_warning(FutureWarning):
         s[:, None]
+
+
+def test_getitem_multilevel_scalar_slice_not_implemented(
+    multiindex_year_month_day_dataframe_random_data,
+):
+    # not implementing this for now
+    df = multiindex_year_month_day_dataframe_random_data
+    ser = df["A"]
+
+    msg = r"\(2000, slice\(3, 4, None\)\)"
+    with pytest.raises(TypeError, match=msg):
+        ser[2000, 3:4]
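The boolean-mask tests consolidated into TestGetitemBooleanMask above all hinge on one rule: a plain list or ndarray mask is applied positionally, while a boolean Series indexer is aligned on its index first and raises IndexingError when it cannot be aligned. A standalone illustration of that contrast (not part of the patch):

import pandas as pd
from pandas.core.indexing import IndexingError

ser = pd.Series([10, 20, 30], index=["a", "b", "c"])

# A plain list mask is applied positionally; only the length must match.
print(ser[[True, False, True]])  # keeps "a" and "c"

# A boolean Series is aligned on its index before masking, so an
# unalignable index raises IndexingError instead of masking silently.
bad_mask = pd.Series([True, False], index=["a", "x"])
try:
    ser[bad_mask]
except IndexingError as err:
    print(type(err).__name__)  # IndexingError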
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index fbdac2bb2d8e8..8c53ed85a20b3 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -132,7 +132,7 @@ def test_getitem_fancy(string_series, object_series):
 
 def test_type_promotion():
     # GH12599
-    s = pd.Series(dtype=object)
+    s = Series(dtype=object)
     s["a"] = pd.Timestamp("2016-01-01")
     s["b"] = 3.0
     s["c"] = "foo"
@@ -144,14 +144,14 @@
     "result_1, duplicate_item, expected_1",
     [
         [
-            pd.Series({1: 12, 2: [1, 2, 2, 3]}),
-            pd.Series({1: 313}),
-            pd.Series({1: 12}, dtype=object),
+            Series({1: 12, 2: [1, 2, 2, 3]}),
+            Series({1: 313}),
+            Series({1: 12}, dtype=object),
         ],
         [
-            pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}),
-            pd.Series({1: [1, 2, 3]}),
-            pd.Series({1: [1, 2, 3]}),
+            Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}),
+            Series({1: [1, 2, 3]}),
+            Series({1: [1, 2, 3]}),
         ],
     ],
 )
@@ -205,7 +205,7 @@ def test_series_box_timestamp():
 
 def test_series_box_timedelta():
     rng = pd.timedelta_range("1 day 1 s", periods=5, freq="h")
-    ser = pd.Series(rng)
+    ser = Series(rng)
     assert isinstance(ser[0], Timedelta)
     assert isinstance(ser.at[1], Timedelta)
     assert isinstance(ser.iat[2], Timedelta)
@@ -262,8 +262,8 @@ def test_setitem_ambiguous_keyerror():
 
 def test_getitem_dataframe():
     rng = list(range(10))
-    s = pd.Series(10, index=rng)
-    df = pd.DataFrame(rng, index=rng)
+    s = Series(10, index=rng)
+    df = DataFrame(rng, index=rng)
     msg = (
         "Indexing a Series with DataFrame is not supported, "
         "use the appropriate DataFrame column"
@@ -299,15 +299,15 @@ def test_setitem(datetime_series, string_series):
 def test_setitem_empty_series():
     # Test for issue #10193
     key = pd.Timestamp("2012-01-01")
-    series = pd.Series(dtype=object)
+    series = Series(dtype=object)
 
     series[key] = 47
-    expected = pd.Series(47, [key])
+    expected = Series(47, [key])
     tm.assert_series_equal(series, expected)
 
     # GH#33573 our index should retain its freq
-    series = pd.Series([], pd.DatetimeIndex([], freq="D"), dtype=object)
+    series = Series([], pd.DatetimeIndex([], freq="D"), dtype=object)
     series[key] = 47
-    expected = pd.Series(47, pd.DatetimeIndex([key], freq="D"))
+    expected = Series(47, pd.DatetimeIndex([key], freq="D"))
     tm.assert_series_equal(series, expected)
     assert series.index.freq == expected.index.freq
@@ -365,7 +365,7 @@ def test_setslice(datetime_series):
 
 def test_2d_to_1d_assignment_raises():
     x = np.random.randn(2, 2)
-    y = pd.Series(range(2))
+    y = Series(range(2))
 
     msg = "|".join(
         [
@@ -409,13 +409,13 @@ def test_basic_getitem_setitem_corner(datetime_series):
 
 @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"])
 def test_setitem_with_tz(tz):
-    orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz))
+    orig = Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz))
     assert orig.dtype == f"datetime64[ns, {tz}]"
 
     # scalar
     s = orig.copy()
     s[1] = pd.Timestamp("2011-01-01", tz=tz)
-    exp = pd.Series(
+    exp = Series(
         [
             pd.Timestamp("2016-01-01 00:00", tz=tz),
             pd.Timestamp("2011-01-01 00:00", tz=tz),
@@ -433,14 +433,14 @@ def test_setitem_with_tz(tz):
     tm.assert_series_equal(s, exp)
 
     # vector
-    vals = pd.Series(
+    vals = Series(
         [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)],
         index=[1, 2],
     )
     assert vals.dtype == f"datetime64[ns, {tz}]"
 
     s[[1, 2]] = vals
-    exp = pd.Series(
+    exp = Series(
         [
             pd.Timestamp("2016-01-01 00:00", tz=tz),
             pd.Timestamp("2011-01-01 00:00", tz=tz),
@@ -461,13 +461,13 @@ def test_setitem_with_tz(tz):
 def test_setitem_with_tz_dst():
     # GH XXX TODO: fill in GH ref
     tz = "US/Eastern"
-    orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz))
+    orig = Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz))
     assert orig.dtype == f"datetime64[ns, {tz}]"
 
     # scalar
     s = orig.copy()
     s[1] = pd.Timestamp("2011-01-01", tz=tz)
-    exp = pd.Series(
+    exp = Series(
         [
             pd.Timestamp("2016-11-06 00:00-04:00", tz=tz),
             pd.Timestamp("2011-01-01 00:00-05:00", tz=tz),
@@ -485,14 +485,14 @@ def test_setitem_with_tz_dst():
     tm.assert_series_equal(s, exp)
 
     # vector
-    vals = pd.Series(
+    vals = Series(
         [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)],
         index=[1, 2],
     )
     assert vals.dtype == f"datetime64[ns, {tz}]"
 
     s[[1, 2]] = vals
-    exp = pd.Series(
+    exp = Series(
         [
             pd.Timestamp("2016-11-06 00:00", tz=tz),
             pd.Timestamp("2011-01-01 00:00", tz=tz),
@@ -547,7 +547,7 @@ def test_categorical_assigning_ops():
 
 def test_getitem_categorical_str():
     # GH#31765
-    ser = pd.Series(range(5), index=pd.Categorical(["a", "b", "c", "a", "b"]))
+    ser = Series(range(5), index=pd.Categorical(["a", "b", "c", "a", "b"]))
     result = ser["a"]
     expected = ser.iloc[[0, 3]]
     tm.assert_series_equal(result, expected)
@@ -646,7 +646,7 @@ def test_timedelta_assignment():
     # GH 14155
     s = Series(10 * [np.timedelta64(10, "m")])
     s.loc[[1, 2, 3]] = np.timedelta64(20, "m")
-    expected = pd.Series(10 * [np.timedelta64(10, "m")])
+    expected = Series(10 * [np.timedelta64(10, "m")])
     expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, "m"))
     tm.assert_series_equal(s, expected)
@@ -665,8 +665,8 @@ def test_dt64_series_assign_nat(nat_val, should_cast, tz):
     # into a datetime64 series. Others should coerce to object
     # and retain their dtypes.
     dti = pd.date_range("2016-01-01", periods=3, tz=tz)
-    base = pd.Series(dti)
-    expected = pd.Series([pd.NaT] + list(dti[1:]), dtype=dti.dtype)
+    base = Series(dti)
+    expected = Series([pd.NaT] + list(dti[1:]), dtype=dti.dtype)
     if not should_cast:
         expected = expected.astype(object)
@@ -695,8 +695,8 @@ def test_td64_series_assign_nat(nat_val, should_cast):
     # some nat-like values should be cast to timedelta64 when inserting
     # into a timedelta64 series. Others should coerce to object
     # and retain their dtypes.
-    base = pd.Series([0, 1, 2], dtype="m8[ns]")
-    expected = pd.Series([pd.NaT, 1, 2], dtype="m8[ns]")
+    base = Series([0, 1, 2], dtype="m8[ns]")
+    expected = Series([pd.NaT, 1, 2], dtype="m8[ns]")
     if not should_cast:
         expected = expected.astype(object)
@@ -723,14 +723,14 @@ def test_td64_series_assign_nat(nat_val, should_cast):
 )
 def test_append_timedelta_does_not_cast(td):
     # GH#22717 inserting a Timedelta should _not_ cast to int64
-    expected = pd.Series(["x", td], index=[0, "td"], dtype=object)
+    expected = Series(["x", td], index=[0, "td"], dtype=object)
 
-    ser = pd.Series(["x"])
+    ser = Series(["x"])
     ser["td"] = td
     tm.assert_series_equal(ser, expected)
     assert isinstance(ser["td"], pd.Timedelta)
 
-    ser = pd.Series(["x"])
+    ser = Series(["x"])
     ser.loc["td"] = pd.Timedelta("9 days")
     tm.assert_series_equal(ser, expected)
     assert isinstance(ser["td"], pd.Timedelta)
@@ -771,7 +771,7 @@ def test_underlying_data_conversion():
     # GH 3217
     df = DataFrame(dict(a=[1, 3], b=[np.nan, 2]))
     df["c"] = np.nan
-    df["c"].update(pd.Series(["foo"], index=[0]))
+    df["c"].update(Series(["foo"], index=[0]))
 
     expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan]))
     tm.assert_frame_equal(df, expected)
@@ -878,9 +878,9 @@ def test_pop():
 def test_uint_drop(any_int_dtype):
     # see GH18311
     # assigning series.loc[0] = 4 changed series.dtype to int
-    series = pd.Series([1, 2, 3], dtype=any_int_dtype)
+    series = Series([1, 2, 3], dtype=any_int_dtype)
     series.loc[0] = 4
-    expected = pd.Series([4, 2, 3], dtype=any_int_dtype)
+    expected = Series([4, 2, 3], dtype=any_int_dtype)
     tm.assert_series_equal(series, expected)
@@ -888,7 +888,7 @@ def test_getitem_unrecognized_scalar():
     # GH#32684 a scalar key that is not recognized by lib.is_scalar
 
     # a series that might be produced via `frame.dtypes`
-    ser = pd.Series([1, 2], index=[np.dtype("O"), np.dtype("i8")])
+    ser = Series([1, 2], index=[np.dtype("O"), np.dtype("i8")])
 
     key = ser.index[1]
@@ -949,7 +949,7 @@ def assert_slices_equivalent(l_slc, i_slc):
 
 def test_tuple_index():
     # GH 35534 - Selecting values when a Series has an Index of tuples
-    s = pd.Series([1, 2], index=[("a",), ("b",)])
+    s = Series([1, 2], index=[("a",), ("b",)])
     assert s[("a",)] == 1
     assert s[("b",)] == 2
     s[("b",)] = 3
@@ -959,7 +959,7 @@ def test_tuple_index():
 def test_frozenset_index():
     # GH35747 - Selecting values when a Series has an Index of frozenset
     idx0, idx1 = frozenset("a"), frozenset("b")
-    s = pd.Series([1, 2], index=[idx0, idx1])
+    s = Series([1, 2], index=[idx0, idx1])
     assert s[idx0] == 1
     assert s[idx1] == 2
     s[idx1] = 3
diff --git a/pandas/tests/series/indexing/test_multiindex.py b/pandas/tests/series/indexing/test_multiindex.py
deleted file mode 100644
index 0420d76b5e8a8..0000000000000
--- a/pandas/tests/series/indexing/test_multiindex.py
+++ /dev/null
@@ -1,51 +0,0 @@
-""" test get/set & misc """
-
-import pytest
-
-import pandas as pd
-from pandas import MultiIndex, Series
-import pandas._testing as tm
-
-
-def test_access_none_value_in_multiindex():
-    # GH34318: test that you can access a None value using .loc through a Multiindex
-
-    s = Series([None], pd.MultiIndex.from_arrays([["Level1"], ["Level2"]]))
-    result = s.loc[("Level1", "Level2")]
-    assert result is None
-
-    midx = MultiIndex.from_product([["Level1"], ["Level2_a", "Level2_b"]])
-    s = Series([None] * len(midx), dtype=object, index=midx)
-    result = s.loc[("Level1", "Level2_a")]
-    assert result is None
-
-    s = Series([1] * len(midx), dtype=object, index=midx)
-    result = s.loc[("Level1", "Level2_a")]
-    assert result == 1
-
-
-@pytest.mark.parametrize(
-    "ix_data, exp_data",
-    [
-        (
-            [(pd.NaT, 1), (pd.NaT, 2)],
-            {"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]},
-        ),
-        (
-            [(pd.NaT, 1), (pd.Timestamp("2020-01-01"), 2)],
-            {"a": [pd.NaT, pd.Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]},
-        ),
-        (
-            [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)],
-            {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]},
-        ),
-    ],
-)
-def test_nat_multi_index(ix_data, exp_data):
-    # GH36541: that reset_index() does not raise ValueError
-    ix = pd.MultiIndex.from_tuples(ix_data, names=["a", "b"])
-    result = pd.DataFrame({"x": [11, 12]}, index=ix)
-    result = result.reset_index()
-
-    expected = pd.DataFrame(exp_data)
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py
index b5bef46e95ec2..f35f1375732cb 100644
--- a/pandas/tests/series/indexing/test_numeric.py
+++ b/pandas/tests/series/indexing/test_numeric.py
@@ -110,27 +110,3 @@ def test_slice_floats2():
         s.index = i
         assert len(s.loc[12.0:]) == 8
         assert len(s.loc[12.5:]) == 7
-
-
-def test_int_indexing():
-    s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2])
-
-    with pytest.raises(KeyError, match=r"^5$"):
-        s[5]
-
-    with pytest.raises(KeyError, match=r"^'c'$"):
-        s["c"]
-
-    # not monotonic
-    s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1])
-
-    with pytest.raises(KeyError, match=r"^5$"):
-        s[5]
-
-    with pytest.raises(KeyError, match=r"^'c'$"):
-        s["c"]
-
-
-def test_getitem_int64(datetime_series):
-    idx = np.int64(5)
-    assert datetime_series[idx] == datetime_series[5]
diff --git a/pandas/tests/series/indexing/test_set_value.py b/pandas/tests/series/indexing/test_set_value.py
new file mode 100644
index 0000000000000..ea09646de71c1
--- /dev/null
+++ b/pandas/tests/series/indexing/test_set_value.py
@@ -0,0 +1,21 @@
+from datetime import datetime
+
+import numpy as np
+
+from pandas import DatetimeIndex, Series
+import pandas._testing as tm
+
+
+def test_series_set_value():
+    # GH#1561
+
+    dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)]
+    index = DatetimeIndex(dates)
+
+    s = Series(dtype=object)
+    s._set_value(dates[0], 1.0)
+    s._set_value(dates[1], np.nan)
+
+    expected = Series([1.0, np.nan], index=index)
+
+    tm.assert_series_equal(s, expected)
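The new test_set_value.py above exercises the private `Series._set_value`; in user code the equivalent label-based scalar assignment goes through the public `.at` accessor, which also enlarges the Series when the label is new. A small standalone sketch of that public-API counterpart (an illustration, not the test's own code):

import pandas as pd

ser = pd.Series(dtype=object)

# .at is the public label-based scalar accessor; assigning through it
# with a label that is not present enlarges the Series, matching what
# the relocated test does with the private helper.
ser.at["2001-01-01"] = 1.0
ser.at["2001-01-02"] = float("nan")

print(ser)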
Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), + Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), + ], + dtype=object, + ) + tm.assert_series_equal(ser, expected) + + +class TestSetitemPeriodDtype: + @pytest.mark.parametrize("na_val", [None, np.nan]) + def test_setitem_na_period_dtype_casts_to_nat(self, na_val): + ser = Series(period_range("2000-01-01", periods=10, freq="D")) + + ser[3] = na_val + assert ser[3] is NaT + + ser[3:5] = na_val + assert ser[4] is NaT + + +class TestSetitemBooleanMask: + def test_setitem_boolean(self, string_series): + mask = string_series > string_series.median() + + # similar indexed series + result = string_series.copy() + result[mask] = string_series * 2 + expected = string_series * 2 + tm.assert_series_equal(result[mask], expected[mask]) + + # needs alignment + result = string_series.copy() + result[mask] = (string_series * 2)[0:5] + expected = (string_series * 2)[0:5].reindex_like(string_series) + expected[-mask] = string_series[mask] + tm.assert_series_equal(result[mask], expected[mask]) + + def test_setitem_boolean_corner(self, datetime_series): + ts = datetime_series + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() + + msg = ( + r"Unalignable boolean Series provided as indexer \(index of " + r"the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + ts[mask_shifted] = 1 + + with pytest.raises(IndexingError, match=msg): + ts.loc[mask_shifted] = 1 + + def test_setitem_boolean_different_order(self, string_series): + ordered = string_series.sort_values() + + copy = string_series.copy() + copy[ordered > 0] = 0 + + expected = string_series.copy() + expected[expected > 0] = 0 + + tm.assert_series_equal(copy, expected) + + +class TestSetitemViewCopySemantics: + def test_setitem_invalidates_datetime_index_freq(self): + # GH#24096 altering a datetime64tz Series inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex + + dti = date_range("20130101", periods=3, tz="US/Eastern") + ts = dti[1] + ser = Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert dti.freq == "D" + ser.iloc[1] = NaT + assert ser._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert dti[1] == ts + assert dti.freq == "D" + + def test_dt64tz_setitem_does_not_mutate_dti(self): + # GH#21907, GH#24096 + dti = date_range("2016-01-01", periods=10, tz="US/Pacific") + ts = dti[0] + ser = Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert ser._mgr.blocks[0].values is not dti + assert ser._mgr.blocks[0].values._data.base is not dti._data._data.base + + ser[::3] = NaT + assert ser[0] is NaT + assert dti[0] == ts + + +class TestSetitemCallable: + def test_setitem_callable_key(self): + # GH#12533 + ser = Series([1, 2, 3, 4], index=list("ABCD")) + ser[lambda x: "A"] = -1 + + expected = Series([-1, 2, 3, 4], index=list("ABCD")) + tm.assert_series_equal(ser, expected) + + def test_setitem_callable_other(self): + # GH#13299 + inc = lambda x: x + 1 + + ser = Series([1, 2, -1, 4]) + ser[ser < 0] = inc + + expected = Series([1, 2, inc, 4]) + tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index c4a2cb90f7090..404136bdfa2db 100644 --- a/pandas/tests/series/indexing/test_where.py +++ 
b/pandas/tests/series/indexing/test_where.py @@ -349,7 +349,7 @@ def test_where_dups(): def test_where_numeric_with_string(): # GH 9280 - s = pd.Series([1, 2, 3]) + s = Series([1, 2, 3]) w = s.where(s > 1, "X") assert not is_integer(w[0]) @@ -425,15 +425,15 @@ def test_where_datetime_conversion(): def test_where_dt_tz_values(tz_naive_fixture): - ser1 = pd.Series( + ser1 = Series( pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) ) - ser2 = pd.Series( + ser2 = Series( pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) ) - mask = pd.Series([True, True, False]) + mask = Series([True, True, False]) result = ser1.where(mask, ser2) - exp = pd.Series( + exp = Series( pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) ) tm.assert_series_equal(exp, result) @@ -441,9 +441,9 @@ def test_where_dt_tz_values(tz_naive_fixture): def test_where_sparse(): # GH#17198 make sure we dont get an AttributeError for sp_index - ser = pd.Series(pd.arrays.SparseArray([1, 2])) + ser = Series(pd.arrays.SparseArray([1, 2])) result = ser.where(ser >= 2, 0) - expected = pd.Series(pd.arrays.SparseArray([0, 2])) + expected = Series(pd.arrays.SparseArray([0, 2])) tm.assert_series_equal(result, expected) @@ -452,3 +452,15 @@ def test_where_empty_series_and_empty_cond_having_non_bool_dtypes(): ser = Series([], dtype=float) result = ser.where([]) tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("klass", [Series, pd.DataFrame]) +def test_where_categorical(klass): + # https://github.com/pandas-dev/pandas/issues/18888 + exp = klass( + pd.Categorical(["A", "A", "B", "B", np.nan], categories=["A", "B", "C"]), + dtype="category", + ) + df = klass(["A", "A", "B", "B", "C"], dtype="category") + res = df.where(df != "C") + tm.assert_equal(exp, res) diff --git a/pandas/tests/series/indexing/test_xs.py b/pandas/tests/series/indexing/test_xs.py index 43458ca2ebeb2..1a23b09bde816 100644 --- a/pandas/tests/series/indexing/test_xs.py +++ b/pandas/tests/series/indexing/test_xs.py @@ -1,13 +1,14 @@ import numpy as np -import pandas as pd +from pandas import MultiIndex, Series, date_range +import pandas._testing as tm def test_xs_datetimelike_wrapping(): # GH#31630 a case where we shouldn't wrap datetime64 in Timestamp - arr = pd.date_range("2016-01-01", periods=3)._data._data + arr = date_range("2016-01-01", periods=3)._data._data - ser = pd.Series(arr, dtype=object) + ser = Series(arr, dtype=object) for i in range(len(ser)): ser.iloc[i] = arr[i] assert ser.dtype == object @@ -15,3 +16,37 @@ def test_xs_datetimelike_wrapping(): result = ser.xs(0) assert isinstance(result, np.datetime64) + + +class TestXSWithMultiIndex: + def test_xs_level_series(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + ser = df["A"] + expected = ser[:, "two"] + result = df.xs("two", level=1)["A"] + tm.assert_series_equal(result, expected) + + def test_series_getitem_multiindex_xs_by_label(self): + # GH#5684 + idx = MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] + ) + ser = Series([1, 2, 3, 4], index=idx) + return_value = ser.index.set_names(["L1", "L2"], inplace=True) + assert return_value is None + expected = Series([1, 3], index=["a", "b"]) + return_value = expected.index.set_names(["L1"], inplace=True) + assert return_value is None + + result = ser.xs("one", level="L2") + tm.assert_series_equal(result, expected) + + def test_series_getitem_multiindex_xs(xs): + # GH#6258 + dt = 
list(date_range("20130903", periods=3)) + idx = MultiIndex.from_product([list("AB"), dt]) + ser = Series([1, 3, 4, 1, 3, 4], index=idx) + expected = Series([1, 1], index=list("AB")) + + result = ser.xs("20130903", level=1) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index 974ba5d1e35a7..ef2b07d592b95 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -124,8 +124,8 @@ def test_align_multiindex(): [range(2), range(3), range(2)], names=("a", "b", "c") ) idx = pd.Index(range(2), name="b") - s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) - s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) + s1 = Series(np.arange(12, dtype="int64"), index=midx) + s2 = Series(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) res1l, res1r = s1.align(s2, join="left") @@ -134,7 +134,7 @@ def test_align_multiindex(): expl = s1 tm.assert_series_equal(expl, res1l) tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + expr = Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) tm.assert_series_equal(expr, res1r) tm.assert_series_equal(expr, res2l) @@ -144,10 +144,10 @@ def test_align_multiindex(): exp_idx = pd.MultiIndex.from_product( [range(2), range(2), range(2)], names=("a", "b", "c") ) - expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + expl = Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) tm.assert_series_equal(expl, res1l) tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + expr = Series([0, 0, 1, 1] * 2, index=exp_idx) tm.assert_series_equal(expr, res1r) tm.assert_series_equal(expr, res2l) @@ -155,7 +155,7 @@ def test_align_multiindex(): @pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) def test_align_with_dataframe_method(method): # GH31788 - ser = pd.Series(range(3), index=range(3)) + ser = Series(range(3), index=range(3)) df = pd.DataFrame(0.0, index=range(3), columns=range(3)) result_ser, result_df = ser.align(df, method=method) diff --git a/pandas/tests/series/methods/test_append.py b/pandas/tests/series/methods/test_append.py index 82bde7c233626..4c2bf4683d17d 100644 --- a/pandas/tests/series/methods/test_append.py +++ b/pandas/tests/series/methods/test_append.py @@ -29,14 +29,14 @@ def test_append_many(self, datetime_series): def test_append_duplicates(self): # GH 13677 - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + exp = Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) tm.assert_series_equal(s1.append(s2), exp) tm.assert_series_equal(pd.concat([s1, s2]), exp) # the result must have RangeIndex - exp = pd.Series([1, 2, 3, 4, 5, 6]) + exp = Series([1, 2, 3, 4, 5, 6]) tm.assert_series_equal( s1.append(s2, ignore_index=True), exp, check_index_type=True ) @@ -52,7 +52,7 @@ def test_append_duplicates(self): def test_append_tuples(self): # GH 28410 - s = pd.Series([1, 2, 3]) + s = Series([1, 2, 3]) list_input = [s, s] tuple_input = (s, s) @@ -63,7 +63,7 @@ def test_append_tuples(self): def test_append_dataframe_raises(self): # GH 31413 - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) msg = "to_append should be a Series or list/tuple of Series, got DataFrame" with pytest.raises(TypeError, match=msg): diff --git 
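test_append_duplicates above pins down index semantics that apply equally to `pd.concat` (which the test also checks): by default the original index labels are carried over, duplicates included, while `ignore_index=True` relabels the result with a fresh RangeIndex. A standalone illustration using `pd.concat`:

import pandas as pd

s1 = pd.Series([1, 2, 3])
s2 = pd.Series([4, 5, 6])

# Default: index labels come along, so 0, 1, 2 repeat in the result.
print(pd.concat([s1, s2]).index.tolist())  # [0, 1, 2, 0, 1, 2]

# ignore_index=True discards the labels and builds a fresh RangeIndex.
print(pd.concat([s1, s2], ignore_index=True).index.tolist())  # [0, 1, 2, 3, 4, 5]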
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index eea839c380f0b..8044b590b3463 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -1,11 +1,111 @@
+from datetime import datetime, timedelta
+from importlib import reload
+import string
+import sys
+
 import numpy as np
 import pytest
 
-from pandas import NA, Interval, Series, Timestamp, date_range
+from pandas._libs.tslibs import iNaT
+
+from pandas import (
+    NA,
+    Categorical,
+    CategoricalDtype,
+    Index,
+    Interval,
+    NaT,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+)
 import pandas._testing as tm
 
 
+class TestAstypeAPI:
+    def test_arg_for_errors_in_astype(self):
+        # see GH#14878
+        ser = Series([1, 2, 3])
+
+        msg = (
+            r"Expected value of kwarg 'errors' to be one of \['raise', "
+            r"'ignore'\]\. Supplied value is 'False'"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ser.astype(np.float64, errors=False)
+
+        ser.astype(np.int8, errors="raise")
+
+    @pytest.mark.parametrize("dtype_class", [dict, Series])
+    def test_astype_dict_like(self, dtype_class):
+        # see GH#7271
+        ser = Series(range(0, 10, 2), name="abc")
+
+        dt1 = dtype_class({"abc": str})
+        result = ser.astype(dt1)
+        expected = Series(["0", "2", "4", "6", "8"], name="abc")
+        tm.assert_series_equal(result, expected)
+
+        dt2 = dtype_class({"abc": "float64"})
+        result = ser.astype(dt2)
+        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc")
+        tm.assert_series_equal(result, expected)
+
+        dt3 = dtype_class({"abc": str, "def": str})
+        msg = (
+            "Only the Series name can be used for the key in Series dtype "
+            r"mappings\."
+        )
+        with pytest.raises(KeyError, match=msg):
+            ser.astype(dt3)
+
+        dt4 = dtype_class({0: str})
+        with pytest.raises(KeyError, match=msg):
+            ser.astype(dt4)
+
+        # GH#16717
+        # if dtypes provided is empty, it should error
+        if dtype_class is Series:
+            dt5 = dtype_class({}, dtype=object)
+        else:
+            dt5 = dtype_class({})
+
+        with pytest.raises(KeyError, match=msg):
+            ser.astype(dt5)
+
+
 class TestAstype:
+    def test_astype_float_to_period(self):
+        result = Series([np.nan]).astype("period[D]")
+        expected = Series([NaT], dtype="period[D]")
+        tm.assert_series_equal(result, expected)
+
+    def test_astype_no_pandas_dtype(self):
+        # https://github.com/pandas-dev/pandas/pull/24866
+        ser = Series([1, 2], dtype="int64")
+        # Don't have PandasDtype in the public API, so we use `.array.dtype`,
+        # which is a PandasDtype.
+        result = ser.astype(ser.array.dtype)
+        tm.assert_series_equal(result, ser)
+
+    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
+    def test_astype_generic_timestamp_no_frequency(self, dtype, request):
+        # see GH#15524, GH#15987
+        data = [1]
+        s = Series(data)
+
+        if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
+            mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
+            request.node.add_marker(mark)
+
+        msg = (
+            fr"The '{dtype.__name__}' dtype has no unit\. "
+            fr"Please pass in '{dtype.__name__}\[ns\]' instead."
+        )
+        with pytest.raises(ValueError, match=msg):
+            s.astype(dtype)
+
     def test_astype_dt64_to_str(self):
         # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
         dti = date_range("2012-01-01", periods=3)
@@ -27,6 +127,87 @@ def test_astype_dt64tz_to_str(self):
         )
         tm.assert_series_equal(result, expected)
 
+    def test_astype_datetime(self):
+        s = Series(iNaT, dtype="M8[ns]", index=range(5))
+
+        s = s.astype("O")
+        assert s.dtype == np.object_
+
+        s = Series([datetime(2001, 1, 2, 0, 0)])
+
+        s = s.astype("O")
+        assert s.dtype == np.object_
+
+        s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])
+
+        s[1] = np.nan
+        assert s.dtype == "M8[ns]"
+
+        s = s.astype("O")
+        assert s.dtype == np.object_
+
+    def test_astype_datetime64tz(self):
+        s = Series(date_range("20130101", periods=3, tz="US/Eastern"))
+
+        # astype
+        result = s.astype(object)
+        expected = Series(s.astype(object), dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz)
+        tm.assert_series_equal(result, s)
+
+        # astype - object, preserves on construction
+        result = Series(s.astype(object))
+        expected = s.astype(object)
+        tm.assert_series_equal(result, expected)
+
+        # astype - datetime64[ns, tz]
+        result = Series(s.values).astype("datetime64[ns, US/Eastern]")
+        tm.assert_series_equal(result, s)
+
+        result = Series(s.values).astype(s.dtype)
+        tm.assert_series_equal(result, s)
+
+        result = s.astype("datetime64[ns, CET]")
+        expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET"))
+        tm.assert_series_equal(result, expected)
+
+    def test_astype_str_cast_dt64(self):
+        # see GH#9757
+        ts = Series([Timestamp("2010-01-04 00:00:00")])
+        s = ts.astype(str)
+
+        expected = Series(["2010-01-04"])
+        tm.assert_series_equal(s, expected)
+
+        ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
+        s = ts.astype(str)
+
+        expected = Series(["2010-01-04 00:00:00-05:00"])
+        tm.assert_series_equal(s, expected)
+
+    def test_astype_str_cast_td64(self):
+        # see GH#9757
+
+        td = Series([Timedelta(1, unit="d")])
+        ser = td.astype(str)
+
+        expected = Series(["1 days"])
+        tm.assert_series_equal(ser, expected)
+
+    def test_dt64_series_astype_object(self):
+        dt64ser = Series(date_range("20130101", periods=3))
+        result = dt64ser.astype(object)
+        assert isinstance(result.iloc[0], datetime)
+        assert result.dtype == np.object_
+
+    def test_td64_series_astype_object(self):
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]")
+        result = tdser.astype(object)
+        assert isinstance(result.iloc[0], timedelta)
+        assert result.dtype == np.object_
+
     @pytest.mark.parametrize(
         "values",
         [
@@ -70,3 +251,122 @@ def test_astype_to_str_preserves_na(self, value, string_value):
         result = s.astype(str)
         expected = Series(["a", "b", string_value], dtype=object)
         tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
+    def test_astype(self, dtype):
+        s = Series(np.random.randn(5), name="foo")
+        as_typed = s.astype(dtype)
+
+        assert as_typed.dtype == dtype
+        assert as_typed.name == s.name
+
+    @pytest.mark.parametrize("value", [np.nan, np.inf])
+    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
+    def test_astype_cast_nan_inf_int(self, dtype, value):
+        # gh-14265: check NaN and inf raise error when converting to int
+        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
+        s = Series([value])
+
+        with pytest.raises(ValueError, match=msg):
+            s.astype(dtype)
+
+    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
+    def test_astype_cast_object_int_fail(self, dtype):
+        arr = Series(["car", "house", "tree", "1"])
+        msg = r"invalid literal for int\(\) with base 10: 'car'"
+        with pytest.raises(ValueError, match=msg):
+            arr.astype(dtype)
+
+    def test_astype_cast_object_int(self):
+        arr = Series(["1", "2", "3", "4"], dtype=object)
+        result = arr.astype(int)
+
+        tm.assert_series_equal(result, Series(np.arange(1, 5)))
+
+    def test_astype_unicode(self):
+        # see GH#7758: A bit of magic is required to set
+        # default encoding to utf-8
+        digits = string.digits
+        test_series = [
+            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
+            Series(["データーサイエンス、お前はもう死んでいる"]),
+        ]
+
+        former_encoding = None
+
+        if sys.getdefaultencoding() == "utf-8":
+            test_series.append(Series(["野菜食べないとやばい".encode()]))
+
+        for s in test_series:
+            res = s.astype("unicode")
+            expec = s.map(str)
+            tm.assert_series_equal(res, expec)
+
+        # Restore the former encoding
+        if former_encoding is not None and former_encoding != "utf-8":
+            reload(sys)
+            sys.setdefaultencoding(former_encoding)
+
+
+class TestAstypeCategorical:
+    def test_astype_categoricaldtype(self):
+        s = Series(["a", "b", "a"])
+        result = s.astype(CategoricalDtype(["a", "b"], ordered=True))
+        expected = Series(Categorical(["a", "b", "a"], ordered=True))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(["a", "b"], ordered=False))
+        expected = Series(Categorical(["a", "b", "a"], ordered=False))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False))
+        expected = Series(
+            Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False)
+        )
+        tm.assert_series_equal(result, expected)
+        tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))
+
+    @pytest.mark.parametrize("name", [None, "foo"])
+    @pytest.mark.parametrize("dtype_ordered", [True, False])
+    @pytest.mark.parametrize("series_ordered", [True, False])
+    def test_astype_categorical_to_categorical(
+        self, name, dtype_ordered, series_ordered
+    ):
+        # GH#10696, GH#18593
+        s_data = list("abcaacbab")
+        s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered)
+        s = Series(s_data, dtype=s_dtype, name=name)
+
+        # unspecified categories
+        dtype = CategoricalDtype(ordered=dtype_ordered)
+        result = s.astype(dtype)
+        exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
+        expected = Series(s_data, name=name, dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+        # different categories
+        dtype = CategoricalDtype(list("adc"), dtype_ordered)
+        result = s.astype(dtype)
+        expected = Series(s_data, name=name, dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        if dtype_ordered is False:
+            # not specifying ordered, so only test once
+            expected = s
+            result = s.astype("category")
+            tm.assert_series_equal(result, expected)
+
+    def test_astype_bool_missing_to_categorical(self):
+        # GH-19182
+        s = Series([True, False, np.nan])
+        assert s.dtypes == np.object_
+
+        result = s.astype(CategoricalDtype(categories=[True, False]))
+        expected = Series(Categorical([True, False, np.nan], categories=[True, False]))
+        tm.assert_series_equal(result, expected)
+
+    def test_astype_categories_raises(self):
+        # deprecated GH#17636, removed in GH#27141
+        s = Series(["a", "b", "a"])
+        with pytest.raises(TypeError, match="got an unexpected"):
+            s.astype("category", categories=["a", "b"], ordered=True)
b/pandas/tests/series/methods/test_clip.py index 37764d3b82c2d..5a5a397222b87 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -62,9 +62,9 @@ def test_clip_against_series(self): @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) def test_clip_against_list_like(self, inplace, upper): # GH#15390 - original = pd.Series([5, 6, 7]) + original = Series([5, 6, 7]) result = original.clip(upper=upper, inplace=inplace) - expected = pd.Series([1, 2, 3]) + expected = Series([1, 2, 3]) if inplace: result = original diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 1ee55fbe39513..2a02406f50750 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -76,11 +76,11 @@ def test_combine_first_dt64(self): tm.assert_series_equal(rs, xp) def test_combine_first_dt_tz_values(self, tz_naive_fixture): - ser1 = pd.Series( + ser1 = Series( pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), name="ser1", ) - ser2 = pd.Series( + ser2 = Series( pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture), index=[2, 3, 4], name="ser2", @@ -90,5 +90,5 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): ["20150101", "20150102", "20150103", "20160515", "20160516"], tz=tz_naive_fixture, ) - exp = pd.Series(exp_vals, name="ser1") + exp = Series(exp_vals, name="ser1") tm.assert_series_equal(exp, result) diff --git a/pandas/tests/series/methods/test_convert.py b/pandas/tests/series/methods/test_convert.py new file mode 100644 index 0000000000000..b213e4a6c4c8a --- /dev/null +++ b/pandas/tests/series/methods/test_convert.py @@ -0,0 +1,203 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import NaT, Series, Timestamp +import pandas._testing as tm + + +class TestConvert: + def test_convert(self): + # GH#10265 + # Tests: All to nans, coerce, true + # Test coercion returns correct type + ser = Series(["a", "b", "c"]) + results = ser._convert(datetime=True, coerce=True) + expected = Series([NaT] * 3) + tm.assert_series_equal(results, expected) + + results = ser._convert(numeric=True, coerce=True) + expected = Series([np.nan] * 3) + tm.assert_series_equal(results, expected) + + expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) + results = ser._convert(timedelta=True, coerce=True) + tm.assert_series_equal(results, expected) + + dt = datetime(2001, 1, 1, 0, 0) + td = dt - datetime(2000, 1, 1, 0, 0) + + # Test coercion with mixed types + ser = Series(["a", "3.1415", dt, td]) + results = ser._convert(datetime=True, coerce=True) + expected = Series([NaT, NaT, dt, NaT]) + tm.assert_series_equal(results, expected) + + results = ser._convert(numeric=True, coerce=True) + expected = Series([np.nan, 3.1415, np.nan, np.nan]) + tm.assert_series_equal(results, expected) + + results = ser._convert(timedelta=True, coerce=True) + expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) + tm.assert_series_equal(results, expected) + + # Test standard conversion returns original + results = ser._convert(datetime=True) + tm.assert_series_equal(results, ser) + results = ser._convert(numeric=True) + expected = Series([np.nan, 3.1415, np.nan, np.nan]) + tm.assert_series_equal(results, expected) + results = ser._convert(timedelta=True) + tm.assert_series_equal(results, ser) + + # test pass-through and non-conversion when other types selected + ser = 
Series(["1.0", "2.0", "3.0"]) + results = ser._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(results, expected) + results = ser._convert(True, False, True) + tm.assert_series_equal(results, ser) + + ser = Series( + [datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O" + ) + results = ser._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)]) + tm.assert_series_equal(results, expected) + results = ser._convert(datetime=False, numeric=True, timedelta=True) + tm.assert_series_equal(results, ser) + + td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) + ser = Series([td, td], dtype="O") + results = ser._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([td, td]) + tm.assert_series_equal(results, expected) + results = ser._convert(True, True, False) + tm.assert_series_equal(results, ser) + + ser = Series([1.0, 2, 3], index=["a", "b", "c"]) + result = ser._convert(numeric=True) + tm.assert_series_equal(result, ser) + + # force numeric conversion + res = ser.copy().astype("O") + res["a"] = "1" + result = res._convert(numeric=True) + tm.assert_series_equal(result, ser) + + res = ser.copy().astype("O") + res["a"] = "1." + result = res._convert(numeric=True) + tm.assert_series_equal(result, ser) + + res = ser.copy().astype("O") + res["a"] = "garbled" + result = res._convert(numeric=True) + expected = ser.copy() + expected["a"] = np.nan + tm.assert_series_equal(result, expected) + + # GH 4119, not converting a mixed type (e.g.floats and object) + ser = Series([1, "na", 3, 4]) + result = ser._convert(datetime=True, numeric=True) + expected = Series([1, np.nan, 3, 4]) + tm.assert_series_equal(result, expected) + + ser = Series([1, "", 3, 4]) + result = ser._convert(datetime=True, numeric=True) + tm.assert_series_equal(result, expected) + + # dates + ser = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + ] + ) + s2 = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + "foo", + 1.0, + 1, + Timestamp("20010104"), + "20010105", + ], + dtype="O", + ) + + result = ser._convert(datetime=True) + expected = Series( + [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], + dtype="M8[ns]", + ) + tm.assert_series_equal(result, expected) + + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + expected = Series( + [ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20010103"), + NaT, + NaT, + NaT, + Timestamp("20010104"), + Timestamp("20010105"), + ], + dtype="M8[ns]", + ) + result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) + tm.assert_series_equal(result, expected) + result = s2._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + ser = Series(["foo", "bar", 1, 1.0], dtype="O") + result = ser._convert(datetime=True, coerce=True) + expected = Series([NaT] * 2 + [Timestamp(1)] * 2) + tm.assert_series_equal(result, expected) + + # preserver if non-object + ser = Series([1], dtype="float32") + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, ser) + + # FIXME: dont leave commented-out + # res = ser.copy() + # r[0] = np.nan + # result = res._convert(convert_dates=True,convert_numeric=False) + # assert result.dtype == 'M8[ns]' + + # dateutil parses some single 
letters into today's value as a date + expected = Series([NaT]) + for x in "abcdefghijklmnopqrstuvwxyz": + ser = Series([x]) + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + ser = Series([x.upper()]) + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + def test_convert_no_arg_error(self): + ser = Series(["1.0", "2"]) + msg = r"At least one of datetime, numeric or timedelta must be True\." + with pytest.raises(ValueError, match=msg): + ser._convert() + + def test_convert_preserve_bool(self): + ser = Series([1, True, 3, 5], dtype=object) + res = ser._convert(datetime=True, numeric=True) + expected = Series([1, 1, 3, 5], dtype="i8") + tm.assert_series_equal(res, expected) + + def test_convert_preserve_all_bool(self): + ser = Series([False, True, False, False], dtype=object) + res = ser._convert(datetime=True, numeric=True) + expected = Series([False, True, False, False], dtype=bool) + tm.assert_series_equal(res, expected) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py new file mode 100644 index 0000000000000..6201c0f5f7c29 --- /dev/null +++ b/pandas/tests/series/methods/test_copy.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +from pandas import Series, Timestamp +import pandas._testing as tm + + +class TestCopy: + @pytest.mark.parametrize("deep", [None, False, True]) + def test_copy(self, deep): + + ser = Series(np.arange(10), dtype="float64") + + # default deep is True + if deep is None: + ser2 = ser.copy() + else: + ser2 = ser.copy(deep=deep) + + ser2[::2] = np.NaN + + if deep is None or deep is True: + # Did not modify original Series + assert np.isnan(ser2[0]) + assert not np.isnan(ser[0]) + else: + # we DID modify the original Series + assert np.isnan(ser2[0]) + assert np.isnan(ser[0]) + + @pytest.mark.parametrize("deep", [None, False, True]) + def test_copy_tzaware(self, deep): + # GH#11794 + # copy of tz-aware + expected = Series([Timestamp("2012/01/01", tz="UTC")]) + expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) + + ser = Series([Timestamp("2012/01/01", tz="UTC")]) + + if deep is None: + ser2 = ser.copy() + else: + ser2 = ser.copy(deep=deep) + + ser2[0] = Timestamp("1999/01/01", tz="UTC") + + # default deep is True + if deep is None or deep is True: + # Did not modify original Series + tm.assert_series_equal(ser2, expected2) + tm.assert_series_equal(ser, expected) + else: + # we DID modify the original Series + tm.assert_series_equal(ser2, expected2) + tm.assert_series_equal(ser, expected2) + + def test_copy_name(self, datetime_series): + result = datetime_series.copy() + assert result.name == datetime_series.name + + def test_copy_index_name_checking(self, datetime_series): + # don't want to be able to modify the index stored elsewhere after + # making a copy + + datetime_series.index.name = None + assert datetime_series.index.name is None + assert datetime_series is datetime_series + + cp = datetime_series.copy() + cp.index.name = "foo" + assert datetime_series.index.name is None diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 19290b6a5c23f..7fff87c7b55f4 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -7,8 +7,46 @@ class TestSeriesCount: + def test_count_level_series(self): + index = MultiIndex( + levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], + codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], + 
) + + ser = Series(np.random.randn(len(index)), index=index) + + result = ser.count(level=0) + expected = ser.groupby(level=0).count() + tm.assert_series_equal( + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) + + result = ser.count(level=1) + expected = ser.groupby(level=1).count() + tm.assert_series_equal( + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) + + def test_count_multiindex(self, series_with_multilevel_index): + ser = series_with_multilevel_index + + series = ser.copy() + series.index.names = ["a", "b"] + + result = series.count(level="b") + expect = ser.count(level=1).rename_axis("b") + tm.assert_series_equal(result, expect) + + result = series.count(level="a") + expect = ser.count(level=0).rename_axis("a") + tm.assert_series_equal(result, expect) + + msg = "Level x not found" + with pytest.raises(KeyError, match=msg): + series.count("x") + def test_count_level_without_multiindex(self): - ser = pd.Series(range(3)) + ser = Series(range(3)) msg = "Series.count level is only valid with a MultiIndex" with pytest.raises(ValueError, match=msg): @@ -33,7 +71,7 @@ def test_count(self, datetime_series): # GH#29478 with pd.option_context("use_inf_as_na", True): - assert pd.Series([pd.Timestamp("1990/1/1")]).count() == 1 + assert Series([pd.Timestamp("1990/1/1")]).count() == 1 def test_count_categorical(self): diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 282f499506aae..f01ed73c0165f 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -135,8 +135,8 @@ def test_corr_rank(self): def test_corr_invalid_method(self): # GH PR #22298 - s1 = pd.Series(np.random.randn(10)) - s2 = pd.Series(np.random.randn(10)) + s1 = Series(np.random.randn(10)) + s2 = Series(np.random.randn(10)) msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " with pytest.raises(ValueError, match=msg): s1.corr(s2, method="____") diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py index 197fe9ff68df2..7ded8ac902d78 100644 --- a/pandas/tests/series/methods/test_drop.py +++ b/pandas/tests/series/methods/test_drop.py @@ -1,6 +1,5 @@ import pytest -import pandas as pd from pandas import Series import pandas._testing as tm @@ -66,8 +65,8 @@ def test_drop_with_ignore_errors(): def test_drop_empty_list(index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] - series = pd.Series(index=index, dtype=object).drop(drop_labels) - expected = pd.Series(index=expected_index, dtype=object) + series = Series(index=index, dtype=object).drop(drop_labels) + expected = Series(index=expected_index, dtype=object) tm.assert_series_equal(series, expected) @@ -82,6 +81,6 @@ def test_drop_empty_list(index, drop_labels): def test_drop_non_empty_list(data, index, drop_labels): # GH 21494 and GH 16877 dtype = object if data is None else None - ser = pd.Series(data=data, index=index, dtype=dtype) + ser = Series(data=data, index=index, dtype=dtype) with pytest.raises(KeyError, match="not found in axis"): ser.drop(drop_labels) diff --git a/pandas/tests/series/methods/test_dropna.py b/pandas/tests/series/methods/test_dropna.py new file mode 100644 index 0000000000000..f56230daea190 --- /dev/null +++ b/pandas/tests/series/methods/test_dropna.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest + +from pandas import DatetimeIndex, IntervalIndex, NaT, Period, Series, Timestamp +import 
pandas._testing as tm + + +class TestDropna: + def test_dropna_empty(self): + ser = Series([], dtype=object) + + assert len(ser.dropna()) == 0 + return_value = ser.dropna(inplace=True) + assert return_value is None + assert len(ser) == 0 + + # invalid axis + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + ser.dropna(axis=1) + + def test_dropna_preserve_name(self, datetime_series): + datetime_series[:5] = np.nan + result = datetime_series.dropna() + assert result.name == datetime_series.name + name = datetime_series.name + ts = datetime_series.copy() + return_value = ts.dropna(inplace=True) + assert return_value is None + assert ts.name == name + + def test_dropna_no_nan(self): + for ser in [ + Series([1, 2, 3], name="x"), + Series([False, True, False], name="x"), + ]: + + result = ser.dropna() + tm.assert_series_equal(result, ser) + assert result is not ser + + s2 = ser.copy() + return_value = s2.dropna(inplace=True) + assert return_value is None + tm.assert_series_equal(s2, ser) + + def test_dropna_intervals(self): + ser = Series( + [np.nan, 1, 2, 3], + IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]), + ) + + result = ser.dropna() + expected = ser.iloc[1:] + tm.assert_series_equal(result, expected) + + def test_dropna_period_dtype(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + result = ser.dropna() + expected = Series([Period("2011-01", freq="M")]) + + tm.assert_series_equal(result, expected) + + def test_datetime64_tz_dropna(self): + # DatetimeBlock + ser = Series( + [ + Timestamp("2011-01-01 10:00"), + NaT, + Timestamp("2011-01-03 10:00"), + NaT, + ] + ) + result = ser.dropna() + expected = Series( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] + ) + tm.assert_series_equal(result, expected) + + # DatetimeBlockTZ + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz="Asia/Tokyo" + ) + ser = Series(idx) + assert ser.dtype == "datetime64[ns, Asia/Tokyo]" + result = ser.dropna() + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), + ], + index=[0, 2], + ) + assert result.dtype == "datetime64[ns, Asia/Tokyo]" + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index b6a6f4e8200d4..d45486b9bdb29 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -1,13 +1,541 @@ -from datetime import timedelta +from datetime import datetime, timedelta import numpy as np import pytest +import pytz -from pandas import Categorical, DataFrame, NaT, Period, Series, Timedelta, Timestamp +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + Timestamp, + isna, +) import pandas._testing as tm class TestSeriesFillNA: + def test_fillna_nat(self): + series = Series([0, 1, 2, NaT.value], dtype="M8[ns]") + + filled = series.fillna(method="pad") + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + + df = DataFrame({"A": series}) + filled = df.fillna(method="pad") + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + 
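# mirror of the pad case above: with the NaT leading, bfill must pull from the next valid value +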
series = Series([NaT.value, 0, 1, 2], dtype="M8[ns]") + + filled = series.fillna(method="bfill") + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + + df = DataFrame({"A": series}) + filled = df.fillna(method="bfill") + filled2 = df.fillna(value=series[1]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + def test_fillna(self, datetime_series): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + + tm.assert_series_equal(ts, ts.fillna(method="ffill")) + + ts[2] = np.NaN + + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) + + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) + + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(value=5), exp) + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + ts.fillna() + + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_series.fillna(value=0, method="ffill") + + # GH#5703 + s1 = Series([np.nan]) + s2 = Series([1]) + result = s1.fillna(s2) + expected = Series([1.0]) + tm.assert_series_equal(result, expected) + result = s1.fillna({}) + tm.assert_series_equal(result, s1) + result = s1.fillna(Series((), dtype=object)) + tm.assert_series_equal(result, s1) + result = s2.fillna(s1) + tm.assert_series_equal(result, s2) + result = s1.fillna({0: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna({1: 1}) + tm.assert_series_equal(result, Series([np.nan])) + result = s1.fillna({0: 1, 1: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1})) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) + tm.assert_series_equal(result, s1) + + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) + result = s2.fillna(s1) + expected = Series([0, 0, 2.0], list("bac")) + tm.assert_series_equal(result, expected) + + # limit + ser = Series(np.nan, index=[0, 1, 2]) + result = ser.fillna(999, limit=1) + expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + result = ser.fillna(999, limit=2) + expected = Series([999, 999, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + # GH#9043 + # make sure a string representation of int/float values can be filled + # correctly without raising errors or being converted + vals = ["0", "1.5", "-0.3"] + for val in vals: + ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64") + result = ser.fillna(val) + expected = Series([0, 1, val, val, 4], dtype="object") + tm.assert_series_equal(result, expected) + + def test_fillna_consistency(self): + # GH#16402 + # fillna with a tz aware to a tz-naive, should result in object + + ser = Series([Timestamp("20130101"), NaT]) + + result = ser.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) + tm.assert_series_equal(result, expected) + + # where (we ignore the errors=) + result = ser.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) + tm.assert_series_equal(result, expected) + + 
# with a non-datetime + result = ser.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) + tm.assert_series_equal(result, expected) + + # assignment + ser2 = ser.copy() + ser2[1] = "foo" + tm.assert_series_equal(ser2, expected) + + def test_fillna_downcast(self): + # GH#15277 + # infer int64 from float64 + ser = Series([1.0, np.nan]) + result = ser.fillna(0, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + ser = Series([1.0, np.nan]) + result = ser.fillna({1: 0}, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + def test_timedelta_fillna(self): + # GH#3371 + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + td = ser.diff() + + # reg fillna + result = td.fillna(Timedelta(seconds=0)) + expected = Series( + [ + timedelta(0), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + # interpreted as seconds, deprecated + with pytest.raises(TypeError, match="Passing integers to fillna"): + td.fillna(1) + + result = td.fillna(Timedelta(seconds=1)) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(timedelta(days=1, seconds=1)) + expected = Series( + [ + timedelta(days=1, seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(np.timedelta64(int(1e9))) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(NaT) + expected = Series( + [ + NaT, + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ], + dtype="m8[ns]", + ) + tm.assert_series_equal(result, expected) + + # ffill + td[2] = np.nan + result = td.ffill() + expected = td.fillna(Timedelta(seconds=0)) + expected[0] = np.nan + tm.assert_series_equal(result, expected) + + # bfill + td[2] = np.nan + result = td.bfill() + expected = td.fillna(Timedelta(seconds=0)) + expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) + tm.assert_series_equal(result, expected) + + def test_datetime64_fillna(self): + + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + ser[2] = np.nan + + # ffill + result = ser.ffill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # bfill + result = ser.bfill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # GH#6587 + # make sure that we are treating as integer when filling + # this also tests inference of a datetime-like with NaT's + ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) + expected = Series( + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 
15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = ser.fillna(method="backfill") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_datetime64_tz_fillna(self, tz): + # DatetimeBlock + ser = Series( + [ + Timestamp("2011-01-01 10:00"), + NaT, + Timestamp("2011-01-03 10:00"), + NaT, + ] + ) + null_loc = Series([False, True, False, True]) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + {1: Timestamp("2011-01-02 10:00"), 3: Timestamp("2011-01-04 10:00")} + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # DatetimeBlockTZ + idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) + ser = Series(idx) + assert ser.dtype == f"datetime64[ns, {tz}]" + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = 
ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # filling with a naive/other zone, coerce to object + result = ser.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + def test_fillna_dt64tz_with_method(self): + # with timezone + # GH#15855 + ser = Series([Timestamp("2012-11-11 00:00:00+01:00"), NaT]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="pad"), exp) + + ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="bfill"), exp) + def test_fillna_pytimedelta(self): # GH#8209 ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) @@ -153,6 +681,12 @@ def test_fillna_categorical_raises(self): # --------------------------------------------------------------- # Invalid Usages + def test_fillna_invalid_method(self, datetime_series): + with pytest.raises(ValueError, match="ffil"): + datetime_series.fillna(method="ffil") + + def test_fillna_listlike_invalid(self): ser = Series(np.random.randint(-100, 100, 50)) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' @@ -176,3 +710,104 @@ def test_fillna_method_and_limit_invalid(self): for method in ["backfill", "bfill", "pad", "ffill", None]: with pytest.raises(ValueError, match=msg): ser.fillna(1, limit=limit, method=method) + + +class TestFillnaPad: + def test_fillna_bug(self): + ser = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + filled = ser.fillna(method="ffill") + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], ser.index) + tm.assert_series_equal(filled, expected) + + filled = ser.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], ser.index) + tm.assert_series_equal(filled, expected) + + def test_ffill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) + + def test_ffill_mixed_dtypes_without_missing_data(self): + # GH#14956 + series = Series([datetime(2015, 1, 1, 
tzinfo=pytz.utc), 1]) + result = series.ffill() + tm.assert_series_equal(series, result) + + def test_bfill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) + + def test_pad_nan(self): + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) + + return_value = x.fillna(method="pad", inplace=True) + assert return_value is None + + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) + tm.assert_series_equal(x[1:], expected[1:]) + assert np.isnan(x[0]) and np.isnan(expected[0]) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method="bfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method="backfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_fillna_int(self): + ser = Series(np.random.randint(-100, 100, 50)) + return_value = ser.fillna(method="ffill", inplace=True) + assert return_value is None + tm.assert_series_equal(ser.fillna(method="ffill", inplace=False), ser) + + def test_datetime64tz_fillna_round_issue(self): + # GH#14872 + + data = Series( + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) + + filled = data.fillna(method="bfill") + + expected = Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) + + tm.assert_series_equal(filled, expected) diff --git a/pandas/tests/series/methods/test_infer_objects.py b/pandas/tests/series/methods/test_infer_objects.py new file mode 100644 index 0000000000000..bb83f62f5ebb5 --- /dev/null +++ b/pandas/tests/series/methods/test_infer_objects.py @@ -0,0 +1,23 @@ +import numpy as np + +from pandas import Series +import pandas._testing as tm + + +class TestInferObjects: + def test_infer_objects_series(self): + # GH#11221 + actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects() + expected = Series([1, 2, 3]) + tm.assert_series_equal(actual, expected) + + actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects() + expected = Series([1.0, 2.0, 3.0, np.nan]) + tm.assert_series_equal(actual, expected) + + # only soft conversions; unconvertible values pass through unchanged + actual = Series(np.array([1, 2, 3, None, "a"], dtype="O")).infer_objects() + expected = Series([1, 2, 3, None, "a"]) + + assert actual.dtype == "object" + tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 9fc468221ee2d..1b05f72f5cf4d 100644 ---
a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -314,14 +314,14 @@ def test_interp_limit(self): @pytest.mark.parametrize("limit", [-1, 0]) def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, limit): # GH 9217: make sure limit is greater than zero. - s = pd.Series([1, 2, np.nan, 4]) + s = Series([1, 2, np.nan, 4]) method, kwargs = nontemporal_method with pytest.raises(ValueError, match="Limit must be greater than 0"): s.interpolate(limit=limit, method=method, **kwargs) def test_interpolate_invalid_float_limit(self, nontemporal_method): # GH 9217: make sure limit is an integer. - s = pd.Series([1, 2, np.nan, 4]) + s = Series([1, 2, np.nan, 4]) method, kwargs = nontemporal_method limit = 2.0 with pytest.raises(ValueError, match="Limit must be an integer"): @@ -561,17 +561,17 @@ def test_interp_datetime64(self, method, tz_naive_fixture): def test_interp_pad_datetime64tz_values(self): # GH#27628 missing.interpolate_2d should handle datetimetz values dti = pd.date_range("2015-04-05", periods=3, tz="US/Central") - ser = pd.Series(dti) + ser = Series(dti) ser[1] = pd.NaT result = ser.interpolate(method="pad") - expected = pd.Series(dti) + expected = Series(dti) expected[1] = expected[0] tm.assert_series_equal(result, expected) def test_interp_limit_no_nans(self): # GH 7173 - s = pd.Series([1.0, 2.0, 3.0]) + s = Series([1.0, 2.0, 3.0]) result = s.interpolate(limit=1) expected = s tm.assert_series_equal(result, expected) @@ -654,13 +654,13 @@ def test_series_interpolate_method_values(self): def test_series_interpolate_intraday(self): # #1698 index = pd.date_range("1/1/2012", periods=4, freq="12D") - ts = pd.Series([0, 12, 24, 36], index) + ts = Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(days=1)).sort_values() exp = ts.reindex(new_index).interpolate(method="time") index = pd.date_range("1/1/2012", periods=4, freq="12H") - ts = pd.Series([0, 12, 24, 36], index) + ts = Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() result = ts.reindex(new_index).interpolate(method="time") @@ -684,7 +684,7 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): if method == "linear": result = df[0].interpolate(**kwargs) - expected = pd.Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + expected = Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) tm.assert_series_equal(result, expected) else: expected_error = ( @@ -712,7 +712,7 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): if method in {"linear", "pchip"}: result = df[0].interpolate(method=method, **kwargs) - expected = pd.Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + expected = Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) tm.assert_series_equal(result, expected) else: pytest.skip( @@ -725,7 +725,7 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): ) def test_interpolate_unsorted_index(self, ascending, expected_values): # GH 21037 - ts = pd.Series(data=[10, 9, np.nan, 2, 1], index=[10, 9, 3, 2, 1]) + ts = Series(data=[10, 9, np.nan, 2, 1], index=[10, 9, 3, 2, 1]) result = ts.sort_index(ascending=ascending).interpolate(method="index") - expected = pd.Series(data=expected_values, index=expected_values, dtype=float) + expected = Series(data=expected_values, index=expected_values, dtype=float) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 
3836c1d56bf87..62766c692f4df 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -80,3 +80,12 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) + + def test_isin_read_only(self): + # https://github.com/pandas-dev/pandas/issues/37174 + arr = np.array([1, 2, 3]) + arr.setflags(write=False) + s = Series([1, 2, 3]) + result = s.isin(arr) + expected = Series([True, True, True]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isna.py b/pandas/tests/series/methods/test_isna.py new file mode 100644 index 0000000000000..1760b0b9726e0 --- /dev/null +++ b/pandas/tests/series/methods/test_isna.py @@ -0,0 +1,32 @@ +""" +We also test Series.notna in this file. +""" +import numpy as np + +from pandas import Period, Series +import pandas._testing as tm + + +class TestIsna: + def test_isna_period_dtype(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + + expected = Series([False, True]) + + result = ser.isna() + tm.assert_series_equal(result, expected) + + result = ser.notna() + tm.assert_series_equal(result, ~expected) + + def test_isna(self): + ser = Series([0, 5.4, 3, np.nan, -0.001]) + expected = Series([False, False, False, True, False]) + tm.assert_series_equal(ser.isna(), expected) + tm.assert_series_equal(ser.notna(), ~expected) + + ser = Series(["hi", "", np.nan]) + expected = Series([False, False, True]) + tm.assert_series_equal(ser.isna(), expected) + tm.assert_series_equal(ser.notna(), ~expected) diff --git a/pandas/tests/series/methods/test_matmul.py b/pandas/tests/series/methods/test_matmul.py new file mode 100644 index 0000000000000..c311f1fd880a3 --- /dev/null +++ b/pandas/tests/series/methods/test_matmul.py @@ -0,0 +1,75 @@ +import operator + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestMatmul: + def test_matmul(self): + # matmul test is for GH#10259 + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T + + # Series @ DataFrame -> Series + result = operator.matmul(a, b) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # DataFrame @ Series -> Series + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # Series @ Series -> scalar + result = operator.matmul(a, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # vector (1D np.array) @ Series (__rmatmul__) + result = operator.matmul(a.values, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # vector (1D list) @ Series (__rmatmul__) + result = operator.matmul(a.values.tolist(), a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # matrix (2D np.array) @ Series (__rmatmul__) + result = operator.matmul(b.T.values, a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # matrix (2D nested lists) @ Series (__rmatmul__) + result = operator.matmul(b.T.values.tolist(), a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # mixed dtype DataFrame @ Series + a["p"] = 
int(a.p) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # different dtypes DataFrame @ Series + a = a.astype(int) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + a.dot(a.values[:3]) + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + a.dot(b.T) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 79f50afca658f..964d62602edaa 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -45,7 +45,7 @@ def test_quantile_multi(self, datetime_series): qs = [0.1, 0.9] result = datetime_series.quantile(qs) - expected = pd.Series( + expected = Series( [ np.percentile(datetime_series.dropna(), 10), np.percentile(datetime_series.dropna(), 90), @@ -66,7 +66,7 @@ def test_quantile_multi(self, datetime_series): tm.assert_series_equal(result, expected) result = datetime_series.quantile([]) - expected = pd.Series( + expected = Series( [], name=datetime_series.name, index=Index([], dtype=float), dtype="float64" ) tm.assert_series_equal(result, expected) @@ -87,18 +87,18 @@ def test_quantile_interpolation_dtype(self): # GH #10174 # interpolation = linear (default case) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="lower") + q = Series([1, 3, 4]).quantile(0.5, interpolation="lower") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="higher") + q = Series([1, 3, 4]).quantile(0.5, interpolation="higher") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) def test_quantile_nan(self): # GH 13098 - s = pd.Series([1, 2, 3, 4, np.nan]) + s = Series([1, 2, 3, 4, np.nan]) result = s.quantile(0.5) expected = 2.5 assert result == expected @@ -112,10 +112,10 @@ def test_quantile_nan(self): assert np.isnan(res) res = s.quantile([0.5]) - tm.assert_series_equal(res, pd.Series([np.nan], index=[0.5])) + tm.assert_series_equal(res, Series([np.nan], index=[0.5])) res = s.quantile([0.2, 0.3]) - tm.assert_series_equal(res, pd.Series([np.nan, np.nan], index=[0.2, 0.3])) + tm.assert_series_equal(res, Series([np.nan, np.nan], index=[0.2, 0.3])) @pytest.mark.parametrize( "case", @@ -153,12 +153,12 @@ def test_quantile_nan(self): ], ) def test_quantile_box(self, case): - s = pd.Series(case, name="XXX") + s = Series(case, name="XXX") res = s.quantile(0.5) assert res == case[1] res = s.quantile([0.5]) - exp = pd.Series([case[1]], index=[0.5], name="XXX") + exp = Series([case[1]], index=[0.5], name="XXX") tm.assert_series_equal(res, exp) def test_datetime_timedelta_quantiles(self): @@ -171,16 +171,16 @@ def test_quantile_nat(self): assert res is pd.NaT res = Series([pd.NaT, pd.NaT]).quantile([0.5]) - tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + tm.assert_series_equal(res, Series([pd.NaT], index=[0.5])) @pytest.mark.parametrize( "values, dtype", [([0, 0, 0, 1, 2, 3], "Sparse[int]"), ([0.0, None, 1.0, 2.0], "Sparse[float]")], ) def test_quantile_sparse(self, values, dtype): - ser = pd.Series(values, dtype=dtype) + ser = Series(values, dtype=dtype) result = ser.quantile([0.5]) - expected = 
pd.Series(np.asarray(ser)).quantile([0.5]) + expected = Series(np.asarray(ser)).quantile([0.5]) tm.assert_series_equal(result, expected) def test_quantile_empty(self): diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/methods/test_reindex.py similarity index 100% rename from pandas/tests/series/indexing/test_alter_index.py rename to pandas/tests/series/methods/test_reindex.py diff --git a/pandas/tests/series/methods/test_repeat.py b/pandas/tests/series/methods/test_repeat.py new file mode 100644 index 0000000000000..32f7384d34ebd --- /dev/null +++ b/pandas/tests/series/methods/test_repeat.py @@ -0,0 +1,37 @@ +import numpy as np +import pytest + +from pandas import MultiIndex, Series +import pandas._testing as tm + + +class TestRepeat: + def test_repeat(self): + ser = Series(np.random.randn(3), index=["a", "b", "c"]) + + reps = ser.repeat(5) + exp = Series(ser.values.repeat(5), index=ser.index.values.repeat(5)) + tm.assert_series_equal(reps, exp) + + to_rep = [2, 3, 4] + reps = ser.repeat(to_rep) + exp = Series(ser.values.repeat(to_rep), index=ser.index.values.repeat(to_rep)) + tm.assert_series_equal(reps, exp) + + def test_numpy_repeat(self): + ser = Series(np.arange(3), name="x") + expected = Series( + ser.values.repeat(2), name="x", index=ser.index.values.repeat(2) + ) + tm.assert_series_equal(np.repeat(ser, 2), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(ser, 2, axis=0) + + def test_repeat_with_multiindex(self): + # GH#9361, fixed by GH#7891 + m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) + data = ["a", "b", "c", "d"] + m_df = Series(data, index=m_idx) + assert m_df.repeat(3).shape == (3 * len(data),) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 1474bb95f4af2..13d6a3b1447a1 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -111,6 +111,18 @@ def test_reset_index_drop_errors(self): with pytest.raises(KeyError, match="not found"): s.reset_index("wrong", drop=True) + def test_reset_index_with_drop(self, series_with_multilevel_index): + ser = series_with_multilevel_index + + deleveled = ser.reset_index() + assert isinstance(deleveled, DataFrame) + assert len(deleveled.columns) == len(ser.index.levels) + 1 + assert deleveled.index.name == ser.index.name + + deleveled = ser.reset_index(drop=True) + assert isinstance(deleveled, Series) + assert deleveled.index.name == ser.index.name + @pytest.mark.parametrize( "array, dtype", diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index da6407c73104c..4b23820caeeb4 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -19,6 +19,25 @@ class TestShift: + @pytest.mark.parametrize( + "ser", + [ + Series([np.arange(5)]), + date_range("1/1/2011", periods=24, freq="H"), + Series(range(5), index=date_range("2017", periods=5)), + ], + ) + @pytest.mark.parametrize("shift_size", [0, 1, 2]) + def test_shift_always_copy(self, ser, shift_size): + # GH22397 + assert ser.shift(shift_size) is not ser + + @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) + def test_datetime_shift_always_copy(self, move_by_freq): + # GH#22397 + ser = Series(range(5), index=date_range("2017", periods=5)) + assert ser.shift(freq=move_by_freq) is not ser + def test_shift(self, datetime_series): 
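+ # shifting forward then back should round-trip the values, leaving NaNs at the boundary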
shifted = datetime_series.shift(1) unshifted = shifted.shift(-1) @@ -135,14 +154,14 @@ def test_shift_fill_value(self): result = ts.shift(2, fill_value=0.0) tm.assert_series_equal(result, exp) - ts = pd.Series([1, 2, 3]) + ts = Series([1, 2, 3]) res = ts.shift(2, fill_value=0) assert res.dtype == ts.dtype def test_shift_categorical_fill_value(self): - ts = pd.Series(["a", "b", "c", "d"], dtype="category") + ts = Series(["a", "b", "c", "d"], dtype="category") res = ts.shift(1, fill_value="a") - expected = pd.Series( + expected = Series( pd.Categorical( ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False ) @@ -302,7 +321,7 @@ def test_shift_object_non_scalar_fill(self): def test_shift_categorical(self): # GH#9416 - s = pd.Series(["a", "b", "c", "d"], dtype="category") + s = Series(["a", "b", "c", "d"], dtype="category") tm.assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) @@ -321,25 +340,25 @@ def test_shift_categorical(self): def test_shift_dt64values_int_fill_deprecated(self): # GH#31971 - ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) with tm.assert_produces_warning(FutureWarning): result = ser.shift(1, fill_value=0) - expected = pd.Series([pd.Timestamp(0), ser[0]]) + expected = Series([pd.Timestamp(0), ser[0]]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("periods", [1, 2, 3, 4]) def test_shift_preserve_freqstr(self, periods): # GH#21275 - ser = pd.Series( + ser = Series( range(periods), index=pd.date_range("2016-1-1 00:00:00", periods=periods, freq="H"), ) result = ser.shift(1, "2H") - expected = pd.Series( + expected = Series( range(periods), index=pd.date_range("2016-1-1 02:00:00", periods=periods, freq="H"), ) @@ -353,7 +372,7 @@ def test_shift_non_writable_array(self, input_data, output_data): # GH21049 Verify whether non writable numpy array is shiftable input_data.setflags(write=False) - result = pd.Series(input_data).shift(1) - expected = pd.Series(output_data, dtype="float64") + result = Series(input_data).shift(1) + expected = Series(output_data, dtype="float64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/methods/test_to_csv.py similarity index 75% rename from pandas/tests/series/test_io.py rename to pandas/tests/series/methods/test_to_csv.py index b12ebd58e6a7b..a72e860340f25 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series +from pandas import Series import pandas._testing as tm from pandas.io.common import get_handle @@ -180,62 +180,3 @@ def test_to_csv_interval_index(self): expected.index = expected.index.astype(str) tm.assert_series_equal(result, expected) - - -class TestSeriesIO: - def test_to_frame(self, datetime_series): - datetime_series.name = None - rs = datetime_series.to_frame() - xp = pd.DataFrame(datetime_series.values, index=datetime_series.index) - tm.assert_frame_equal(rs, xp) - - datetime_series.name = "testname" - rs = datetime_series.to_frame() - xp = pd.DataFrame( - dict(testname=datetime_series.values), index=datetime_series.index - ) - tm.assert_frame_equal(rs, xp) - - rs = datetime_series.to_frame(name="testdifferent") - xp = pd.DataFrame( - dict(testdifferent=datetime_series.values), index=datetime_series.index - ) - tm.assert_frame_equal(rs, xp) - - def test_timeseries_periodindex(self): - # GH2891 - from 
pandas import period_range - - prng = period_range("1/1/2011", "1/1/2012", freq="M") - ts = Series(np.random.randn(len(prng)), prng) - new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == "M" - - def test_pickle_preserve_name(self): - for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, 2)]: - unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) - assert unpickled.name == n - - def _pickle_roundtrip_name(self, obj): - - with tm.ensure_clean() as path: - obj.to_pickle(path) - unpickled = pd.read_pickle(path) - return unpickled - - def test_to_frame_expanddim(self): - # GH 9762 - - class SubclassedSeries(Series): - @property - def _constructor_expanddim(self): - return SubclassedFrame - - class SubclassedFrame(DataFrame): - pass - - s = SubclassedSeries([1, 2, 3], name="X") - result = s.to_frame() - assert isinstance(result, SubclassedFrame) - expected = SubclassedFrame({"X": [1, 2, 3]}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_frame.py b/pandas/tests/series/methods/test_to_frame.py new file mode 100644 index 0000000000000..b324fab5d97d4 --- /dev/null +++ b/pandas/tests/series/methods/test_to_frame.py @@ -0,0 +1,40 @@ +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestToFrame: + def test_to_frame(self, datetime_series): + datetime_series.name = None + rs = datetime_series.to_frame() + xp = DataFrame(datetime_series.values, index=datetime_series.index) + tm.assert_frame_equal(rs, xp) + + datetime_series.name = "testname" + rs = datetime_series.to_frame() + xp = DataFrame( + dict(testname=datetime_series.values), index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + rs = datetime_series.to_frame(name="testdifferent") + xp = DataFrame( + dict(testdifferent=datetime_series.values), index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + def test_to_frame_expanddim(self): + # GH#9762 + + class SubclassedSeries(Series): + @property + def _constructor_expanddim(self): + return SubclassedFrame + + class SubclassedFrame(DataFrame): + pass + + ser = SubclassedSeries([1, 2, 3], name="X") + result = ser.to_frame() + assert isinstance(result, SubclassedFrame) + expected = SubclassedFrame({"X": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 858b1d6b4df8c..b03f516eeffc5 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -67,14 +67,14 @@ def test_truncate(self, datetime_series): def test_truncate_nonsortedindex(self): # GH#17935 - s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) + s = Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): s.truncate(before=3, after=9) rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") - ts = pd.Series(np.random.randn(len(rng)), index=rng) + ts = Series(np.random.randn(len(rng)), index=rng) msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): @@ -92,7 +92,7 @@ def test_truncate_decreasing_index(self, before, after, indices, klass): before = pd.Timestamp(before) if before is not None else None after = pd.Timestamp(after) if after is not None else None indices = [pd.Timestamp(i) for i in indices] - values = pd.Series(range(len(idx)), index=idx) + values = Series(range(len(idx)), index=idx) result = values.truncate(before=before, 
after=after) expected = values.loc[indices] tm.assert_series_equal(result, expected) @@ -116,27 +116,27 @@ def test_truncate_periodindex(self): idx1 = pd.PeriodIndex( [pd.Period("2017-09-02"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] ) - series1 = pd.Series([1, 2, 3], index=idx1) + series1 = Series([1, 2, 3], index=idx1) result1 = series1.truncate(after="2017-09-02") expected_idx1 = pd.PeriodIndex( [pd.Period("2017-09-02"), pd.Period("2017-09-02")] ) - tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1)) + tm.assert_series_equal(result1, Series([1, 2], index=expected_idx1)) idx2 = pd.PeriodIndex( [pd.Period("2017-09-03"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] ) - series2 = pd.Series([1, 2, 3], index=idx2) + series2 = Series([1, 2, 3], index=idx2) result2 = series2.sort_index().truncate(after="2017-09-02") expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) - tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2)) + tm.assert_series_equal(result2, Series([2], index=expected_idx2)) def test_truncate_multiindex(self): # GH 34564 mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) - s1 = pd.Series(range(mi.shape[0]), index=mi, name="col") + s1 = Series(range(mi.shape[0]), index=mi, name="col") result = s1.truncate(before=2, after=3) df = pd.DataFrame.from_dict( @@ -150,7 +150,7 @@ def test_truncate_multiindex(self): def test_truncate_one_element_series(self): # GH 35544 - series = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) + series = Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) before = pd.Timestamp("2020-08-02") after = pd.Timestamp("2020-08-04") diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index d651315d64561..38955ea7f06c4 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -41,7 +41,7 @@ def test_unstack(): # GH5873 idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) - ts = pd.Series([1, 2], index=idx) + ts = Series([1, 2], index=idx) left = ts.unstack() right = DataFrame( [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] @@ -55,7 +55,7 @@ def test_unstack(): [1, 2, 1, 1, np.nan], ] ) - ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) + ts = Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) right = DataFrame( [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], columns=["cat", "dog"], @@ -70,10 +70,10 @@ def test_unstack_tuplename_in_multiindex(): idx = pd.MultiIndex.from_product( [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")] ) - ser = pd.Series(1, index=idx) + ser = Series(1, index=idx) result = ser.unstack(("A", "a")) - expected = pd.DataFrame( + expected = DataFrame( [[1, 1, 1], [1, 1, 1], [1, 1, 1]], columns=pd.MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]), index=pd.Index([1, 2, 3], name=("B", "b")), @@ -109,10 +109,10 @@ def test_unstack_mixed_type_name_in_multiindex( idx = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"] ) - ser = pd.Series(1, index=idx) + ser = Series(1, index=idx) result = ser.unstack(unstack_idx) - expected = pd.DataFrame( + expected = DataFrame( expected_values, columns=expected_columns, index=expected_index ) tm.assert_frame_equal(result, expected) @@ -121,7 +121,7 @@ def test_unstack_mixed_type_name_in_multiindex( def test_unstack_multi_index_categorical_values(): mi = tm.makeTimeDataFrame().stack().index.rename(["major", "minor"]) - 
ser = pd.Series(["foo"] * len(mi), index=mi, name="category", dtype="category") + ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") result = ser.unstack() diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f97362ce9c2a9..37da31fb2329a 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -21,16 +21,16 @@ def test_value_counts_datetime(self): exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="xxx") - ser = pd.Series(values, name="xxx") + ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) # check DatetimeIndex outputs the same result idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -48,14 +48,14 @@ def test_value_counts_datetime_tz(self): ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="xxx") - ser = pd.Series(values, name="xxx") + ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -70,16 +70,16 @@ def test_value_counts_period(self): ] exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="xxx") - ser = pd.Series(values, name="xxx") + ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) # check DatetimeIndex outputs the same result idx = pd.PeriodIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -88,16 +88,16 @@ def test_value_counts_categorical_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="xxx") - ser = pd.Series(values, name="xxx") + ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) # check CategoricalIndex outputs the same result idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") 
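+ # normalize=True returns relative frequencies: each count divided by the 6 observations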
tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -105,16 +105,16 @@ def test_value_counts_categorical_not_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = Series([3, 2, 1], index=exp_idx, name="xxx") - ser = pd.Series(values, name="xxx") + ser = Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) # check CategoricalIndex outputs the same result idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -183,19 +183,19 @@ def test_value_counts_categorical_with_nan(self): "ser, dropna, exp", [ ( - pd.Series([False, True, True, pd.NA]), + Series([False, True, True, pd.NA]), False, - pd.Series([2, 1, 1], index=[True, False, pd.NA]), + Series([2, 1, 1], index=[True, False, pd.NA]), ), ( - pd.Series([False, True, True, pd.NA]), + Series([False, True, True, pd.NA]), True, - pd.Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False]), ), ( - pd.Series(range(3), index=[True, False, np.nan]).index, + Series(range(3), index=[True, False, np.nan]).index, False, - pd.Series([1, 1, 1], index=[True, False, pd.NA]), + Series([1, 1, 1], index=[True, False, pd.NA]), ), ], ) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1a469d3e3d88b..ebb75adde5b13 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1,108 +1,10 @@ -import operator - import numpy as np -import pytest import pandas as pd -from pandas import DataFrame, Series -import pandas._testing as tm +from pandas import Series class TestSeriesAnalytics: - def test_matmul(self): - # matmul test is for GH #10259 - a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) - b = DataFrame( - np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] - ).T - - # Series @ DataFrame -> Series - result = operator.matmul(a, b) - expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - # DataFrame @ Series -> Series - result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - # Series @ Series -> scalar - result = operator.matmul(a, a) - expected = np.dot(a.values, a.values) - tm.assert_almost_equal(result, expected) - - # GH 21530 - # vector (1D np.array) @ Series (__rmatmul__) - result = operator.matmul(a.values, a) - expected = np.dot(a.values, a.values) - tm.assert_almost_equal(result, expected) - - # GH 21530 - # vector (1D list) @ Series (__rmatmul__) - result = operator.matmul(a.values.tolist(), a) - expected = np.dot(a.values, a.values) - tm.assert_almost_equal(result, expected) - - # GH 21530 - # matrix (2D np.array) @ Series (__rmatmul__) - result = operator.matmul(b.T.values, a) - expected = np.dot(b.T.values, a.values) - tm.assert_almost_equal(result, expected) - - # GH 21530 - # matrix (2D nested lists) @ Series (__rmatmul__) - result = operator.matmul(b.T.values.tolist(), a) - expected = 
np.dot(b.T.values, a.values) - tm.assert_almost_equal(result, expected) - - # mixed dtype DataFrame @ Series - a["p"] = int(a.p) - result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - # different dtypes DataFrame @ Series - a = a.astype(int) - result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" - # exception raised is of type Exception - with pytest.raises(Exception, match=msg): - a.dot(a.values[:3]) - msg = "matrices are not aligned" - with pytest.raises(ValueError, match=msg): - a.dot(b.T) - - def test_ptp(self): - # GH21614 - N = 1000 - arr = np.random.randn(N) - ser = Series(arr) - assert np.ptp(ser) == np.ptp(arr) - - def test_repeat(self): - s = Series(np.random.randn(3), index=["a", "b", "c"]) - - reps = s.repeat(5) - exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) - tm.assert_series_equal(reps, exp) - - to_rep = [2, 3, 4] - reps = s.repeat(to_rep) - exp = Series(s.values.repeat(to_rep), index=s.index.values.repeat(to_rep)) - tm.assert_series_equal(reps, exp) - - def test_numpy_repeat(self): - s = Series(np.arange(3), name="x") - expected = Series(s.values.repeat(2), name="x", index=s.index.values.repeat(2)) - tm.assert_series_equal(np.repeat(s, 2), expected) - - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.repeat(s, 2, axis=0) - def test_is_monotonic(self): s = Series(np.random.randint(0, 10, size=1000)) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d92edb6fe149a..c948af41c5e8f 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import pydoc import warnings @@ -25,67 +24,12 @@ import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.io.formats.printing as printing - class TestSeriesMisc: - def test_scalarop_preserve_name(self, datetime_series): - result = datetime_series * 2 - assert result.name == datetime_series.name - - def test_copy_name(self, datetime_series): - result = datetime_series.copy() - assert result.name == datetime_series.name - - def test_copy_index_name_checking(self, datetime_series): - # don't want to be able to modify the index stored elsewhere after - # making a copy - - datetime_series.index.name = None - assert datetime_series.index.name is None - assert datetime_series is datetime_series - - cp = datetime_series.copy() - cp.index.name = "foo" - printing.pprint_thing(datetime_series.index.name) - assert datetime_series.index.name is None - def test_append_preserve_name(self, datetime_series): result = datetime_series[:5].append(datetime_series[5:]) assert result.name == datetime_series.name - def test_binop_maybe_preserve_name(self, datetime_series): - # names match, preserve - result = datetime_series * datetime_series - assert result.name == datetime_series.name - result = datetime_series.mul(datetime_series) - assert result.name == datetime_series.name - - result = datetime_series * datetime_series[:-2] - assert result.name == datetime_series.name - - # names don't match, don't preserve - cp = datetime_series.copy() - cp.name = "something else" - result = datetime_series + cp - assert result.name is None - result = datetime_series.add(cp) - assert result.name is None - - ops = ["add", 
"sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] - ops = ops + ["r" + op for op in ops] - for op in ops: - # names match, preserve - s = datetime_series.copy() - result = getattr(s, op)(s) - assert result.name == datetime_series.name - - # names don't match, don't preserve - cp = datetime_series.copy() - cp.name = "changed" - result = getattr(s, op)(cp) - assert result.name is None - def test_getitem_preserve_name(self, datetime_series): result = datetime_series[datetime_series > 0] assert result.name == datetime_series.name @@ -111,75 +55,8 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def test_constructor_dict(self): - d = {"a": 0.0, "b": 1.0, "c": 2.0} - result = Series(d) - expected = Series(d, index=sorted(d.keys())) - tm.assert_series_equal(result, expected) - - result = Series(d, index=["b", "c", "d", "a"]) - expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) - tm.assert_series_equal(result, expected) - - def test_constructor_subclass_dict(self, dict_subclass): - data = dict_subclass((x, 10.0 * x) for x in range(10)) - series = Series(data) - expected = Series(dict(data.items())) - tm.assert_series_equal(series, expected) - - def test_constructor_ordereddict(self): - # GH3283 - data = OrderedDict((f"col{i}", np.random.random()) for i in range(12)) - - series = Series(data) - expected = Series(list(data.values()), list(data.keys())) - tm.assert_series_equal(series, expected) - - # Test with subclass - class A(OrderedDict): - pass - - series = Series(A(data)) - tm.assert_series_equal(series, expected) - - def test_constructor_dict_multiindex(self): - d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} - _d = sorted(d.items()) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) - ) - tm.assert_series_equal(result, expected) - - d["z"] = 111.0 - _d.insert(0, ("z", d["z"])) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False) - ) - result = result.reindex(index=expected.index) - tm.assert_series_equal(result, expected) - - def test_constructor_dict_timedelta_index(self): - # GH #12169 : Resample category data with timedelta index - # construct Series from dict as data and TimedeltaIndex as index - # will result NaN in result Series data - expected = Series( - data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") - ) - - result = Series( - data={ - pd.to_timedelta(0, unit="s"): "A", - pd.to_timedelta(10, unit="s"): "B", - pd.to_timedelta(20, unit="s"): "C", - }, - index=pd.to_timedelta([0, 10, 20], unit="s"), - ) - tm.assert_series_equal(result, expected) - def test_sparse_accessor_updates_on_inplace(self): - s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") + s = Series([1, 1, 2, 3], dtype="Sparse[int]") return_value = s.drop([0, 1], inplace=True) assert return_value is None assert s.sparse.density == 1.0 @@ -257,7 +134,7 @@ def get_dir(s): ) def test_index_tab_completion(self, index): # dir contains string-like values of the Index. 
- s = pd.Series(index=index, dtype=object) + s = Series(index=index, dtype=object) dir_s = dir(s) for i, x in enumerate(s.index.unique(level=0)): if i < 100: @@ -324,55 +201,6 @@ def test_raise_on_info(self): with pytest.raises(AttributeError, match=msg): s.info() - def test_copy(self): - - for deep in [None, False, True]: - s = Series(np.arange(10), dtype="float64") - - # default deep is True - if deep is None: - s2 = s.copy() - else: - s2 = s.copy(deep=deep) - - s2[::2] = np.NaN - - if deep is None or deep is True: - # Did not modify original Series - assert np.isnan(s2[0]) - assert not np.isnan(s[0]) - else: - # we DID modify the original Series - assert np.isnan(s2[0]) - assert np.isnan(s[0]) - - def test_copy_tzaware(self): - # GH#11794 - # copy of tz-aware - expected = Series([Timestamp("2012/01/01", tz="UTC")]) - expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) - - for deep in [None, False, True]: - - s = Series([Timestamp("2012/01/01", tz="UTC")]) - - if deep is None: - s2 = s.copy() - else: - s2 = s.copy(deep=deep) - - s2[0] = pd.Timestamp("1999/01/01", tz="UTC") - - # default deep is True - if deep is None or deep is True: - # Did not modify original Series - tm.assert_series_equal(s2, expected2) - tm.assert_series_equal(s, expected) - else: - # we DID modify the original Series - tm.assert_series_equal(s2, expected2) - tm.assert_series_equal(s, expected2) - def test_axis_alias(self): s = Series([1, 2, np.nan]) tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) @@ -460,7 +288,7 @@ def f(x): tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) def test_str_accessor_updates_on_inplace(self): - s = pd.Series(list("abc")) + s = Series(list("abc")) return_value = s.drop([0], inplace=True) assert return_value is None assert len(s.str.lower()) == 2 @@ -479,11 +307,11 @@ def test_str_attribute(self): s.str.repeat(2) def test_empty_method(self): - s_empty = pd.Series(dtype=object) + s_empty = Series(dtype=object) assert s_empty.empty - s2 = pd.Series(index=[1], dtype=object) - for full_series in [pd.Series([1]), s2]: + s2 = Series(index=[1], dtype=object) + for full_series in [Series([1]), s2]: assert not full_series.empty @async_mark() @@ -493,7 +321,7 @@ async def test_tab_complete_warning(self, ip): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter - code = "import pandas as pd; s = pd.Series(dtype=object)" + code = "import pandas as pd; s = Series(dtype=object)" await ip.run_code(code) # TODO: remove it when Ipython updates @@ -518,7 +346,7 @@ def test_integer_series_size(self): assert s.size == 9 def test_attrs(self): - s = pd.Series([0, 1], name="abc") + s = Series([0, 1], name="abc") assert s.attrs == {} s.attrs["version"] = 1 result = s + 1 @@ -526,7 +354,7 @@ def test_attrs(self): @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags(self, allows_duplicate_labels): - df = pd.Series([1, 2]) + df = Series([1, 2]) result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) if allows_duplicate_labels is None: # We don't update when it's not provided diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 09181201beee4..ae6a4df8ba699 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -191,7 +191,7 @@ def test_add_with_duplicate_index(self): s1 = Series([1, 2], index=[1, 1]) s2 = Series([10, 10], index=[1, 2]) result = s1 + s2 - expected = 
pd.Series([11, 12, np.nan], index=[1, 1, 2]) + expected = Series([11, 12, np.nan], index=[1, 1, 2]) tm.assert_series_equal(result, expected) def test_add_na_handling(self): @@ -258,8 +258,8 @@ def test_alignment_doesnt_change_tz(self): # GH#33671 dti = pd.date_range("2016-01-01", periods=10, tz="CET") dti_utc = dti.tz_convert("UTC") - ser = pd.Series(10, index=dti) - ser_utc = pd.Series(10, index=dti_utc) + ser = Series(10, index=dti) + ser_utc = Series(10, index=dti_utc) # we don't care about the result, just that original indexes are unchanged ser * ser_utc @@ -276,16 +276,16 @@ class TestSeriesFlexComparison: @pytest.mark.parametrize("axis", [0, None, "index"]) def test_comparison_flex_basic(self, axis, all_compare_operators): op = all_compare_operators.strip("__") - left = pd.Series(np.random.randn(10)) - right = pd.Series(np.random.randn(10)) + left = Series(np.random.randn(10)) + right = Series(np.random.randn(10)) result = getattr(left, op)(right, axis=axis) expected = getattr(operator, op)(left, right) tm.assert_series_equal(result, expected) def test_comparison_bad_axis(self, all_compare_operators): op = all_compare_operators.strip("__") - left = pd.Series(np.random.randn(10)) - right = pd.Series(np.random.randn(10)) + left = Series(np.random.randn(10)) + right = Series(np.random.randn(10)) msg = "No axis named 1 for object type" with pytest.raises(ValueError, match=msg): @@ -306,7 +306,7 @@ def test_comparison_flex_alignment(self, values, op): left = Series([1, 3, 2], index=list("abc")) right = Series([2, 2, 2], index=list("bcd")) result = getattr(left, op)(right) - expected = pd.Series(values, index=list("abcd")) + expected = Series(values, index=list("abcd")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -324,7 +324,7 @@ def test_comparison_flex_alignment_fill(self, values, op, fill_value): left = Series([1, 3, 2], index=list("abc")) right = Series([2, 2, 2], index=list("bcd")) result = getattr(left, op)(right, fill_value=fill_value) - expected = pd.Series(values, index=list("abcd")) + expected = Series(values, index=list("abcd")) tm.assert_series_equal(result, expected) @@ -585,12 +585,12 @@ def test_ne(self): "left, right", [ ( - pd.Series([1, 2, 3], index=list("ABC"), name="x"), - pd.Series([2, 2, 2], index=list("ABD"), name="x"), + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2], index=list("ABD"), name="x"), ), ( - pd.Series([1, 2, 3], index=list("ABC"), name="x"), - pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x"), + Series([1, 2, 3], index=list("ABC"), name="x"), + Series([2, 2, 2, 2], index=list("ABCD"), name="x"), ), ], ) @@ -694,10 +694,10 @@ def test_series_add_aware_naive_raises(self): def test_datetime_understood(self): # Ensures it doesn't fail to create the right series # reported in issue#16726 - series = pd.Series(pd.date_range("2012-01-01", periods=3)) + series = Series(pd.date_range("2012-01-01", periods=3)) offset = pd.offsets.DateOffset(days=6) result = series - offset - expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) + expected = Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) @@ -718,8 +718,8 @@ def test_series_ops_name_retention(flex, box, names, all_binary_operators): if op is ops.rfloordiv and box in [list, tuple]: pytest.xfail("op fails because of inconsistent ndarray-wrapping GH#28759") - left = pd.Series(range(10), name=names[0]) - right = pd.Series(range(10), name=names[1]) + left = Series(range(10), 
name=names[0]) + right = Series(range(10), name=names[1]) right = box(right) if flex: @@ -741,3 +741,41 @@ def test_series_ops_name_retention(flex, box, names, all_binary_operators): assert result.name == names[2] else: assert result.name == names[0] + + +class TestNamePreservation: + def test_binop_maybe_preserve_name(self, datetime_series): + # names match, preserve + result = datetime_series * datetime_series + assert result.name == datetime_series.name + result = datetime_series.mul(datetime_series) + assert result.name == datetime_series.name + + result = datetime_series * datetime_series[:-2] + assert result.name == datetime_series.name + + # names don't match, don't preserve + cp = datetime_series.copy() + cp.name = "something else" + result = datetime_series + cp + assert result.name is None + result = datetime_series.add(cp) + assert result.name is None + + ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] + ops = ops + ["r" + op for op in ops] + for op in ops: + # names match, preserve + ser = datetime_series.copy() + result = getattr(ser, op)(ser) + assert result.name == datetime_series.name + + # names don't match, don't preserve + cp = datetime_series.copy() + cp.name = "changed" + result = getattr(ser, op)(cp) + assert result.name is None + + def test_scalarop_preserve_name(self, datetime_series): + result = datetime_series * 2 + assert result.name == datetime_series.name diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py deleted file mode 100644 index d0dfbe6f5b569..0000000000000 --- a/pandas/tests/series/test_block_internals.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd - -# Segregated collection of methods that require the BlockManager internal data -# structure - - -class TestSeriesBlockInternals: - def test_setitem_invalidates_datetime_index_freq(self): - # GH#24096 altering a datetime64tz Series inplace invalidates the - # `freq` attribute on the underlying DatetimeIndex - - dti = pd.date_range("20130101", periods=3, tz="US/Eastern") - ts = dti[1] - ser = pd.Series(dti) - assert ser._values is not dti - assert ser._values._data.base is not dti._data._data.base - assert dti.freq == "D" - ser.iloc[1] = pd.NaT - assert ser._values.freq is None - - # check that the DatetimeIndex was not altered in place - assert ser._values is not dti - assert ser._values._data.base is not dti._data._data.base - assert dti[1] == ts - assert dti.freq == "D" - - def test_dt64tz_setitem_does_not_mutate_dti(self): - # GH#21907, GH#24096 - dti = pd.date_range("2016-01-01", periods=10, tz="US/Pacific") - ts = dti[0] - ser = pd.Series(dti) - assert ser._values is not dti - assert ser._values._data.base is not dti._data._data.base - assert ser._mgr.blocks[0].values is not dti - assert ser._mgr.blocks[0].values._data.base is not dti._data._data.base - - ser[::3] = pd.NaT - assert ser[0] is pd.NaT - assert dti[0] == ts diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py deleted file mode 100644 index 95eba6ccc4df8..0000000000000 --- a/pandas/tests/series/test_combine_concat.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import Series - - -class TestSeriesConcat: - @pytest.mark.parametrize( - "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] - ) - def test_concat_empty_series_dtypes_match_roundtrips(self, dtype): - dtype = np.dtype(dtype) - - result = pd.concat([Series(dtype=dtype)]) - assert 
result.dtype == dtype - - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype)]) - assert result.dtype == dtype - - def test_concat_empty_series_dtypes_roundtrips(self): - - # round-tripping with self & like self - dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) - - def int_result_type(dtype, dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"i", "u", "b"}) and ( - dtype.kind == "i" or dtype2.kind == "i" - ): - return "i" - elif not len(typs - {"u", "b"}) and ( - dtype.kind == "u" or dtype2.kind == "u" - ): - return "u" - return None - - def float_result_type(dtype, dtype2): - typs = {dtype.kind, dtype2.kind} - if not len(typs - {"f", "i", "u"}) and ( - dtype.kind == "f" or dtype2.kind == "f" - ): - return "f" - return None - - def get_result_type(dtype, dtype2): - result = float_result_type(dtype, dtype2) - if result is not None: - return result - result = int_result_type(dtype, dtype2) - if result is not None: - return result - return "O" - - for dtype in dtypes: - for dtype2 in dtypes: - if dtype == dtype2: - continue - - expected = get_result_type(dtype, dtype2) - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype - assert result.kind == expected - - @pytest.mark.parametrize( - "left,right,expected", - [ - # booleans - (np.bool_, np.int32, np.int32), - (np.bool_, np.float32, np.object_), - # datetime-like - ("m8[ns]", np.bool_, np.object_), - ("m8[ns]", np.int64, np.object_), - ("M8[ns]", np.bool_, np.object_), - ("M8[ns]", np.int64, np.object_), - # categorical - ("category", "category", "category"), - ("category", "object", "object"), - ], - ) - def test_concat_empty_series_dtypes(self, left, right, expected): - result = pd.concat([Series(dtype=left), Series(dtype=right)]) - assert result.dtype == expected - - def test_concat_empty_series_dtypes_triple(self): - - assert ( - pd.concat( - [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] - ).dtype - == np.object_ - ) - - def test_concat_empty_series_dtype_category_with_array(self): - # GH 18515 - assert ( - pd.concat( - [Series(np.array([]), dtype="category"), Series(dtype="float64")] - ).dtype - == "float64" - ) - - def test_concat_empty_series_dtypes_sparse(self): - result = pd.concat( - [ - Series(dtype="float64").astype("Sparse"), - Series(dtype="float64").astype("Sparse"), - ] - ) - assert result.dtype == "Sparse[float64]" - - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] - ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype(np.float64) - assert result.dtype == expected - - result = pd.concat( - [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] - ) - # TODO: release-note: concat sparse dtype - expected = pd.SparseDtype("object") - assert result.dtype == expected diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a950ca78fc742..a2fab5c7e0f0e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -19,6 +19,7 @@ from pandas import ( Categorical, DataFrame, + DatetimeIndex, Index, Interval, IntervalIndex, @@ -34,6 +35,7 @@ ) import pandas._testing as tm from pandas.core.arrays import IntervalArray, period_array +from pandas.core.internals.blocks import IntBlock class TestSeriesConstructors: @@ -176,27 +178,27 @@ def test_constructor_nan(self, input_arg): "dtype", ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"], ) - 
@pytest.mark.parametrize("index", [None, pd.Index([])]) + @pytest.mark.parametrize("index", [None, Index([])]) def test_constructor_dtype_only(self, dtype, index): # GH-20865 - result = pd.Series(dtype=dtype, index=index) + result = Series(dtype=dtype, index=index) assert result.dtype == dtype assert len(result) == 0 def test_constructor_no_data_index_order(self): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - result = pd.Series(index=["b", "a", "c"]) + result = Series(index=["b", "a", "c"]) assert result.index.tolist() == ["b", "a", "c"] def test_constructor_no_data_string_type(self): # GH 22477 - result = pd.Series(index=[1], dtype=str) + result = Series(index=[1], dtype=str) assert np.isnan(result.iloc[0]) @pytest.mark.parametrize("item", ["entry", "ѐ", 13]) def test_constructor_string_element_string_type(self, item): # GH 22477 - result = pd.Series(item, index=[1], dtype=str) + result = Series(item, index=[1], dtype=str) assert result.iloc[0] == str(item) def test_constructor_dtype_str_na_values(self, string_dtype): @@ -313,7 +315,7 @@ def test_constructor_categorical(self): # can cast to a new dtype result = Series(pd.Categorical([1, 2, 3]), dtype="int64") - expected = pd.Series([1, 2, 3], dtype="int64") + expected = Series([1, 2, 3], dtype="int64") tm.assert_series_equal(result, expected) # GH12574 @@ -379,16 +381,16 @@ def test_constructor_categorical_with_coercion(self): assert result == expected def test_constructor_categorical_dtype(self): - result = pd.Series( + result = Series( ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True) ) assert is_categorical_dtype(result.dtype) is True - tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) + tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) assert result.cat.ordered - result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) + result = Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) assert is_categorical_dtype(result.dtype) - tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) + tm.assert_index_equal(result.cat.categories, Index(["b", "a"])) assert result.cat.ordered is False # GH 19565 - Check broadcasting of scalar with Categorical dtype @@ -449,8 +451,8 @@ def test_categorical_sideeffects_free(self): tm.assert_numpy_array_equal(cat.__array__(), exp_s2) def test_unordered_compare_equal(self): - left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) - right = pd.Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) + left = Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) + right = Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) tm.assert_series_equal(left, right) def test_constructor_maskedarray(self): @@ -533,8 +535,8 @@ def test_constructor_maskedarray(self): def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 data = ma.masked_all((3,), dtype=float).harden_mask() - result = pd.Series(data) - expected = pd.Series([np.nan, np.nan, np.nan]) + result = Series(data) + expected = Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): @@ -546,7 +548,7 @@ def test_series_ctor_plus_datetimeindex(self): def test_constructor_default_index(self): s = Series([0, 1, 2]) - tm.assert_index_equal(s.index, pd.Index(np.arange(3))) + tm.assert_index_equal(s.index, Index(np.arange(3))) @pytest.mark.parametrize( "input", @@ -601,7 +603,7 @@ def 
test_constructor_copy(self): # test dtype parameter has no side effects on copy=True for data in [[1.0], np.array([1.0])]: x = Series(data) - y = pd.Series(x, copy=True, dtype=float) + y = Series(x, copy=True, dtype=float) # copy=True maintains original data in Series tm.assert_series_equal(x, y) @@ -619,7 +621,7 @@ def test_constructor_copy(self): pd.date_range("20170101", periods=3), pd.timedelta_range("1 day", periods=3), pd.period_range("2012Q1", periods=3, freq="Q"), - pd.Index(list("abc")), + Index(list("abc")), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3), ], @@ -628,7 +630,7 @@ def test_constructor_copy(self): def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input - s = pd.Series(index) + s = Series(index) # we make 1 copy; this is just a smoke test here assert s._mgr.blocks[0].values is not index @@ -688,6 +690,13 @@ def test_constructor_coerce_float_valid(self, float_dtype): expected = Series([1, 2, 3.5]).astype(float_dtype) tm.assert_series_equal(s, expected) + def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_dtype): + # GH 22585 + + msg = "cannot convert float NaN to integer" + with pytest.raises(ValueError, match=msg): + Series([1, 2, np.nan], dtype=any_int_dtype) + def test_constructor_dtype_no_cast(self): # see gh-1572 s = Series([1, 2, 3]) @@ -712,7 +721,7 @@ def test_constructor_datelike_coercion(self): wing1 = "2T15 4H19".split() wing2 = "416 4T20".split() mat = pd.to_datetime("2016-01-22 2019-09-07".split()) - df = pd.DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly) + df = DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly) result = df.loc["3T19"] assert result.dtype == object @@ -995,8 +1004,8 @@ def test_construction_interval(self, interval_constructor): def test_constructor_infer_interval(self, data_constructor): # GH 23563: consistent closed results in interval dtype data = [pd.Interval(0, 1), pd.Interval(0, 2), None] - result = pd.Series(data_constructor(data)) - expected = pd.Series(IntervalArray(data)) + result = Series(data_constructor(data)) + expected = Series(IntervalArray(data)) assert result.dtype == "interval[float64]" tm.assert_series_equal(result, expected) @@ -1030,14 +1039,14 @@ def test_construction_consistency(self): ) def test_constructor_infer_period(self, data_constructor): data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] - result = pd.Series(data_constructor(data)) - expected = pd.Series(period_array(data)) + result = Series(data_constructor(data)) + expected = Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] - result = pd.Series(data) + result = Series(data) assert result.dtype == object assert result.tolist() == data @@ -1053,6 +1062,11 @@ def test_constructor_periodindex(self): def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} + + result = Series(d) + expected = Series(d, index=sorted(d.keys())) + tm.assert_series_equal(result, expected) + result = Series(d, index=["b", "c", "d", "a"]) expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) tm.assert_series_equal(result, expected) @@ -1406,7 +1420,7 @@ def test_constructor_cast_object(self, index): exp = Series(index).astype(object) tm.assert_series_equal(s, exp) - s = Series(pd.Index(index, dtype=object), dtype=object) + s = Series(Index(index, dtype=object), dtype=object) exp = 
Series(index).astype(object) tm.assert_series_equal(s, exp) @@ -1473,7 +1487,7 @@ def test_constructor_datetime64(self): def test_constructor_datetimelike_scalar_to_string_dtype(self): # https://github.com/pandas-dev/pandas/pull/33846 result = Series("M", index=[1, 2, 3], dtype="string") - expected = pd.Series(["M", "M", "M"], index=[1, 2, 3], dtype="string") + expected = Series(["M", "M", "M"], index=[1, 2, 3], dtype="string") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1486,9 +1500,9 @@ def test_constructor_datetimelike_scalar_to_string_dtype(self): def test_constructor_sparse_datetime64(self, values): # https://github.com/pandas-dev/pandas/issues/35762 dtype = pd.SparseDtype("datetime64[ns]") - result = pd.Series(values, dtype=dtype) + result = Series(values, dtype=dtype) arr = pd.arrays.SparseArray(values, dtype=dtype) - expected = pd.Series(arr) + expected = Series(arr) tm.assert_series_equal(result, expected) def test_construction_from_ordered_collection(self): @@ -1507,3 +1521,113 @@ def test_construction_from_large_int_scalar_no_overflow(self): result = Series(n, index=[0]) expected = Series(n) tm.assert_series_equal(result, expected) + + def test_constructor_list_of_periods_infers_period_dtype(self): + series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) + assert series.dtype == "Period[D]" + + series = Series( + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")] + ) + assert series.dtype == "Period[D]" + + def test_constructor_subclass_dict(self, dict_subclass): + data = dict_subclass((x, 10.0 * x) for x in range(10)) + series = Series(data) + expected = Series(dict(data.items())) + tm.assert_series_equal(series, expected) + + def test_constructor_ordereddict(self): + # GH3283 + data = OrderedDict((f"col{i}", np.random.random()) for i in range(12)) + + series = Series(data) + expected = Series(list(data.values()), list(data.keys())) + tm.assert_series_equal(series, expected) + + # Test with subclass + class A(OrderedDict): + pass + + series = Series(A(data)) + tm.assert_series_equal(series, expected) + + def test_constructor_dict_multiindex(self): + d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} + _d = sorted(d.items()) + result = Series(d) + expected = Series( + [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) + ) + tm.assert_series_equal(result, expected) + + d["z"] = 111.0 + _d.insert(0, ("z", d["z"])) + result = Series(d) + expected = Series( + [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) + ) + result = result.reindex(index=expected.index) + tm.assert_series_equal(result, expected) + + def test_constructor_dict_timedelta_index(self): + # GH #12169 : Resample category data with timedelta index + # construct Series from dict as data and TimedeltaIndex as index + # will result NaN in result Series data + expected = Series( + data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") + ) + + result = Series( + data={ + pd.to_timedelta(0, unit="s"): "A", + pd.to_timedelta(10, unit="s"): "B", + pd.to_timedelta(20, unit="s"): "C", + }, + index=pd.to_timedelta([0, 10, 20], unit="s"), + ) + tm.assert_series_equal(result, expected) + + +class TestSeriesConstructorIndexCoercion: + def test_series_constructor_datetimelike_index_coercion(self): + idx = tm.makeDateIndex(10000) + with tm.assert_produces_warning(FutureWarning): + ser = Series(np.random.randn(len(idx)), idx.astype(object)) + with tm.assert_produces_warning(FutureWarning): + 
assert ser.index.is_all_dates + assert isinstance(ser.index, DatetimeIndex) + + def test_series_constructor_infer_multiindex(self): + index_lists = [["a", "a", "b", "b"], ["x", "y", "x", "y"]] + + multi = Series(1.0, index=[np.array(x) for x in index_lists]) + assert isinstance(multi.index, MultiIndex) + + multi = Series(1.0, index=index_lists) + assert isinstance(multi.index, MultiIndex) + + multi = Series(range(4), index=index_lists) + assert isinstance(multi.index, MultiIndex) + + +class TestSeriesConstructorInternals: + def test_constructor_no_pandas_array(self): + ser = Series([1, 2, 3]) + result = Series(ser.array) + tm.assert_series_equal(ser, result) + assert isinstance(result._mgr.blocks[0], IntBlock) + + def test_from_array(self): + result = Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) + assert result._mgr.blocks[0].is_extension is False + + result = Series(pd.array(["2015"], dtype="datetime64[ns]")) + assert result._mgr.blocks[0].is_extension is False + + def test_from_list_dtype(self): + result = Series(["1H", "2H"], dtype="timedelta64[ns]") + assert result._mgr.blocks[0].is_extension is False + + result = Series(["2015"], dtype="datetime64[ns]") + assert result._mgr.blocks[0].is_extension is False diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_dt_accessor.py similarity index 93% rename from pandas/tests/series/test_datetime_values.py rename to pandas/tests/series/test_dt_accessor.py index b0926089bd7b4..2f30c0621cc06 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_dt_accessor.py @@ -16,6 +16,7 @@ DataFrame, DatetimeIndex, Index, + Period, PeriodIndex, Series, TimedeltaIndex, @@ -192,7 +193,7 @@ def compare(s, name): exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx") tm.assert_series_equal(s.dt.second, exp) - exp = pd.Series([s[0]] * 3, index=index, name="xxx") + exp = Series([s[0]] * 3, index=index, name="xxx") tm.assert_series_equal(s.dt.normalize(), exp) # periodindex @@ -350,7 +351,7 @@ def test_dt_namespace_accessor_categorical(self): def test_dt_tz_localize_categorical(self, tz_aware_fixture): # GH 27952 tz = tz_aware_fixture - datetimes = pd.Series( + datetimes = Series( ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns]" ) categorical = datetimes.astype("category") @@ -361,7 +362,7 @@ def test_dt_tz_localize_categorical(self, tz_aware_fixture): def test_dt_tz_convert_categorical(self, tz_aware_fixture): # GH 27952 tz = tz_aware_fixture - datetimes = pd.Series( + datetimes = Series( ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns, MET]" ) categorical = datetimes.astype("category") @@ -372,7 +373,7 @@ def test_dt_tz_convert_categorical(self, tz_aware_fixture): @pytest.mark.parametrize("accessor", ["year", "month", "day"]) def test_dt_other_accessors_categorical(self, accessor): # GH 27952 - datetimes = pd.Series( + datetimes = Series( ["2018-01-01", "2018-01-01", "2019-01-02"], dtype="datetime64[ns]" ) categorical = datetimes.astype("category") @@ -655,26 +656,6 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): result = s.dt.timetz tm.assert_series_equal(result, expected) - def test_setitem_with_string_index(self): - # GH 23451 - x = pd.Series([1, 2, 3], index=["Date", "b", "other"]) - x["Date"] = date.today() - assert x.Date == date.today() - assert x["Date"] == date.today() - - def test_setitem_with_different_tz(self): - # GH#24024 - ser = pd.Series(pd.date_range("2000", periods=2, tz="US/Central")) - ser[0] = 
pd.Timestamp("2000", tz="US/Eastern") - expected = pd.Series( - [ - pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), - pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), - ], - dtype=object, - ) - tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize( "input_series, expected_output", [ @@ -688,16 +669,55 @@ def test_setitem_with_different_tz(self): ], ) def test_isocalendar(self, input_series, expected_output): - result = pd.to_datetime(pd.Series(input_series)).dt.isocalendar() + result = pd.to_datetime(Series(input_series)).dt.isocalendar() expected_frame = pd.DataFrame( expected_output, columns=["year", "week", "day"], dtype="UInt32" ) tm.assert_frame_equal(result, expected_frame) +class TestSeriesPeriodValuesDtAccessor: + @pytest.mark.parametrize( + "input_vals", + [ + [Period("2016-01", freq="M"), Period("2016-02", freq="M")], + [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], + [ + Period("2016-01-01 00:00:00", freq="H"), + Period("2016-01-01 01:00:00", freq="H"), + ], + [ + Period("2016-01-01 00:00:00", freq="M"), + Period("2016-01-01 00:01:00", freq="M"), + ], + [ + Period("2016-01-01 00:00:00", freq="S"), + Period("2016-01-01 00:00:01", freq="S"), + ], + ], + ) + def test_end_time_timevalues(self, input_vals): + # GH#17157 + # Check that the time part of the Period is adjusted by end_time + # when using the dt accessor on a Series + input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) + + s = Series(input_vals) + result = s.dt.end_time + expected = s.apply(lambda x: x.end_time) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) + def test_to_period(self, input_vals): + # GH#21205 + expected = Series([input_vals], dtype="Period[D]") + result = Series([input_vals], dtype="datetime64[ns]").dt.to_period("D") + tm.assert_series_equal(result, expected) + + def test_week_and_weekofyear_are_deprecated(): # GH#33595 Deprecate week and weekofyear - series = pd.to_datetime(pd.Series(["2020-01-01"])) + series = pd.to_datetime(Series(["2020-01-01"])) with tm.assert_produces_warning(FutureWarning): series.dt.week with tm.assert_produces_warning(FutureWarning): @@ -706,7 +726,7 @@ def test_week_and_weekofyear_are_deprecated(): def test_normalize_pre_epoch_dates(): # GH: 36294 - s = pd.to_datetime(pd.Series(["1969-01-01 09:00:00", "2016-01-01 09:00:00"])) + s = pd.to_datetime(Series(["1969-01-01 09:00:00", "2016-01-01 09:00:00"])) result = s.dt.normalize() - expected = pd.to_datetime(pd.Series(["1969-01-01", "2016-01-01"])) + expected = pd.to_datetime(Series(["1969-01-01", "2016-01-01"])) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index ae89e16ca7667..b85a53960b0f6 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -1,123 +1,21 @@ -from datetime import datetime, timedelta -from importlib import reload import string -import sys import numpy as np import pytest -from pandas._libs.tslibs import iNaT - from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Index, - Series, - Timedelta, - Timestamp, - date_range, -) +from pandas import Categorical, DataFrame, Series, date_range import pandas._testing as tm class TestSeriesDtypes: - def test_dt64_series_astype_object(self): - dt64ser = Series(date_range("20130101", periods=3)) - result = dt64ser.astype(object) - assert isinstance(result.iloc[0], 
datetime) - assert result.dtype == np.object_ - - def test_td64_series_astype_object(self): - tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") - result = tdser.astype(object) - assert isinstance(result.iloc[0], timedelta) - assert result.dtype == np.object_ - - @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) - def test_astype(self, dtype): - s = Series(np.random.randn(5), name="foo") - as_typed = s.astype(dtype) - - assert as_typed.dtype == dtype - assert as_typed.name == s.name - def test_dtype(self, datetime_series): assert datetime_series.dtype == np.dtype("float64") assert datetime_series.dtypes == np.dtype("float64") - @pytest.mark.parametrize("value", [np.nan, np.inf]) - @pytest.mark.parametrize("dtype", [np.int32, np.int64]) - def test_astype_cast_nan_inf_int(self, dtype, value): - # gh-14265: check NaN and inf raise error when converting to int - msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" - s = Series([value]) - - with pytest.raises(ValueError, match=msg): - s.astype(dtype) - - @pytest.mark.parametrize("dtype", [int, np.int8, np.int64]) - def test_astype_cast_object_int_fail(self, dtype): - arr = Series(["car", "house", "tree", "1"]) - msg = r"invalid literal for int\(\) with base 10: 'car'" - with pytest.raises(ValueError, match=msg): - arr.astype(dtype) - - def test_astype_cast_object_int(self): - arr = Series(["1", "2", "3", "4"], dtype=object) - result = arr.astype(int) - - tm.assert_series_equal(result, Series(np.arange(1, 5))) - - def test_astype_datetime(self): - s = Series(iNaT, dtype="M8[ns]", index=range(5)) - - s = s.astype("O") - assert s.dtype == np.object_ - - s = Series([datetime(2001, 1, 2, 0, 0)]) - - s = s.astype("O") - assert s.dtype == np.object_ - - s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) - - s[1] = np.nan - assert s.dtype == "M8[ns]" - - s = s.astype("O") - assert s.dtype == np.object_ - - def test_astype_datetime64tz(self): - s = Series(date_range("20130101", periods=3, tz="US/Eastern")) - - # astype - result = s.astype(object) - expected = Series(s.astype(object), dtype=object) - tm.assert_series_equal(result, expected) - - result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz) - tm.assert_series_equal(result, s) - - # astype - object, preserves on construction - result = Series(s.astype(object)) - expected = s.astype(object) - tm.assert_series_equal(result, expected) - - # astype - datetime64[ns, tz] - result = Series(s.values).astype("datetime64[ns, US/Eastern]") - tm.assert_series_equal(result, s) - - result = Series(s.values).astype(s.dtype) - tm.assert_series_equal(result, s) - - result = s.astype("datetime64[ns, CET]") - expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET")) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", [str, np.str_]) @pytest.mark.parametrize( "series", @@ -132,96 +30,6 @@ def test_astype_str_map(self, dtype, series): expected = series.map(str) tm.assert_series_equal(result, expected) - def test_astype_str_cast_dt64(self): - # see gh-9757 - ts = Series([Timestamp("2010-01-04 00:00:00")]) - s = ts.astype(str) - - expected = Series(["2010-01-04"]) - tm.assert_series_equal(s, expected) - - ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) - s = ts.astype(str) - - expected = Series(["2010-01-04 00:00:00-05:00"]) - tm.assert_series_equal(s, expected) - - def test_astype_str_cast_td64(self): - # see gh-9757 - - td = Series([Timedelta(1, unit="d")]) - ser = 
td.astype(str) - - expected = Series(["1 days"]) - tm.assert_series_equal(ser, expected) - - def test_astype_unicode(self): - # see gh-7758: A bit of magic is required to set - # default encoding to utf-8 - digits = string.digits - test_series = [ - Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), - Series(["データーサイエンス、お前はもう死んでいる"]), - ] - - former_encoding = None - - if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(["野菜食べないとやばい".encode()])) - - for s in test_series: - res = s.astype("unicode") - expec = s.map(str) - tm.assert_series_equal(res, expec) - - # Restore the former encoding - if former_encoding is not None and former_encoding != "utf-8": - reload(sys) - sys.setdefaultencoding(former_encoding) - - @pytest.mark.parametrize("dtype_class", [dict, Series]) - def test_astype_dict_like(self, dtype_class): - # see gh-7271 - s = Series(range(0, 10, 2), name="abc") - - dt1 = dtype_class({"abc": str}) - result = s.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc") - tm.assert_series_equal(result, expected) - - dt2 = dtype_class({"abc": "float64"}) - result = s.astype(dt2) - expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc") - tm.assert_series_equal(result, expected) - - dt3 = dtype_class({"abc": str, "def": str}) - msg = ( - "Only the Series name can be used for the key in Series dtype " - r"mappings\." - ) - with pytest.raises(KeyError, match=msg): - s.astype(dt3) - - dt4 = dtype_class({0: str}) - with pytest.raises(KeyError, match=msg): - s.astype(dt4) - - # GH16717 - # if dtypes provided is empty, it should error - if dtype_class is Series: - dt5 = dtype_class({}, dtype=object) - else: - dt5 = dtype_class({}) - - with pytest.raises(KeyError, match=msg): - s.astype(dt5) - - def test_astype_categories_raises(self): - # deprecated 17636, removed in GH-27141 - s = Series(["a", "b", "a"]) - with pytest.raises(TypeError, match="got an unexpected"): - s.astype("category", categories=["a", "b"], ordered=True) - def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) @@ -325,79 +133,6 @@ def cmp(a, b): with pytest.raises(TypeError, match=msg): invalid(s) - @pytest.mark.parametrize("name", [None, "foo"]) - @pytest.mark.parametrize("dtype_ordered", [True, False]) - @pytest.mark.parametrize("series_ordered", [True, False]) - def test_astype_categorical_to_categorical( - self, name, dtype_ordered, series_ordered - ): - # GH 10696/18593 - s_data = list("abcaacbab") - s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) - s = Series(s_data, dtype=s_dtype, name=name) - - # unspecified categories - dtype = CategoricalDtype(ordered=dtype_ordered) - result = s.astype(dtype) - exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) - expected = Series(s_data, name=name, dtype=exp_dtype) - tm.assert_series_equal(result, expected) - - # different categories - dtype = CategoricalDtype(list("adc"), dtype_ordered) - result = s.astype(dtype) - expected = Series(s_data, name=name, dtype=dtype) - tm.assert_series_equal(result, expected) - - if dtype_ordered is False: - # not specifying ordered, so only test once - expected = s - result = s.astype("category") - tm.assert_series_equal(result, expected) - - def test_astype_bool_missing_to_categorical(self): - # GH-19182 - s = Series([True, False, np.nan]) - assert s.dtypes == np.object_ - - result = s.astype(CategoricalDtype(categories=[True, False])) - expected = Series(Categorical([True, False, np.nan], categories=[True, False])) - 
tm.assert_series_equal(result, expected) - - def test_astype_categoricaldtype(self): - s = Series(["a", "b", "a"]) - result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) - expected = Series(Categorical(["a", "b", "a"], ordered=True)) - tm.assert_series_equal(result, expected) - - result = s.astype(CategoricalDtype(["a", "b"], ordered=False)) - expected = Series(Categorical(["a", "b", "a"], ordered=False)) - tm.assert_series_equal(result, expected) - - result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) - expected = Series( - Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) - ) - tm.assert_series_equal(result, expected) - tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) - - @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) - def test_astype_generic_timestamp_no_frequency(self, dtype, request): - # see gh-15524, gh-15987 - data = [1] - s = Series(data) - - if np.dtype(dtype).name not in ["timedelta64", "datetime64"]: - mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit") - request.node.add_marker(mark) - - msg = ( - fr"The '{dtype.__name__}' dtype has no unit\. " - fr"Please pass in '{dtype.__name__}\[ns\]' instead." - ) - with pytest.raises(ValueError, match=msg): - s.astype(dtype) - @pytest.mark.parametrize("dtype", np.typecodes["All"]) def test_astype_empty_constructor_equality(self, dtype): # see gh-15524 @@ -413,19 +148,6 @@ def test_astype_empty_constructor_equality(self, dtype): as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) - def test_arg_for_errors_in_astype(self): - # see gh-14878 - s = Series([1, 2, 3]) - - msg = ( - r"Expected value of kwarg 'errors' to be one of \['raise', " - r"'ignore'\]\. Supplied value is 'False'" - ) - with pytest.raises(ValueError, match=msg): - s.astype(np.float64, errors=False) - - s.astype(np.int8, errors="raise") - def test_intercept_astype_object(self): series = Series(date_range("1/1/2000", periods=10)) @@ -456,23 +178,6 @@ def test_series_to_categorical(self): tm.assert_series_equal(result, expected) - def test_infer_objects_series(self): - # GH 11221 - actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects() - expected = Series([1, 2, 3]) - tm.assert_series_equal(actual, expected) - - actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects() - expected = Series([1.0, 2.0, 3.0, np.nan]) - tm.assert_series_equal(actual, expected) - - # only soft conversions, unconvertable pass thru unchanged - actual = Series(np.array([1, 2, 3, None, "a"], dtype="O")).infer_objects() - expected = Series([1, 2, 3, None, "a"]) - - assert actual.dtype == "object" - tm.assert_series_equal(actual, expected) - @pytest.mark.parametrize( "data", [ @@ -482,7 +187,7 @@ def test_infer_objects_series(self): ) def test_values_compatibility(self, data): # https://github.com/pandas-dev/pandas/issues/23995 - result = pd.Series(data).values + result = Series(data).values expected = np.array(data.astype(object)) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py deleted file mode 100644 index 51410fce7efae..0000000000000 --- a/pandas/tests/series/test_internals.py +++ /dev/null @@ -1,245 +0,0 @@ -from datetime import datetime - -import numpy as np -import pytest - -import pandas as pd -from pandas import NaT, Series, Timestamp -import pandas._testing as tm -from pandas.core.internals.blocks import IntBlock - - -class 
TestSeriesInternals: - - # GH 10265 - def test_convert(self): - # Tests: All to nans, coerce, true - # Test coercion returns correct type - s = Series(["a", "b", "c"]) - results = s._convert(datetime=True, coerce=True) - expected = Series([NaT] * 3) - tm.assert_series_equal(results, expected) - - results = s._convert(numeric=True, coerce=True) - expected = Series([np.nan] * 3) - tm.assert_series_equal(results, expected) - - expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) - results = s._convert(timedelta=True, coerce=True) - tm.assert_series_equal(results, expected) - - dt = datetime(2001, 1, 1, 0, 0) - td = dt - datetime(2000, 1, 1, 0, 0) - - # Test coercion with mixed types - s = Series(["a", "3.1415", dt, td]) - results = s._convert(datetime=True, coerce=True) - expected = Series([NaT, NaT, dt, NaT]) - tm.assert_series_equal(results, expected) - - results = s._convert(numeric=True, coerce=True) - expected = Series([np.nan, 3.1415, np.nan, np.nan]) - tm.assert_series_equal(results, expected) - - results = s._convert(timedelta=True, coerce=True) - expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) - tm.assert_series_equal(results, expected) - - # Test standard conversion returns original - results = s._convert(datetime=True) - tm.assert_series_equal(results, s) - results = s._convert(numeric=True) - expected = Series([np.nan, 3.1415, np.nan, np.nan]) - tm.assert_series_equal(results, expected) - results = s._convert(timedelta=True) - tm.assert_series_equal(results, s) - - # test pass-through and non-conversion when other types selected - s = Series(["1.0", "2.0", "3.0"]) - results = s._convert(datetime=True, numeric=True, timedelta=True) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(results, expected) - results = s._convert(True, False, True) - tm.assert_series_equal(results, s) - - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O") - results = s._convert(datetime=True, numeric=True, timedelta=True) - expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)]) - tm.assert_series_equal(results, expected) - results = s._convert(datetime=False, numeric=True, timedelta=True) - tm.assert_series_equal(results, s) - - td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) - s = Series([td, td], dtype="O") - results = s._convert(datetime=True, numeric=True, timedelta=True) - expected = Series([td, td]) - tm.assert_series_equal(results, expected) - results = s._convert(True, True, False) - tm.assert_series_equal(results, s) - - s = Series([1.0, 2, 3], index=["a", "b", "c"]) - result = s._convert(numeric=True) - tm.assert_series_equal(result, s) - - # force numeric conversion - r = s.copy().astype("O") - r["a"] = "1" - result = r._convert(numeric=True) - tm.assert_series_equal(result, s) - - r = s.copy().astype("O") - r["a"] = "1." 
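[editor's note -- illustration, not part of the patch] test_internals.py is deleted wholesale; its test_convert exercised the private Series._convert(..., coerce=True) machinery. For readers looking for the public-API analogue of the numeric path above, a sketch (assuming pandas >= 1.0):

    import numpy as np
    import pandas as pd

    s = pd.Series(["1.0", "2", "garbled"], dtype=object)

    # numeric soft-conversion with coercion ~ pd.to_numeric(errors="coerce")
    coerced = pd.to_numeric(s, errors="coerce")
    assert coerced.dtype == np.float64
    assert np.isnan(coerced.iloc[-1])

    # the non-coercing path ~ infer_objects(), which converts only what it can
    soft = pd.Series([1, 2, 3], dtype=object).infer_objects()
    assert soft.dtype == np.int64
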
- result = r._convert(numeric=True) - tm.assert_series_equal(result, s) - - r = s.copy().astype("O") - r["a"] = "garbled" - result = r._convert(numeric=True) - expected = s.copy() - expected["a"] = np.nan - tm.assert_series_equal(result, expected) - - # GH 4119, not converting a mixed type (e.g.floats and object) - s = Series([1, "na", 3, 4]) - result = s._convert(datetime=True, numeric=True) - expected = Series([1, np.nan, 3, 4]) - tm.assert_series_equal(result, expected) - - s = Series([1, "", 3, 4]) - result = s._convert(datetime=True, numeric=True) - tm.assert_series_equal(result, expected) - - # dates - s = Series( - [ - datetime(2001, 1, 1, 0, 0), - datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), - ] - ) - s2 = Series( - [ - datetime(2001, 1, 1, 0, 0), - datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), - "foo", - 1.0, - 1, - Timestamp("20010104"), - "20010105", - ], - dtype="O", - ) - - result = s._convert(datetime=True) - expected = Series( - [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], - dtype="M8[ns]", - ) - tm.assert_series_equal(result, expected) - - result = s._convert(datetime=True, coerce=True) - tm.assert_series_equal(result, expected) - - expected = Series( - [ - Timestamp("20010101"), - Timestamp("20010102"), - Timestamp("20010103"), - NaT, - NaT, - NaT, - Timestamp("20010104"), - Timestamp("20010105"), - ], - dtype="M8[ns]", - ) - result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) - tm.assert_series_equal(result, expected) - result = s2._convert(datetime=True, coerce=True) - tm.assert_series_equal(result, expected) - - s = Series(["foo", "bar", 1, 1.0], dtype="O") - result = s._convert(datetime=True, coerce=True) - expected = Series([NaT] * 2 + [Timestamp(1)] * 2) - tm.assert_series_equal(result, expected) - - # preserver if non-object - s = Series([1], dtype="float32") - result = s._convert(datetime=True, coerce=True) - tm.assert_series_equal(result, s) - - # FIXME: dont leave commented-out - # r = s.copy() - # r[0] = np.nan - # result = r._convert(convert_dates=True,convert_numeric=False) - # assert result.dtype == 'M8[ns]' - - # dateutil parses some single letters into today's value as a date - expected = Series([NaT]) - for x in "abcdefghijklmnopqrstuvwxyz": - s = Series([x]) - result = s._convert(datetime=True, coerce=True) - tm.assert_series_equal(result, expected) - s = Series([x.upper()]) - result = s._convert(datetime=True, coerce=True) - tm.assert_series_equal(result, expected) - - def test_convert_no_arg_error(self): - s = Series(["1.0", "2"]) - msg = r"At least one of datetime, numeric or timedelta must be True\." 
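[editor's note -- illustration, not part of the patch] Likewise for the datetime path of the deleted test_convert above: the closest public equivalent is pd.to_datetime with errors="coerce", where unparseable entries become NaT instead of raising:

    import pandas as pd

    s = pd.Series(["20010101", "not-a-date"])
    result = pd.to_datetime(s, errors="coerce")
    assert result.iloc[0] == pd.Timestamp("2001-01-01")
    assert result.iloc[1] is pd.NaT
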
- with pytest.raises(ValueError, match=msg): - s._convert() - - def test_convert_preserve_bool(self): - s = Series([1, True, 3, 5], dtype=object) - r = s._convert(datetime=True, numeric=True) - e = Series([1, 1, 3, 5], dtype="i8") - tm.assert_series_equal(r, e) - - def test_convert_preserve_all_bool(self): - s = Series([False, True, False, False], dtype=object) - r = s._convert(datetime=True, numeric=True) - e = Series([False, True, False, False], dtype=bool) - tm.assert_series_equal(r, e) - - def test_constructor_no_pandas_array(self): - ser = pd.Series([1, 2, 3]) - result = pd.Series(ser.array) - tm.assert_series_equal(ser, result) - assert isinstance(result._mgr.blocks[0], IntBlock) - - def test_astype_no_pandas_dtype(self): - # https://github.com/pandas-dev/pandas/pull/24866 - ser = pd.Series([1, 2], dtype="int64") - # Don't have PandasDtype in the public API, so we use `.array.dtype`, - # which is a PandasDtype. - result = ser.astype(ser.array.dtype) - tm.assert_series_equal(result, ser) - - def test_from_array(self): - result = pd.Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) - assert result._mgr.blocks[0].is_extension is False - - result = pd.Series(pd.array(["2015"], dtype="datetime64[ns]")) - assert result._mgr.blocks[0].is_extension is False - - def test_from_list_dtype(self): - result = pd.Series(["1H", "2H"], dtype="timedelta64[ns]") - assert result._mgr.blocks[0].is_extension is False - - result = pd.Series(["2015"], dtype="datetime64[ns]") - assert result._mgr.blocks[0].is_extension is False - - -def test_hasnans_uncached_for_series(): - # GH#19700 - idx = pd.Index([0, 1]) - assert idx.hasnans is False - assert "hasnans" in idx._cache - ser = idx.to_series() - assert ser.hasnans is False - assert not hasattr(ser, "_cache") - ser.iloc[-1] = np.nan - assert ser.hasnans is True - assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_logical_ops.py similarity index 78% rename from pandas/tests/series/test_operators.py rename to pandas/tests/series/test_logical_ops.py index df6b8187964e8..3d53e7ac26338 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_logical_ops.py @@ -4,7 +4,6 @@ import numpy as np import pytest -import pandas as pd from pandas import DataFrame, Index, Series, bdate_range import pandas._testing as tm from pandas.core import ops @@ -42,42 +41,6 @@ def test_logical_operators_bool_dtype_with_empty(self): expected = s_tft tm.assert_series_equal(res, expected) - @pytest.mark.parametrize( - "left, right, op, expected", - [ - ( - [True, False, np.nan], - [True, False, True], - operator.and_, - [True, False, False], - ), - ( - [True, False, True], - [True, False, np.nan], - operator.and_, - [True, False, False], - ), - ( - [True, False, np.nan], - [True, False, True], - operator.or_, - [True, False, False], - ), - ( - [True, False, True], - [True, False, np.nan], - operator.or_, - [True, False, True], - ), - ], - ) - def test_logical_operators_nans(self, left, right, op, expected): - # GH 13896 - result = op(Series(left), Series(right)) - expected = Series(expected) - - tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_int_dtype(self): # GH#9016: support bitwise op for integer types @@ -189,37 +152,37 @@ def test_logical_operators_bool_dtype_with_int(self): def test_logical_ops_bool_dtype_with_ndarray(self): # make sure we operate on ndarray the same as Series - left = pd.Series([True, True, True, False, 
True]) + left = Series([True, True, True, False, True]) right = [True, False, None, True, np.nan] - expected = pd.Series([True, False, False, False, False]) + expected = Series([True, False, False, False, False]) result = left & right tm.assert_series_equal(result, expected) result = left & np.array(right) tm.assert_series_equal(result, expected) - result = left & pd.Index(right) + result = left & Index(right) tm.assert_series_equal(result, expected) - result = left & pd.Series(right) + result = left & Series(right) tm.assert_series_equal(result, expected) - expected = pd.Series([True, True, True, True, True]) + expected = Series([True, True, True, True, True]) result = left | right tm.assert_series_equal(result, expected) result = left | np.array(right) tm.assert_series_equal(result, expected) - result = left | pd.Index(right) + result = left | Index(right) tm.assert_series_equal(result, expected) - result = left | pd.Series(right) + result = left | Series(right) tm.assert_series_equal(result, expected) - expected = pd.Series([False, True, True, True, True]) + expected = Series([False, True, True, True, True]) result = left ^ right tm.assert_series_equal(result, expected) result = left ^ np.array(right) tm.assert_series_equal(result, expected) - result = left ^ pd.Index(right) + result = left ^ Index(right) tm.assert_series_equal(result, expected) - result = left ^ pd.Series(right) + result = left ^ Series(right) tm.assert_series_equal(result, expected) def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): @@ -340,20 +303,20 @@ def test_reversed_logical_op_with_index_returns_series(self, op): idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - expected = pd.Series(op(idx1.values, ser.values)) + expected = Series(op(idx1.values, ser.values)) result = op(ser, idx1) tm.assert_series_equal(result, expected) - expected = pd.Series(op(idx2.values, ser.values)) + expected = Series(op(idx2.values, ser.values)) result = op(ser, idx2) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected", [ - (ops.rand_, pd.Index([False, True])), - (ops.ror_, pd.Index([False, True])), - (ops.rxor, pd.Index([])), + (ops.rand_, Index([False, True])), + (ops.ror_, Index([False, True])), + (ops.rxor, Index([])), ], ) def test_reverse_ops_with_index(self, op, expected): @@ -467,41 +430,41 @@ def test_logical_ops_label_based(self): def test_logical_ops_df_compat(self): # GH#1134 - s1 = pd.Series([True, False, True], index=list("ABC"), name="x") - s2 = pd.Series([True, True, False], index=list("ABD"), name="x") + s1 = Series([True, False, True], index=list("ABC"), name="x") + s2 = Series([True, True, False], index=list("ABD"), name="x") - exp = pd.Series([True, False, False, False], index=list("ABCD"), name="x") + exp = Series([True, False, False, False], index=list("ABCD"), name="x") tm.assert_series_equal(s1 & s2, exp) tm.assert_series_equal(s2 & s1, exp) # True | np.nan => True - exp_or1 = pd.Series([True, True, True, False], index=list("ABCD"), name="x") + exp_or1 = Series([True, True, True, False], index=list("ABCD"), name="x") tm.assert_series_equal(s1 | s2, exp_or1) # np.nan | True => np.nan, filled with False - exp_or = pd.Series([True, True, False, False], index=list("ABCD"), name="x") + exp_or = Series([True, True, False, False], index=list("ABCD"), name="x") tm.assert_series_equal(s2 | s1, exp_or) # DataFrame doesn't fill nan with False tm.assert_frame_equal(s1.to_frame() & s2.to_frame(), exp.to_frame()) tm.assert_frame_equal(s2.to_frame() & 
s1.to_frame(), exp.to_frame()) - exp = pd.DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD")) + exp = DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD")) tm.assert_frame_equal(s1.to_frame() | s2.to_frame(), exp_or1.to_frame()) tm.assert_frame_equal(s2.to_frame() | s1.to_frame(), exp_or.to_frame()) # different length - s3 = pd.Series([True, False, True], index=list("ABC"), name="x") - s4 = pd.Series([True, True, True, True], index=list("ABCD"), name="x") + s3 = Series([True, False, True], index=list("ABC"), name="x") + s4 = Series([True, True, True, True], index=list("ABCD"), name="x") - exp = pd.Series([True, False, True, False], index=list("ABCD"), name="x") + exp = Series([True, False, True, False], index=list("ABCD"), name="x") tm.assert_series_equal(s3 & s4, exp) tm.assert_series_equal(s4 & s3, exp) # np.nan | True => np.nan, filled with False - exp_or1 = pd.Series([True, True, True, False], index=list("ABCD"), name="x") + exp_or1 = Series([True, True, True, False], index=list("ABCD"), name="x") tm.assert_series_equal(s3 | s4, exp_or1) # True | np.nan => True - exp_or = pd.Series([True, True, True, True], index=list("ABCD"), name="x") + exp_or = Series([True, True, True, True], index=list("ABCD"), name="x") tm.assert_series_equal(s4 | s3, exp_or) tm.assert_frame_equal(s3.to_frame() & s4.to_frame(), exp.to_frame()) @@ -509,56 +472,3 @@ def test_logical_ops_df_compat(self): tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp_or1.to_frame()) tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) - - -class TestSeriesUnaryOps: - # __neg__, __pos__, __inv__ - - def test_neg(self): - ser = tm.makeStringSeries() - ser.name = "series" - tm.assert_series_equal(-ser, -1 * ser) - - def test_invert(self): - ser = tm.makeStringSeries() - ser.name = "series" - tm.assert_series_equal(-(ser < 0), ~(ser < 0)) - - @pytest.mark.parametrize( - "source, target", - [ - ([1, 2, 3], [-1, -2, -3]), - ([1, 2, None], [-1, -2, None]), - ([-1, 0, 1], [1, 0, -1]), - ], - ) - def test_unary_minus_nullable_int( - self, any_signed_nullable_int_dtype, source, target - ): - dtype = any_signed_nullable_int_dtype - s = pd.Series(source, dtype=dtype) - result = -s - expected = pd.Series(target, dtype=dtype) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) - def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source): - dtype = any_signed_nullable_int_dtype - expected = pd.Series(source, dtype=dtype) - result = +expected - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "source, target", - [ - ([1, 2, 3], [1, 2, 3]), - ([1, -2, None], [1, 2, None]), - ([-1, 0, 1], [1, 0, 1]), - ], - ) - def test_abs_nullable_int(self, any_signed_nullable_int_dtype, source, target): - dtype = any_signed_nullable_int_dtype - s = pd.Series(source, dtype=dtype) - result = abs(s) - expected = pd.Series(target, dtype=dtype) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0144e4257efe0..6fefeaa818a77 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1,468 +1,15 @@ -from datetime import datetime, timedelta +from datetime import timedelta import numpy as np -import pytest -import pytz from pandas._libs import iNaT import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Index, - IntervalIndex, - NaT, - Series, - Timedelta, - Timestamp, - 
date_range, - isna, -) +from pandas import Categorical, Index, NaT, Series, isna import pandas._testing as tm class TestSeriesMissingData: - def test_timedelta_fillna(self): - # GH 3371 - s = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130102"), - Timestamp("20130103 9:01:01"), - ] - ) - td = s.diff() - - # reg fillna - result = td.fillna(Timedelta(seconds=0)) - expected = Series( - [ - timedelta(0), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - # interpreted as seconds, deprecated - with pytest.raises(TypeError, match="Passing integers to fillna"): - td.fillna(1) - - result = td.fillna(Timedelta(seconds=1)) - expected = Series( - [ - timedelta(seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(timedelta(days=1, seconds=1)) - expected = Series( - [ - timedelta(days=1, seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(np.timedelta64(int(1e9))) - expected = Series( - [ - timedelta(seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(NaT) - expected = Series( - [ - NaT, - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ], - dtype="m8[ns]", - ) - tm.assert_series_equal(result, expected) - - # ffill - td[2] = np.nan - result = td.ffill() - expected = td.fillna(Timedelta(seconds=0)) - expected[0] = np.nan - tm.assert_series_equal(result, expected) - - # bfill - td[2] = np.nan - result = td.bfill() - expected = td.fillna(Timedelta(seconds=0)) - expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) - tm.assert_series_equal(result, expected) - - def test_datetime64_fillna(self): - - s = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130102"), - Timestamp("20130103 9:01:01"), - ] - ) - s[2] = np.nan - - # ffill - result = s.ffill() - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - # bfill - result = s.bfill() - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130103 9:01:01"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - # GH 6587 - # make sure that we are treating as integer when filling - # this also tests inference of a datetime-like with NaT's - s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"]) - expected = Series( - [ - "2013-08-05 15:30:00.000001", - "2013-08-05 15:30:00.000001", - "2013-08-05 15:30:00.000001", - ], - dtype="M8[ns]", - ) - result = s.fillna(method="backfill") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) - def test_datetime64_tz_fillna(self, tz): - # DatetimeBlock - s = Series( - [ - Timestamp("2011-01-01 10:00"), - pd.NaT, - Timestamp("2011-01-03 10:00"), - pd.NaT, - ] - ) - null_loc = pd.Series([False, True, False, True]) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00"), - ] - ) - 
tm.assert_series_equal(expected, result) - # check s is not changed - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - "AAA", - Timestamp("2011-01-03 10:00"), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - {1: pd.Timestamp("2011-01-02 10:00"), 3: pd.Timestamp("2011-01-04 10:00")} - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # DatetimeBlockTZ - idx = pd.DatetimeIndex( - ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz - ) - s = pd.Series(idx) - assert s.dtype == f"datetime64[ns, {tz}]" - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - "AAA", - Timestamp("2011-01-03 10:00", tz=tz), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00", tz=tz), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - 
Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp("20130101")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(Timestamp("20130101", tz="US/Pacific")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - def test_fillna_dt64tz_with_method(self): - # with timezone - # GH 15855 - ser = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) - exp = pd.Series( - [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - ) - tm.assert_series_equal(ser.fillna(method="pad"), exp) - - ser = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) - exp = pd.Series( - [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - ) - tm.assert_series_equal(ser.fillna(method="bfill"), exp) - - def test_fillna_consistency(self): - # GH 16402 - # fillna with a tz aware to a tz-naive, should result in object - - s = Series([Timestamp("20130101"), pd.NaT]) - - result = s.fillna(Timestamp("20130101", tz="US/Eastern")) - expected = Series( - [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], - dtype="object", - ) - tm.assert_series_equal(result, expected) - - # where (we ignore the errors=) - result = s.where( - [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" - ) - tm.assert_series_equal(result, expected) - - result = s.where( - [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" - ) - tm.assert_series_equal(result, expected) - - # with a non-datetime - result = s.fillna("foo") - expected = Series([Timestamp("20130101"), "foo"]) - tm.assert_series_equal(result, expected) - - # assignment - s2 = s.copy() - s2[1] = "foo" - tm.assert_series_equal(s2, expected) - - def test_datetime64tz_fillna_round_issue(self): - # GH 14872 - - data = pd.Series( - [pd.NaT, pd.NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] - ) - - filled = data.fillna(method="bfill") - - expected = pd.Series( - [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - ] - ) - - tm.assert_series_equal(filled, expected) - - def test_fillna_downcast(self): - # GH 15277 - # infer int64 from float64 - s = pd.Series([1.0, np.nan]) - result = s.fillna(0, downcast="infer") - expected = pd.Series([1, 0]) - tm.assert_series_equal(result, expected) - - # infer int64 from float64 when fillna value is a dict - s = pd.Series([1.0, np.nan]) - result = s.fillna({1: 0}, downcast="infer") - expected = pd.Series([1, 0]) - tm.assert_series_equal(result, expected) - - def test_fillna_int(self): - s = Series(np.random.randint(-100, 100, 50)) - return_value = s.fillna(method="ffill", inplace=True) - assert return_value is None - tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) - - def 
test_categorical_nan_equality(self): - cat = Series(Categorical(["a", "b", "c", np.nan])) - exp = Series([True, True, True, False]) - res = cat == cat - tm.assert_series_equal(res, exp) - def test_categorical_nan_handling(self): # NaNs are represented as -1 in labels @@ -472,43 +19,6 @@ def test_categorical_nan_handling(self): s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) ) - def test_fillna_nat(self): - series = Series([0, 1, 2, iNaT], dtype="M8[ns]") - - filled = series.fillna(method="pad") - filled2 = series.fillna(value=series.values[2]) - - expected = series.copy() - expected.values[3] = expected.values[2] - - tm.assert_series_equal(filled, expected) - tm.assert_series_equal(filled2, expected) - - df = DataFrame({"A": series}) - filled = df.fillna(method="pad") - filled2 = df.fillna(value=series.values[2]) - expected = DataFrame({"A": expected}) - tm.assert_frame_equal(filled, expected) - tm.assert_frame_equal(filled2, expected) - - series = Series([iNaT, 0, 1, 2], dtype="M8[ns]") - - filled = series.fillna(method="bfill") - filled2 = series.fillna(value=series[1]) - - expected = series.copy() - expected[0] = expected[1] - - tm.assert_series_equal(filled, expected) - tm.assert_series_equal(filled2, expected) - - df = DataFrame({"A": series}) - filled = df.fillna(method="bfill") - filled2 = df.fillna(value=series[1]) - expected = DataFrame({"A": expected}) - tm.assert_frame_equal(filled, expected) - tm.assert_frame_equal(filled2, expected) - def test_isna_for_inf(self): s = Series(["a", np.inf, np.nan, pd.NA, 1.0]) with pd.option_context("mode.use_inf_as_na", True): @@ -531,111 +41,6 @@ def test_isnull_for_inf_deprecated(self): tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) - def test_fillna(self, datetime_series): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - - tm.assert_series_equal(ts, ts.fillna(method="ffill")) - - ts[2] = np.NaN - - exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="ffill"), exp) - - exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="backfill"), exp) - - exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(value=5), exp) - - msg = "Must specify a fill 'value' or 'method'" - with pytest.raises(ValueError, match=msg): - ts.fillna() - - msg = "Cannot specify both 'value' and 'method'" - with pytest.raises(ValueError, match=msg): - datetime_series.fillna(value=0, method="ffill") - - # GH 5703 - s1 = Series([np.nan]) - s2 = Series([1]) - result = s1.fillna(s2) - expected = Series([1.0]) - tm.assert_series_equal(result, expected) - result = s1.fillna({}) - tm.assert_series_equal(result, s1) - result = s1.fillna(Series((), dtype=object)) - tm.assert_series_equal(result, s1) - result = s2.fillna(s1) - tm.assert_series_equal(result, s2) - result = s1.fillna({0: 1}) - tm.assert_series_equal(result, expected) - result = s1.fillna({1: 1}) - tm.assert_series_equal(result, Series([np.nan])) - result = s1.fillna({0: 1, 1: 1}) - tm.assert_series_equal(result, expected) - result = s1.fillna(Series({0: 1, 1: 1})) - tm.assert_series_equal(result, expected) - result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) - tm.assert_series_equal(result, s1) - - s1 = Series([0, 1, 2], list("abc")) - s2 = Series([0, np.nan, 2], list("bac")) - result = s2.fillna(s1) - expected = Series([0, 0, 2.0], list("bac")) - tm.assert_series_equal(result, expected) - - # limit - s = Series(np.nan, index=[0, 1, 2]) - result = 
s.fillna(999, limit=1) - expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) - tm.assert_series_equal(result, expected) - - result = s.fillna(999, limit=2) - expected = Series([999, 999, np.nan], index=[0, 1, 2]) - tm.assert_series_equal(result, expected) - - # GH 9043 - # make sure a string representation of int/float values can be filled - # correctly without raising errors or being converted - vals = ["0", "1.5", "-0.3"] - for val in vals: - s = Series([0, 1, np.nan, np.nan, 4], dtype="float64") - result = s.fillna(val) - expected = Series([0, 1, val, val, 4], dtype="object") - tm.assert_series_equal(result, expected) - - def test_fillna_bug(self): - x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) - filled = x.fillna(method="ffill") - expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], x.index) - tm.assert_series_equal(filled, expected) - - filled = x.fillna(method="bfill") - expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) - tm.assert_series_equal(filled, expected) - - def test_fillna_invalid_method(self, datetime_series): - try: - datetime_series.fillna(method="ffil") - except ValueError as inst: - assert "ffil" in str(inst) - - def test_ffill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) - - def test_ffill_mixed_dtypes_without_missing_data(self): - # GH14956 - series = pd.Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) - result = series.ffill() - tm.assert_series_equal(series, result) - - def test_bfill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) - def test_timedelta64_nan(self): td = Series([timedelta(days=i) for i in range(10)]) @@ -677,74 +82,6 @@ def test_timedelta64_nan(self): # expected = (datetime_series >= -0.5) & (datetime_series <= 0.5) # tm.assert_series_equal(selector, expected) - def test_dropna_empty(self): - s = Series([], dtype=object) - - assert len(s.dropna()) == 0 - return_value = s.dropna(inplace=True) - assert return_value is None - assert len(s) == 0 - - # invalid axis - msg = "No axis named 1 for object type Series" - with pytest.raises(ValueError, match=msg): - s.dropna(axis=1) - - def test_datetime64_tz_dropna(self): - # DatetimeBlock - s = Series( - [ - Timestamp("2011-01-01 10:00"), - pd.NaT, - Timestamp("2011-01-03 10:00"), - pd.NaT, - ] - ) - result = s.dropna() - expected = Series( - [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] - ) - tm.assert_series_equal(result, expected) - - # DatetimeBlockTZ - idx = pd.DatetimeIndex( - ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz="Asia/Tokyo" - ) - s = pd.Series(idx) - assert s.dtype == "datetime64[ns, Asia/Tokyo]" - result = s.dropna() - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), - Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), - ], - index=[0, 2], - ) - assert result.dtype == "datetime64[ns, Asia/Tokyo]" - tm.assert_series_equal(result, expected) - - def test_dropna_no_nan(self): - for s in [Series([1, 2, 3], name="x"), Series([False, True, False], name="x")]: - - result = s.dropna() - tm.assert_series_equal(result, s) - assert result is not s - - s2 = s.copy() - return_value = s2.dropna(inplace=True) - assert return_value is None - tm.assert_series_equal(s2, s) - - def test_dropna_intervals(self): - s = Series( - [np.nan, 1, 2, 3], - IntervalIndex.from_arrays([np.nan, 0, 1, 2], 
[np.nan, 1, 2, 3]), - ) - - result = s.dropna() - expected = s.iloc[1:] - tm.assert_series_equal(result, expected) - def test_valid(self, datetime_series): ts = datetime_series.copy() ts.index = ts.index._with_freq(None) @@ -755,88 +92,15 @@ def test_valid(self, datetime_series): tm.assert_series_equal(result, ts[1::2]) tm.assert_series_equal(result, ts[pd.notna(ts)]) - def test_isna(self): - ser = Series([0, 5.4, 3, np.nan, -0.001]) - expected = Series([False, False, False, True, False]) - tm.assert_series_equal(ser.isna(), expected) - - ser = Series(["hi", "", np.nan]) - expected = Series([False, False, True]) - tm.assert_series_equal(ser.isna(), expected) - - def test_notna(self): - ser = Series([0, 5.4, 3, np.nan, -0.001]) - expected = Series([True, True, True, False, True]) - tm.assert_series_equal(ser.notna(), expected) - - ser = Series(["hi", "", np.nan]) - expected = Series([True, True, False]) - tm.assert_series_equal(ser.notna(), expected) - - def test_pad_nan(self): - x = Series( - [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float - ) - - return_value = x.fillna(method="pad", inplace=True) - assert return_value is None - - expected = Series( - [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float - ) - tm.assert_series_equal(x[1:], expected[1:]) - assert np.isnan(x[0]), np.isnan(expected[0]) - - def test_pad_require_monotonicity(self): - rng = date_range("1/1/2000", "3/1/2000", freq="B") - - # neither monotonic increasing or decreasing - rng2 = rng[[1, 0, 2]] - - msg = "index must be monotonic increasing or decreasing" - with pytest.raises(ValueError, match=msg): - rng2.get_indexer(rng, method="pad") - - def test_dropna_preserve_name(self, datetime_series): - datetime_series[:5] = np.nan - result = datetime_series.dropna() - assert result.name == datetime_series.name - name = datetime_series.name - ts = datetime_series.copy() - return_value = ts.dropna(inplace=True) - assert return_value is None - assert ts.name == name - - def test_series_fillna_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index) - result = result.fillna(method="pad", limit=5) - - expected = s[:2].reindex(index).fillna(method="pad") - expected[-3:] = np.nan - tm.assert_series_equal(result, expected) - - result = s[-2:].reindex(index) - result = result.fillna(method="bfill", limit=5) - - expected = s[-2:].reindex(index).fillna(method="backfill") - expected[:3] = np.nan - tm.assert_series_equal(result, expected) - - def test_series_pad_backfill_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index, method="pad", limit=5) - - expected = s[:2].reindex(index).fillna(method="pad") - expected[-3:] = np.nan - tm.assert_series_equal(result, expected) - - result = s[-2:].reindex(index, method="backfill", limit=5) - expected = s[-2:].reindex(index).fillna(method="backfill") - expected[:3] = np.nan - tm.assert_series_equal(result, expected) +def test_hasnans_uncached_for_series(): + # GH#19700 + idx = Index([0, 1]) + assert idx.hasnans is False + assert "hasnans" in idx._cache + ser = idx.to_series() + assert ser.hasnans is False + assert not hasattr(ser, "_cache") + ser.iloc[-1] = np.nan + assert ser.hasnans is True + assert Series.hasnans.__doc__ == Index.hasnans.__doc__ diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py new file mode 100644 index 0000000000000..645a849015c23 --- /dev/null +++ 
b/pandas/tests/series/test_npfuncs.py @@ -0,0 +1,16 @@ +""" +Tests for np.foo applied to Series, not necessarily ufuncs. +""" + +import numpy as np + +from pandas import Series + + +class TestPtp: + def test_ptp(self): + # GH#21614 + N = 1000 + arr = np.random.randn(N) + ser = Series(arr) + assert np.ptp(ser) == np.ptp(arr) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index b54c09e5750fd..d079111aa12d6 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -2,35 +2,13 @@ import pytest import pandas as pd -from pandas import DataFrame, Period, Series, period_range -import pandas._testing as tm -from pandas.core.arrays import PeriodArray +from pandas import DataFrame, Series, period_range class TestSeriesPeriod: def setup_method(self, method): self.series = Series(period_range("2000-01-01", periods=10, freq="D")) - def test_auto_conversion(self): - series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) - assert series.dtype == "Period[D]" - - series = pd.Series( - [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")] - ) - assert series.dtype == "Period[D]" - - def test_isna(self): - # GH 13737 - s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) - tm.assert_series_equal(s.isna(), Series([False, True])) - tm.assert_series_equal(s.notna(), Series([True, False])) - - def test_dropna(self): - # GH 13737 - s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) - tm.assert_series_equal(s.dropna(), Series([pd.Period("2011-01", freq="M")])) - # --------------------------------------------------------------------- # NaT support @@ -44,26 +22,6 @@ def test_NaT_scalar(self): series[2] = val assert pd.isna(series[2]) - def test_NaT_cast(self): - result = Series([np.nan]).astype("period[D]") - expected = Series([pd.NaT], dtype="period[D]") - tm.assert_series_equal(result, expected) - - def test_set_none(self): - self.series[3] = None - assert self.series[3] is pd.NaT - - self.series[3:5] = None - assert self.series[4] is pd.NaT - - def test_set_nan(self): - # Do we want to allow this? 
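# Illustrative aside (hypothetical standalone snippet): assigning np.nan
# into a Period-dtype Series stores pd.NaT, which is what the assertions
# below check:
#
#     import numpy as np
#     import pandas as pd
#     ser = pd.Series(pd.period_range("2000-01-01", periods=3, freq="D"))
#     ser[1] = np.nan
#     ser[1] is pd.NaT  # True -- missing periods are stored as NaT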
- self.series[5] = np.nan - assert self.series[5] is pd.NaT - - self.series[5:7] = np.nan - assert self.series[6] is pd.NaT - def test_intercept_astype_object(self): expected = self.series.astype("object") @@ -76,40 +34,3 @@ def test_intercept_astype_object(self): result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - - @pytest.mark.parametrize( - "input_vals", - [ - [Period("2016-01", freq="M"), Period("2016-02", freq="M")], - [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], - [ - Period("2016-01-01 00:00:00", freq="H"), - Period("2016-01-01 01:00:00", freq="H"), - ], - [ - Period("2016-01-01 00:00:00", freq="M"), - Period("2016-01-01 00:01:00", freq="M"), - ], - [ - Period("2016-01-01 00:00:00", freq="S"), - Period("2016-01-01 00:00:01", freq="S"), - ], - ], - ) - def test_end_time_timevalues(self, input_vals): - # GH 17157 - # Check that the time part of the Period is adjusted by end_time - # when using the dt accessor on a Series - input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) - - s = Series(input_vals) - result = s.dt.end_time - expected = s.apply(lambda x: x.end_time) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) - def test_to_period(self, input_vals): - # GH 21205 - expected = Series([input_vals], dtype="Period[D]") - result = Series([input_vals], dtype="datetime64[ns]").dt.to_period("D") - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 28d29c69f6526..0e8bf8f052206 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -2,7 +2,8 @@ import pytest import pandas as pd -from pandas import Series +from pandas import MultiIndex, Series +import pandas._testing as tm def test_reductions_td64_with_nat(): @@ -14,6 +15,16 @@ def test_reductions_td64_with_nat(): assert ser.max() == exp +@pytest.mark.parametrize("skipna", [True, False]) +def test_td64_sum_empty(skipna): + # GH#37151 + ser = Series([], dtype="timedelta64[ns]") + + result = ser.sum(skipna=skipna) + assert isinstance(result, pd.Timedelta) + assert result == pd.Timedelta(0) + + def test_td64_summation_overflow(): # GH#9442 ser = Series(pd.date_range("20130101", periods=100000, freq="H")) @@ -46,6 +57,14 @@ def test_prod_numpy16_bug(): assert not isinstance(result, Series) +def test_sum_with_level(): + obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) + + result = obj.sum(level=0) + expected = Series([10.0], index=[2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) def test_validate_any_all_out_keepdims_raises(kwargs, func): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 32e1220f83f40..7325505ce233b 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -252,7 +252,7 @@ def __repr__(self) -> str: return self.name + ", " + self.state cat = pd.Categorical([County() for _ in range(61)]) - idx = pd.Index(cat) + idx = Index(cat) ser = idx.to_series() repr(ser) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index bab3853e3bd1d..8b32be45e8d57 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1,20 +1,11 @@ import numpy as np -import pytest import pandas as pd -from pandas import 
DataFrame, DatetimeIndex, Series, date_range, timedelta_range +from pandas import DataFrame, Series, date_range, timedelta_range import pandas._testing as tm class TestTimeSeries: - def test_timeseries_coercion(self): - idx = tm.makeDateIndex(10000) - with tm.assert_produces_warning(FutureWarning): - ser = Series(np.random.randn(len(idx)), idx.astype(object)) - with tm.assert_produces_warning(FutureWarning): - assert ser.index.is_all_dates - assert isinstance(ser.index, DatetimeIndex) - def test_contiguous_boolean_preserve_freq(self): rng = date_range("1/1/2000", "3/1/2000", freq="B") @@ -54,20 +45,6 @@ def test_promote_datetime_date(self): expected = rng.get_indexer(ts_slice.index) tm.assert_numpy_array_equal(result, expected) - def test_groupby_count_dateparseerror(self): - dr = date_range(start="1/1/2012", freq="5min", periods=10) - - # BAD Example, datetimes first - s = Series(np.arange(10), index=[dr, np.arange(10)]) - grouped = s.groupby(lambda x: x[1] % 2 == 0) - result = grouped.count() - - s = Series(np.arange(10), index=[np.arange(10), dr]) - grouped = s.groupby(lambda x: x[0] % 2 == 0) - expected = grouped.count() - - tm.assert_series_equal(result, expected) - def test_series_map_box_timedelta(self): # GH 11349 s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) @@ -79,29 +56,11 @@ def f(x): s.apply(f) DataFrame(s).applymap(f) - def test_asfreq_resample_set_correct_freq(self): - # GH5613 - # we test if .asfreq() and .resample() set the correct value for .freq - df = pd.DataFrame( - {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} - ) - df = df.set_index(pd.to_datetime(df.date)) - - # testing the settings before calling .asfreq() and .resample() - assert df.index.freq is None - assert df.index.inferred_freq == "D" - - # does .asfreq() set .freq correctly? - assert df.asfreq("D").index.freq == "D" - - # does .resample() set .freq correctly? - assert df.resample("D").asfreq().index.freq == "D" - def test_view_tz(self): # GH#24024 - ser = pd.Series(pd.date_range("2000", periods=4, tz="US/Central")) + ser = Series(pd.date_range("2000", periods=4, tz="US/Central")) result = ser.view("i8") - expected = pd.Series( + expected = Series( [ 946706400000000000, 946792800000000000, @@ -110,37 +69,3 @@ def test_view_tz(self): ] ) tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("tz", [None, "US/Central"]) - def test_asarray_object_dt64(self, tz): - ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) - - with tm.assert_produces_warning(None): - # Future behavior (for tzaware case) with no warning - result = np.asarray(ser, dtype=object) - - expected = np.array( - [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] - ) - tm.assert_numpy_array_equal(result, expected) - - def test_asarray_tz_naive(self): - # This shouldn't produce a warning. 
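# Illustrative aside (hypothetical standalone snippet): np.asarray on a
# tz-naive datetime Series yields datetime64[ns] values directly, while a
# tz-aware Series is converted to UTC when datetime64[ns] is requested --
# the contract the two tests below rely on:
#
#     import numpy as np
#     import pandas as pd
#     ser = pd.Series(pd.date_range("2000", periods=2, tz="US/Central"))
#     np.asarray(ser, dtype="datetime64[ns]")
#     # array(['2000-01-01T06:00:00.000000000',
#     #        '2000-01-02T06:00:00.000000000'], dtype='datetime64[ns]')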
- ser = pd.Series(pd.date_range("2000", periods=2)) - expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - result = np.asarray(ser) - - tm.assert_numpy_array_equal(result, expected) - - def test_asarray_tz_aware(self): - tz = "US/Central" - ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) - expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - result = np.asarray(ser, dtype="datetime64[ns]") - - tm.assert_numpy_array_equal(result, expected) - - # Old behavior with no warning - result = np.asarray(ser, dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index c7fc37a278e83..bcd6a7a7308a3 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -30,7 +30,7 @@ def arrays_for_binary_ufunc(): @pytest.mark.parametrize("ufunc", UNARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) def test_unary_ufunc(ufunc, sparse): - # Test that ufunc(Series) == Series(ufunc) + # Test that ufunc(pd.Series) == pd.Series(ufunc) array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: @@ -49,13 +49,13 @@ def test_unary_ufunc(ufunc, sparse): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): - # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) + # Test that ufunc(pd.Series(a), array) == pd.Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) - name = "name" # op(Series, array) preserves the name. + name = "name" # op(pd.Series, array) preserves the name. series = pd.Series(a1, name=name) other = a2 @@ -76,14 +76,14 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that - # * func(Series(a), Series(b)) == Series(ufunc(a, b)) - # * ufunc(Index, Series) dispatches to Series (returns a Series) + # * func(pd.Series(a), pd.Series(b)) == pd.Series(ufunc(a, b)) + # * ufunc(Index, pd.Series) dispatches to pd.Series (returns a pd.Series) a1, a2 = arrays_for_binary_ufunc if sparse: a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) - name = "name" # op(Series, array) preserves the name. + name = "name" # op(pd.Series, array) preserves the name. series = pd.Series(a1, name=name) other = pd.Index(a2, name=name).astype("int64") @@ -107,14 +107,14 @@ def test_binary_ufunc_with_series( flip, shuffle, sparse, ufunc, arrays_for_binary_ufunc ): # Test that - # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # * func(pd.Series(a), pd.Series(b)) == pd.Series(ufunc(a, b)) # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) - name = "name" # op(Series, array) preserves the name. + name = "name" # op(pd.Series, array) preserves the name. 
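# Illustrative aside (hypothetical standalone snippet) for the
# name-preservation rule noted in the comment above:
#
#     import numpy as np
#     import pandas as pd
#     s = pd.Series([1.0, 4.0, 9.0], name="x")
#     np.sqrt(s).name  # 'x' -- a ufunc applied to a Series keeps its name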
series = pd.Series(a1, name=name) other = pd.Series(a2, name=name) @@ -146,8 +146,8 @@ def test_binary_ufunc_with_series( @pytest.mark.parametrize("flip", [True, False]) def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): # Test that - # * ufunc(Series, scalar) == Series(ufunc(array, scalar)) - # * ufunc(Series, scalar) == ufunc(scalar, Series) + # * ufunc(pd.Series, scalar) == pd.Series(ufunc(array, scalar)) + # * ufunc(pd.Series, scalar) == ufunc(scalar, pd.Series) array, _ = arrays_for_binary_ufunc if sparse: array = SparseArray(array) diff --git a/pandas/tests/series/test_unary.py b/pandas/tests/series/test_unary.py new file mode 100644 index 0000000000000..40d5e56203c6c --- /dev/null +++ b/pandas/tests/series/test_unary.py @@ -0,0 +1,57 @@ +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesUnaryOps: + # __neg__, __pos__, __inv__ + + def test_neg(self): + ser = tm.makeStringSeries() + ser.name = "series" + tm.assert_series_equal(-ser, -1 * ser) + + def test_invert(self): + ser = tm.makeStringSeries() + ser.name = "series" + tm.assert_series_equal(-(ser < 0), ~(ser < 0)) + + @pytest.mark.parametrize( + "source, target", + [ + ([1, 2, 3], [-1, -2, -3]), + ([1, 2, None], [-1, -2, None]), + ([-1, 0, 1], [1, 0, -1]), + ], + ) + def test_unary_minus_nullable_int( + self, any_signed_nullable_int_dtype, source, target + ): + dtype = any_signed_nullable_int_dtype + ser = Series(source, dtype=dtype) + result = -ser + expected = Series(target, dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) + def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source): + dtype = any_signed_nullable_int_dtype + expected = Series(source, dtype=dtype) + result = +expected + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "source, target", + [ + ([1, 2, 3], [1, 2, 3]), + ([1, -2, None], [1, 2, None]), + ([-1, 0, 1], [1, 0, 1]), + ], + ) + def test_abs_nullable_int(self, any_signed_nullable_int_dtype, source, target): + dtype = any_signed_nullable_int_dtype + ser = Series(source, dtype=dtype) + result = abs(ser) + expected = Series(target, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 28ceaa61c558f..aefbcee76b5d7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -3,11 +3,9 @@ import struct import numpy as np -from numpy.random import RandomState import pytest from pandas._libs import algos as libalgos, hashtable as ht -from pandas._libs.groupby import group_var_float32, group_var_float64 from pandas.compat import IS64 from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -25,11 +23,21 @@ from pandas import ( Categorical, CategoricalIndex, + DataFrame, DatetimeIndex, Index, IntervalIndex, + MultiIndex, + NaT, + Period, + PeriodIndex, Series, + Timedelta, Timestamp, + date_range, + timedelta_range, + to_datetime, + to_timedelta, ) import pandas._testing as tm import pandas.core.algorithms as algos @@ -38,6 +46,40 @@ class TestFactorize: + @pytest.mark.parametrize("sort", [True, False]) + def test_factorize(self, index_or_series_obj, sort): + obj = index_or_series_obj + result_codes, result_uniques = obj.factorize(sort=sort) + + constructor = Index + if isinstance(obj, MultiIndex): + constructor = MultiIndex.from_tuples + expected_uniques = constructor(obj.unique()) + + 
if sort: + expected_uniques = expected_uniques.sort_values() + + # construct an integer ndarray so that + # `expected_uniques.take(expected_codes)` is equal to `obj` + expected_uniques_list = list(expected_uniques) + expected_codes = [expected_uniques_list.index(val) for val in obj] + expected_codes = np.asarray(expected_codes, dtype=np.intp) + + tm.assert_numpy_array_equal(result_codes, expected_codes) + tm.assert_index_equal(result_uniques, expected_uniques) + + def test_series_factorize_na_sentinel_none(self): + # GH#35667 + values = np.array([1, 2, 1, np.nan]) + ser = Series(values) + codes, uniques = ser.factorize(na_sentinel=None) + + expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) + expected_uniques = Index([1.0, 2.0, np.nan]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_index_equal(uniques, expected_uniques) + def test_basic(self): codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -113,34 +155,34 @@ def test_datelike(self): tm.assert_index_equal(uniques, exp) # period - v1 = pd.Period("201302", freq="M") - v2 = pd.Period("201303", freq="M") + v1 = Period("201302", freq="M") + v2 = Period("201303", freq="M") x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index codes, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) + tm.assert_index_equal(uniques, PeriodIndex([v1, v2])) codes, uniques = algos.factorize(x, sort=True) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) + tm.assert_index_equal(uniques, PeriodIndex([v1, v2])) # GH 5986 - v1 = pd.to_timedelta("1 day 1 min") - v2 = pd.to_timedelta("1 day") + v1 = to_timedelta("1 day 1 min") + v2 = to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) codes, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) + tm.assert_index_equal(uniques, to_timedelta([v1, v2])) codes, uniques = algos.factorize(x, sort=True) exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) + tm.assert_index_equal(uniques, to_timedelta([v2, v1])) def test_factorize_nan(self): # nan should map to na_sentinel, not reverse_indexer[na_sentinel] @@ -243,7 +285,7 @@ def test_string_factorize(self, writable): tm.assert_numpy_array_equal(uniques, expected_uniques) def test_object_factorize(self, writable): - data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) + data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object) data.setflags(write=writable) expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) expected_uniques = np.array(["a", "c", "b"], dtype=object) @@ -406,7 +448,7 @@ def test_object_refcount_bug(self): def test_on_index_object(self): - mindex = pd.MultiIndex.from_arrays( + mindex = MultiIndex.from_arrays( [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] ) expected = mindex.values @@ -458,7 +500,7 @@ def test_datetime64_dtype_array_returned(self): dtype="M8[ns]", ) - dt_index = pd.to_datetime( + dt_index = to_datetime( [ "2015-01-03T00:00:00.000000000", "2015-01-01T00:00:00.000000000", @@ -495,7 +537,7 @@ def 
test_timedelta64_dtype_array_returned(self): # GH 9431 expected = np.array([31200, 45678, 10000], dtype="m8[ns]") - td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) + td_index = to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype @@ -774,7 +816,7 @@ def test_basic(self): def test_i8(self): - arr = pd.date_range("20130101", periods=3).values + arr = date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -787,7 +829,7 @@ def test_i8(self): expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) - arr = pd.timedelta_range("1 day", periods=3).values + arr = timedelta_range("1 day", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -801,7 +843,7 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): - s = pd.date_range("20000101", periods=2000000, freq="s").values + s = date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) expected[0] = True @@ -851,10 +893,10 @@ def test_same_nan_is_in_large(self): def test_same_nan_is_in_large_series(self): # https://github.com/pandas-dev/pandas/issues/22205 s = np.tile(1.0, 1_000_001) - series = pd.Series(s) + series = Series(s) s[0] = np.nan result = series.isin([np.nan, 1]) - expected = pd.Series(np.ones(len(s), dtype=bool)) + expected = Series(np.ones(len(s), dtype=bool)) tm.assert_series_equal(result, expected) def test_same_object_is_in(self): @@ -952,27 +994,27 @@ def test_different_nans_as_float64(self): def test_isin_int_df_string_search(self): """Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1""" - df = pd.DataFrame({"values": [1, 2]}) + df = DataFrame({"values": [1, 2]}) result = df.isin(["1"]) - expected_false = pd.DataFrame({"values": [False, False]}) + expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_nan_df_string_search(self): """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") -> should not match values because np.nan is not equal str NaN""" - df = pd.DataFrame({"values": [np.nan, 2]}) + df = DataFrame({"values": [np.nan, 2]}) result = df.isin(["NaN"]) - expected_false = pd.DataFrame({"values": [False, False]}) + expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_float_df_string_search(self): """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") -> should not match values because float 1.4245 is not equal str 1.4245""" - df = pd.DataFrame({"values": [1.4245, 2.32441]}) + df = DataFrame({"values": [1.4245, 2.32441]}) result = df.isin(["1.4245"]) - expected_false = pd.DataFrame({"values": [False, False]}) + expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @@ -1018,8 +1060,8 @@ def test_value_counts_dtypes(self): algos.value_counts(["1", 1], bins=1) def test_value_counts_nat(self): - td = Series([np.timedelta64(10000), pd.NaT], 
dtype="timedelta64[ns]") - dt = pd.to_datetime(["NaT", "2014-01-01"]) + td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]") + dt = to_datetime(["NaT", "2014-01-01"]) for s in [td, dt]: vc = algos.value_counts(s) @@ -1053,7 +1095,7 @@ def test_value_counts_datetime_outofbounds(self): tm.assert_series_equal(res, exp) # GH 12424 - res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) @@ -1325,9 +1367,9 @@ def test_datetime_likes(self): cases = [ np.array([Timestamp(d) for d in dt]), np.array([Timestamp(d, tz="US/Eastern") for d in dt]), - np.array([pd.Period(d, freq="D") for d in dt]), + np.array([Period(d, freq="D") for d in dt]), np.array([np.datetime64(d) for d in dt]), - np.array([pd.Timedelta(d) for d in td]), + np.array([Timedelta(d) for d in td]), ] exp_first = np.array( @@ -1409,122 +1451,6 @@ def test_unique_tuples(self, arr, unique): tm.assert_numpy_array_equal(result, expected) -class GroupVarTestMixin: - def test_group_var_generic_1d(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 1))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(15, 1).astype(self.dtype) - labels = np.tile(np.arange(5), (3,)).astype("int64") - - expected_out = ( - np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 - )[:, np.newaxis] - expected_counts = counts + 3 - - self.algo(out, counts, values, labels) - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_1d_flat_labels(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((1, 1))).astype(self.dtype) - counts = np.zeros(1, dtype="int64") - values = 10 * prng.rand(5, 1).astype(self.dtype) - labels = np.zeros(5, dtype="int64") - - expected_out = np.array([[values.std(ddof=1) ** 2]]) - expected_counts = counts + 5 - - self.algo(out, counts, values, labels) - - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_2d_all_finite(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(10, 2).astype(self.dtype) - labels = np.tile(np.arange(5), (2,)).astype("int64") - - expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 - expected_counts = counts + 2 - - self.algo(out, counts, values, labels) - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_2d_some_nan(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(10, 2).astype(self.dtype) - values[:, 1] = np.nan - labels = np.tile(np.arange(5), (2,)).astype("int64") - - expected_out = np.vstack( - [ - values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5), - ] - ).T.astype(self.dtype) - expected_counts = counts + 2 - - self.algo(out, counts, values, labels) - tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_constant(self): - # Regression test from GH 10448. 
- - out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype="int64") - values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) - labels = np.zeros(3, dtype="int64") - - self.algo(out, counts, values, labels) - - assert counts[0] == 3 - assert out[0, 0] >= 0 - tm.assert_almost_equal(out[0, 0], 0.0) - - -class TestGroupVarFloat64(GroupVarTestMixin): - __test__ = True - - algo = staticmethod(group_var_float64) - dtype = np.float64 - rtol = 1e-5 - - def test_group_var_large_inputs(self): - - prng = RandomState(1234) - - out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype="int64") - values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) - values.shape = (10 ** 6, 1) - labels = np.zeros(10 ** 6, dtype="int64") - - self.algo(out, counts, values, labels) - - assert counts[0] == 10 ** 6 - tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) - - -class TestGroupVarFloat32(GroupVarTestMixin): - __test__ = True - - algo = staticmethod(group_var_float32) - dtype = np.float32 - rtol = 1e-2 - - class TestHashTable: def test_string_hashtable_set_item_signature(self): # GH#30419 fix typing in StringHashTable.set_item to prevent segfault @@ -1648,7 +1574,7 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): s.loc[500] = np.nan elif htable == ht.PyObjectHashTable: # use different NaN types for object column - s.loc[500:502] = [np.nan, None, pd.NaT] + s.loc[500:502] = [np.nan, None, NaT] # create duplicated selection s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) @@ -1688,7 +1614,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): s.loc[500] = np.nan elif htable == ht.PyObjectHashTable: # use different NaN types for object column - s.loc[500:502] = [np.nan, None, pd.NaT] + s.loc[500:502] = [np.nan, None, NaT] # create duplicated selection s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) @@ -1765,11 +1691,13 @@ def _check(arr): _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan])) _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan])) - def test_basic(self): + def test_basic(self, writable): exp = np.array([1, 2], dtype=np.float64) for dtype in np.typecodes["AllInteger"]: - s = Series([1, 100], dtype=dtype) + data = np.array([1, 100], dtype=dtype) + data.setflags(write=writable) + s = Series(data) tm.assert_numpy_array_equal(algos.rank(s), exp) def test_uint64_overflow(self): @@ -2405,3 +2333,28 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) + + +class TestDiff: + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_diff_datetimelike_nat(self, dtype): + # NaT - NaT is NaT, not 0 + arr = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4) + arr[:, 2] = arr.dtype.type("NaT", "ns") + result = algos.diff(arr, 1, axis=0) + + expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4 + expected[:, 2] = np.timedelta64("NaT", "ns") + expected[0, :] = np.timedelta64("NaT", "ns") + + tm.assert_numpy_array_equal(result, expected) + + result = algos.diff(arr.T, 1, axis=1) + tm.assert_numpy_array_equal(result, expected.T) + + def test_diff_ea_axis(self): + dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data + + msg = "cannot diff DatetimeArray on axis=1" + with pytest.raises(ValueError, match=msg): + algos.diff(dta, 1, axis=1) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 17d7527a2b687..366a1970f6f64 100644 --- a/pandas/tests/test_common.py +++ 
b/pandas/tests/test_common.py @@ -106,7 +106,7 @@ def test_random_state(): ], ) def test_maybe_match_name(left, right, expected): - assert ops._maybe_match_name(left, right) == expected + assert ops.common._maybe_match_name(left, right) == expected def test_dict_compat(): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c03e8e26952e5..392be699b6fc0 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -150,6 +150,18 @@ def test_missing_required_dependency(): # https://github.com/MacPython/pandas-wheels/pull/50 pyexe = sys.executable.replace("\\", "/") + + # We skip this test if pandas is installed as a site package. We first + # import the package normally and check the path to the module before + # executing the test which imports pandas with site packages disabled. + call = [pyexe, "-c", "import pandas;print(pandas.__file__)"] + output = subprocess.check_output(call).decode() + if "site-packages" in output: + pytest.skip("pandas installed as site package") + + # This test will fail if pandas is installed as a site package. The flags + # prevent pandas being imported and the test will report Failed: DID NOT + # RAISE call = [pyexe, "-sSE", "-c", "import pandas"] msg = ( diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 6db1078fcde4f..6d4c1594146de 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -48,30 +48,37 @@ def setup_method(self, method): def teardown_method(self, method): expr._MIN_ELEMENTS = self._MIN_ELEMENTS - def run_arithmetic(self, df, other): + @staticmethod + def call_op(df, other, flex: bool, opname: str): + if flex: + op = lambda x, y: getattr(x, opname)(y) + op.__name__ = opname + else: + op = getattr(operator, opname) + + expr.set_use_numexpr(False) + expected = op(df, other) + expr.set_use_numexpr(True) + + expr.get_test_result() + + result = op(df, other) + return result, expected + + def run_arithmetic(self, df, other, flex: bool): expr._MIN_ELEMENTS = 0 operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"] - for test_flex in [True, False]: - for arith in operations: - # TODO: share with run_binary - if test_flex: - op = lambda x, y: getattr(x, arith)(y) - op.__name__ = arith + for arith in operations: + result, expected = self.call_op(df, other, flex, arith) + + if arith == "truediv": + if expected.ndim == 1: + assert expected.dtype.kind == "f" else: - op = getattr(operator, arith) - expr.set_use_numexpr(False) - expected = op(df, other) - expr.set_use_numexpr(True) - - result = op(df, other) - if arith == "truediv": - if expected.ndim == 1: - assert expected.dtype.kind == "f" - else: - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) - - def run_binary(self, df, other): + assert all(x.kind == "f" for x in expected.dtypes.values) + tm.assert_equal(expected, result) + + def run_binary(self, df, other, flex: bool): """ tests solely that the result is the same whether or not numexpr is enabled. 
Need to test whether the function does the correct thing
@@ -81,37 +88,27 @@ def run_binary(self, df, other):
         expr.set_test_mode(True)
         operations = ["gt", "lt", "ge", "le", "eq", "ne"]

-        for test_flex in [True, False]:
-            for arith in operations:
-                if test_flex:
-                    op = lambda x, y: getattr(x, arith)(y)
-                    op.__name__ = arith
-                else:
-                    op = getattr(operator, arith)
-                expr.set_use_numexpr(False)
-                expected = op(df, other)
-                expr.set_use_numexpr(True)
-
-                expr.get_test_result()
-                result = op(df, other)
-                used_numexpr = expr.get_test_result()
-                assert used_numexpr, "Did not use numexpr as expected."
-                tm.assert_equal(expected, result)
-
-    def run_frame(self, df, other, run_binary=True):
-        self.run_arithmetic(df, other)
-        if run_binary:
-            expr.set_use_numexpr(False)
-            binary_comp = other + 1
-            expr.set_use_numexpr(True)
-            self.run_binary(df, binary_comp)
+        for arith in operations:
+            result, expected = self.call_op(df, other, flex, arith)
+
+            used_numexpr = expr.get_test_result()
+            assert used_numexpr, "Did not use numexpr as expected."
+            tm.assert_equal(expected, result)
+
+    def run_frame(self, df, other, flex: bool):
+        self.run_arithmetic(df, other, flex)
+
+        expr.set_use_numexpr(False)
+        binary_comp = other + 1
+        expr.set_use_numexpr(True)
+        self.run_binary(df, binary_comp, flex)

         for i in range(len(df.columns)):
-            self.run_arithmetic(df.iloc[:, i], other.iloc[:, i])
+            self.run_arithmetic(df.iloc[:, i], other.iloc[:, i], flex)
         # FIXME: don't leave commented-out
         # Series doesn't use numexpr; it uses vec_compare instead...
         # binary_comp = other.iloc[:, i] + 1
-        # self.run_binary(df.iloc[:, i], binary_comp)
+        # self.run_binary(df.iloc[:, i], binary_comp, flex)

     @pytest.mark.parametrize(
         "df",
         [
@@ -126,14 +123,9 @@ def run_frame(self, df, other, run_binary=True):
             _mixed2,
         ],
     )
-    def test_arithmetic(self, df):
-        # TODO: FIGURE OUT HOW TO GET RUN_BINARY TO WORK WITH MIXED=...
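For context on the call_op refactor above: a "flex" operation is the named arithmetic or comparison method on the pandas object (which accepts extra keywords such as axis and level), while the non-flex path goes through the operator module; the suite asserts both give the same answer with numexpr on and off. A standalone sketch of the two call styles, independent of the patch:

import operator

import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(6.0).reshape(3, 2), columns=["a", "b"])

flex_result = getattr(df, "add")(df)  # flex: the DataFrame.add method
op_result = operator.add(df, df)      # non-flex: plain df + df

assert flex_result.equals(op_result)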
- # can't do arithmetic because comparison methods try to do *entire* - # frame instead of by-column - kinds = {x.kind for x in df.dtypes.values} - should = len(kinds) == 1 - - self.run_frame(df, df, run_binary=should) + @pytest.mark.parametrize("flex", [True, False]) + def test_arithmetic(self, df, flex): + self.run_frame(df, df, flex) def test_invalid(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 274860b3fdb5c..6f5345f802a7c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,16 +1,8 @@ -import datetime -from io import StringIO -import itertools -from itertools import product - import numpy as np -from numpy.random import randn import pytest -from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype - import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm AGG_FUNCTIONS = [ @@ -28,116 +20,45 @@ ] -class Base: - def setup_method(self, method): - - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - self.frame = DataFrame( - np.random.randn(10, 3), - index=index, - columns=Index(["A", "B", "C"], name="exp"), - ) - - self.single_level = MultiIndex( - levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] - ) - - # create test series object - arrays = [ - ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = zip(*arrays) - index = MultiIndex.from_tuples(tuples) - s = Series(randn(8), index=index) - s[3] = np.NaN - self.series = s - - self.tdf = tm.makeTimeDataFrame(100) - self.ymd = self.tdf.groupby( - [lambda x: x.year, lambda x: x.month, lambda x: x.day] - ).sum() - - # use Int64Index, to make sure things work - self.ymd.index = self.ymd.index.set_levels( - [lev.astype("i8") for lev in self.ymd.index.levels] - ) - self.ymd.index.set_names(["year", "month", "day"], inplace=True) - - -class TestMultiLevel(Base): - def test_append(self): - a, b = self.frame[:5], self.frame[5:] - - result = a.append(b) - tm.assert_frame_equal(result, self.frame) - - result = a["A"].append(b["A"]) - tm.assert_series_equal(result, self.frame["A"]) - - def test_dataframe_constructor(self): - multi = DataFrame( - np.random.randn(4, 4), - index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], - ) - assert isinstance(multi.index, MultiIndex) - assert not isinstance(multi.columns, MultiIndex) - - multi = DataFrame( - np.random.randn(4, 4), columns=[["a", "a", "b", "b"], ["x", "y", "x", "y"]] - ) - assert isinstance(multi.columns, MultiIndex) - - def test_series_constructor(self): - multi = Series( - 1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])] - ) - assert isinstance(multi.index, MultiIndex) - - multi = Series(1.0, index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) - assert isinstance(multi.index, MultiIndex) - - multi = Series(range(4), index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) - assert isinstance(multi.index, MultiIndex) - - def test_reindex_level(self): +class TestMultiLevel: + def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): # axis=0 - month_sums = self.ymd.sum(level="month") - result = month_sums.reindex(self.ymd.index, level=1) - expected = 
self.ymd.groupby(level="month").transform(np.sum) + ymd = multiindex_year_month_day_dataframe_random_data + + month_sums = ymd.sum(level="month") + result = month_sums.reindex(ymd.index, level=1) + expected = ymd.groupby(level="month").transform(np.sum) tm.assert_frame_equal(result, expected) # Series - result = month_sums["A"].reindex(self.ymd.index, level=1) - expected = self.ymd["A"].groupby(level="month").transform(np.sum) + result = month_sums["A"].reindex(ymd.index, level=1) + expected = ymd["A"].groupby(level="month").transform(np.sum) tm.assert_series_equal(result, expected, check_names=False) # axis=1 - month_sums = self.ymd.T.sum(axis=1, level="month") - result = month_sums.reindex(columns=self.ymd.index, level=1) - expected = self.ymd.groupby(level="month").transform(np.sum).T + month_sums = ymd.T.sum(axis=1, level="month") + result = month_sums.reindex(columns=ymd.index, level=1) + expected = ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) - def test_binops_level(self): + def test_binops_level(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + def _check_op(opname): op = getattr(DataFrame, opname) - month_sums = self.ymd.sum(level="month") - result = op(self.ymd, month_sums, level="month") + month_sums = ymd.sum(level="month") + result = op(ymd, month_sums, level="month") - broadcasted = self.ymd.groupby(level="month").transform(np.sum) - expected = op(self.ymd, broadcasted) + broadcasted = ymd.groupby(level="month").transform(np.sum) + expected = op(ymd, broadcasted) tm.assert_frame_equal(result, expected) # Series op = getattr(Series, opname) - result = op(self.ymd["A"], month_sums["A"], level="month") - broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum) - expected = op(self.ymd["A"], broadcasted) + result = op(ymd["A"], month_sums["A"], level="month") + broadcasted = ymd["A"].groupby(level="month").transform(np.sum) + expected = op(ymd["A"], broadcasted) expected.name = "A" tm.assert_series_equal(result, expected) @@ -146,645 +67,36 @@ def _check_op(opname): _check_op("mul") _check_op("div") - def test_pickle(self): - def _test_roundtrip(frame): - unpickled = tm.round_trip_pickle(frame) - tm.assert_frame_equal(frame, unpickled) - - _test_roundtrip(self.frame) - _test_roundtrip(self.frame.T) - _test_roundtrip(self.ymd) - _test_roundtrip(self.ymd.T) + def test_reindex(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data - def test_reindex(self): - expected = self.frame.iloc[[0, 3]] - reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]] + expected = frame.iloc[[0, 3]] + reindexed = frame.loc[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) - def test_reindex_preserve_levels(self): - new_index = self.ymd.index[::10] - chunk = self.ymd.reindex(new_index) + def test_reindex_preserve_levels( + self, multiindex_year_month_day_dataframe_random_data + ): + ymd = multiindex_year_month_day_dataframe_random_data + + new_index = ymd.index[::10] + chunk = ymd.reindex(new_index) assert chunk.index is new_index - chunk = self.ymd.loc[new_index] + chunk = ymd.loc[new_index] assert chunk.index is new_index - ymdT = self.ymd.T + ymdT = ymd.T chunk = ymdT.reindex(columns=new_index) assert chunk.columns is new_index chunk = ymdT.loc[:, new_index] assert chunk.columns is new_index - def test_repr_to_string(self): - repr(self.frame) - repr(self.ymd) - repr(self.frame.T) - repr(self.ymd.T) - - buf = StringIO() - 
self.frame.to_string(buf=buf) - self.ymd.to_string(buf=buf) - self.frame.T.to_string(buf=buf) - self.ymd.T.to_string(buf=buf) - - def test_repr_name_coincide(self): - index = MultiIndex.from_tuples( - [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"] - ) - - df = DataFrame({"value": [0, 1]}, index=index) - - lines = repr(df).split("\n") - assert lines[2].startswith("a 0 foo") - - def test_delevel_infer_dtype(self): - tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) - index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) - df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) - deleveled = df.reset_index() - assert is_integer_dtype(deleveled["prm1"]) - assert is_float_dtype(deleveled["prm2"]) - - def test_reset_index_with_drop(self): - deleveled = self.ymd.reset_index(drop=True) - assert len(deleveled.columns) == len(self.ymd.columns) - assert deleveled.index.name == self.ymd.index.name - - deleveled = self.series.reset_index() - assert isinstance(deleveled, DataFrame) - assert len(deleveled.columns) == len(self.series.index.levels) + 1 - assert deleveled.index.name == self.series.index.name - - deleveled = self.series.reset_index(drop=True) - assert isinstance(deleveled, Series) - assert deleveled.index.name == self.series.index.name - - def test_count_level(self): - def _check_counts(frame, axis=0): - index = frame._get_axis(axis) - for i in range(index.nlevels): - result = frame.count(axis=axis, level=i) - expected = frame.groupby(axis=axis, level=i).count() - expected = expected.reindex_like(result).astype("i8") - tm.assert_frame_equal(result, expected) - - self.frame.iloc[1, [1, 2]] = np.nan - self.frame.iloc[7, [0, 1]] = np.nan - self.ymd.iloc[1, [1, 2]] = np.nan - self.ymd.iloc[7, [0, 1]] = np.nan - - _check_counts(self.frame) - _check_counts(self.ymd) - _check_counts(self.frame.T, axis=1) - _check_counts(self.ymd.T, axis=1) - - # can't call with level on regular DataFrame - df = tm.makeTimeDataFrame() - with pytest.raises(TypeError, match="hierarchical"): - df.count(level=0) - - self.frame["D"] = "foo" - result = self.frame.count(level=0, numeric_only=True) - tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) - - def test_count_index_with_nan(self): - # https://github.com/pandas-dev/pandas/issues/21824 - df = DataFrame( - { - "Person": ["John", "Myla", None, "John", "Myla"], - "Age": [24.0, 5, 21.0, 33, 26], - "Single": [False, True, True, True, False], - } - ) - - # count on row labels - res = df.set_index(["Person", "Single"]).count(level="Person") - expected = DataFrame( - index=Index(["John", "Myla"], name="Person"), - columns=Index(["Age"]), - data=[2, 2], - ) - tm.assert_frame_equal(res, expected) - - # count on column labels - res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) - expected = DataFrame( - columns=Index(["John", "Myla"], name="Person"), - index=Index(["Age"]), - data=[[2, 2]], - ) - tm.assert_frame_equal(res, expected) - - def test_count_level_series(self): - index = MultiIndex( - levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], - codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], - ) - - s = Series(np.random.randn(len(index)), index=index) - - result = s.count(level=0) - expected = s.groupby(level=0).count() - tm.assert_series_equal( - result.astype("f8"), expected.reindex(result.index).fillna(0) - ) - - result = s.count(level=1) - expected = s.groupby(level=1).count() - tm.assert_series_equal( - result.astype("f8"), 
expected.reindex(result.index).fillna(0) - ) - - def test_count_level_corner(self): - s = self.frame["A"][:0] - result = s.count(level=0) - expected = Series(0, index=s.index.levels[0], name="A") - tm.assert_series_equal(result, expected) - - df = self.frame[:0] - result = df.count(level=0) - expected = ( - DataFrame(index=s.index.levels[0].set_names(["first"]), columns=df.columns) - .fillna(0) - .astype(np.int64) - ) - tm.assert_frame_equal(result, expected) - - def test_get_level_number_out_of_bounds(self): - with pytest.raises(IndexError, match="Too many levels"): - self.frame.index._get_level_number(2) - with pytest.raises(IndexError, match="not a valid level number"): - self.frame.index._get_level_number(-3) - - def test_unstack(self): - # just check that it works for now - unstacked = self.ymd.unstack() - unstacked.unstack() - - # test that ints work - self.ymd.astype(int).unstack() - - # test that int32 work - self.ymd.astype(np.int32).unstack() - - @pytest.mark.parametrize( - "result_rows,result_columns,index_product,expected_row", - [ - ( - [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]], - ["ix1", "ix2", "col1", "col2", "col3", "col4"], - 2, - [None, None, 30.0, None], - ), - ( - [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]], - ["ix1", "ix2", "col1", "col2", "col3"], - 2, - [None, None, 30.0], - ), - ( - [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]], - ["ix1", "ix2", "col1", "col2", "col3"], - None, - [None, None, 30.0], - ), - ], - ) - def test_unstack_partial( - self, result_rows, result_columns, index_product, expected_row - ): - # check for regressions on this issue: - # https://github.com/pandas-dev/pandas/issues/19351 - # make sure DataFrame.unstack() works when its run on a subset of the DataFrame - # and the Index levels contain values that are not present in the subset - result = pd.DataFrame(result_rows, columns=result_columns).set_index( - ["ix1", "ix2"] - ) - result = result.iloc[1:2].unstack("ix2") - expected = pd.DataFrame( - [expected_row], - columns=pd.MultiIndex.from_product( - [result_columns[2:], [index_product]], names=[None, "ix2"] - ), - index=pd.Index([2], name="ix1"), - ) - tm.assert_frame_equal(result, expected) - - def test_unstack_multiple_no_empty_columns(self): - index = MultiIndex.from_tuples( - [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)] - ) - - s = Series(np.random.randn(4), index=index) - - unstacked = s.unstack([1, 2]) - expected = unstacked.dropna(axis=1, how="all") - tm.assert_frame_equal(unstacked, expected) - - def test_stack(self): - # regular roundtrip - unstacked = self.ymd.unstack() - restacked = unstacked.stack() - tm.assert_frame_equal(restacked, self.ymd) - - unlexsorted = self.ymd.sort_index(level=2) - - unstacked = unlexsorted.unstack(2) - restacked = unstacked.stack() - tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) - - unlexsorted = unlexsorted[::-1] - unstacked = unlexsorted.unstack(1) - restacked = unstacked.stack().swaplevel(1, 2) - tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) - - unlexsorted = unlexsorted.swaplevel(0, 1) - unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) - restacked = unstacked.stack(0).swaplevel(1, 2) - tm.assert_frame_equal(restacked.sort_index(level=0), self.ymd) - - # columns unsorted - unstacked = self.ymd.unstack() - unstacked = unstacked.sort_index(axis=1, ascending=False) - restacked = unstacked.stack() - tm.assert_frame_equal(restacked, self.ymd) - - # more than 2 levels in the columns - unstacked = 
self.ymd.unstack(1).unstack(1) - - result = unstacked.stack(1) - expected = self.ymd.unstack() - tm.assert_frame_equal(result, expected) - - result = unstacked.stack(2) - expected = self.ymd.unstack(1) - tm.assert_frame_equal(result, expected) - - result = unstacked.stack(0) - expected = self.ymd.stack().unstack(1).unstack(1) - tm.assert_frame_equal(result, expected) - - # not all levels present in each echelon - unstacked = self.ymd.unstack(2).loc[:, ::3] - stacked = unstacked.stack().stack() - ymd_stacked = self.ymd.stack() - tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) - - # stack with negative number - result = self.ymd.unstack(0).stack(-2) - expected = self.ymd.unstack(0).stack(0) - - # GH10417 - def check(left, right): - tm.assert_series_equal(left, right) - assert left.index.is_unique is False - li, ri = left.index, right.index - tm.assert_index_equal(li, ri) - - df = DataFrame( - np.arange(12).reshape(4, 3), - index=list("abab"), - columns=["1st", "2nd", "3rd"], - ) - - mi = MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)], - ) - - left, right = df.stack(), Series(np.arange(12), index=mi) - check(left, right) - - df.columns = ["1st", "2nd", "1st"] - mi = MultiIndex( - levels=[["a", "b"], ["1st", "2nd"]], - codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)], - ) - - left, right = df.stack(), Series(np.arange(12), index=mi) - check(left, right) - - tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2) - df.index = MultiIndex.from_tuples(tpls) - mi = MultiIndex( - levels=[["a", "b"], [1, 2], ["1st", "2nd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.repeat([1, 0, 1], [3, 6, 3]), - np.tile([0, 1, 0], 4), - ], - ) - - left, right = df.stack(), Series(np.arange(12), index=mi) - check(left, right) - - def test_unstack_odd_failure(self): - data = """day,time,smoker,sum,len -Fri,Dinner,No,8.25,3. 
-Fri,Dinner,Yes,27.03,9 -Fri,Lunch,No,3.0,1 -Fri,Lunch,Yes,13.68,6 -Sat,Dinner,No,139.63,45 -Sat,Dinner,Yes,120.77,42 -Sun,Dinner,No,180.57,57 -Sun,Dinner,Yes,66.82,19 -Thur,Dinner,No,3.0,1 -Thur,Lunch,No,117.32,44 -Thur,Lunch,Yes,51.51,17""" - - df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"]) - - # it works, #2100 - result = df.unstack(2) - - recons = result.stack() - tm.assert_frame_equal(recons, df) - - def test_stack_mixed_dtype(self): - df = self.frame.T - df["foo", "four"] = "foo" - df = df.sort_index(level=1, axis=1) - - stacked = df.stack() - result = df["foo"].stack().sort_index() - tm.assert_series_equal(stacked["foo"], result, check_names=False) - assert result.name is None - assert stacked["bar"].dtype == np.float_ - - def test_unstack_bug(self): - df = DataFrame( - { - "state": ["naive", "naive", "naive", "activ", "activ", "activ"], - "exp": ["a", "b", "b", "b", "a", "a"], - "barcode": [1, 2, 3, 4, 1, 3], - "v": ["hi", "hi", "bye", "bye", "bye", "peace"], - "extra": np.arange(6.0), - } - ) - - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) - - unstacked = result.unstack() - restacked = unstacked.stack() - tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) - - def test_stack_unstack_preserve_names(self): - unstacked = self.frame.unstack() - assert unstacked.index.name == "first" - assert unstacked.columns.names == ["exp", "second"] - - restacked = unstacked.stack() - assert restacked.index.names == self.frame.index.names - - @pytest.mark.parametrize("method", ["stack", "unstack"]) - def test_stack_unstack_wrong_level_name(self, method): - # GH 18303 - wrong level name should raise - - # A DataFrame with flat axes: - df = self.frame.loc["foo"] - - with pytest.raises(KeyError, match="does not match index name"): - getattr(df, method)("mistake") - - if method == "unstack": - # Same on a Series: - s = df.iloc[:, 0] - with pytest.raises(KeyError, match="does not match index name"): - getattr(s, method)("mistake") - - def test_unused_level_raises(self): - # GH 20410 - mi = MultiIndex( - levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], - codes=[[1, 0], [1, 0]], - ) - df = DataFrame(-1, index=range(3), columns=mi) - - with pytest.raises(KeyError, match="notevenone"): - df["notevenone"] - - def test_unstack_level_name(self): - result = self.frame.unstack("second") - expected = self.frame.unstack(level=1) - tm.assert_frame_equal(result, expected) - - def test_stack_level_name(self): - unstacked = self.frame.unstack("second") - result = unstacked.stack("exp") - expected = self.frame.unstack().stack(0) - tm.assert_frame_equal(result, expected) - - result = self.frame.stack("exp") - expected = self.frame.stack() - tm.assert_series_equal(result, expected) - - def test_stack_unstack_multiple(self): - unstacked = self.ymd.unstack(["year", "month"]) - expected = self.ymd.unstack("year").unstack("month") - tm.assert_frame_equal(unstacked, expected) - assert unstacked.columns.names == expected.columns.names - - # series - s = self.ymd["A"] - s_unstacked = s.unstack(["year", "month"]) - tm.assert_frame_equal(s_unstacked, expected["A"]) - - restacked = unstacked.stack(["year", "month"]) - restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) - restacked = restacked.sort_index(level=0) - - tm.assert_frame_equal(restacked, self.ymd) - assert restacked.index.names == self.ymd.index.names - - # GH #451 - unstacked = self.ymd.unstack([1, 2]) - expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how="all") - 
tm.assert_frame_equal(unstacked, expected) - - unstacked = self.ymd.unstack([2, 1]) - expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how="all") - tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) - - def test_stack_names_and_numbers(self): - unstacked = self.ymd.unstack(["year", "month"]) - - # Can't use mixture of names and numbers to stack - with pytest.raises(ValueError, match="level should contain"): - unstacked.stack([0, "month"]) - - def test_stack_multiple_out_of_bounds(self): - # nlevels == 3 - unstacked = self.ymd.unstack(["year", "month"]) - - with pytest.raises(IndexError, match="Too many levels"): - unstacked.stack([2, 3]) - with pytest.raises(IndexError, match="not a valid level number"): - unstacked.stack([-4, -3]) - - def test_unstack_period_series(self): - # GH 4342 - idx1 = pd.PeriodIndex( - ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], - freq="M", - name="period", - ) - idx2 = Index(["A", "B"] * 3, name="str") - value = [1, 2, 3, 4, 5, 6] - - idx = MultiIndex.from_arrays([idx1, idx2]) - s = Series(value, index=idx) - - result1 = s.unstack() - result2 = s.unstack(level=1) - result3 = s.unstack(level=0) - - e_idx = pd.PeriodIndex( - ["2013-01", "2013-02", "2013-03"], freq="M", name="period" - ) - expected = DataFrame( - {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"] - ) - expected.columns.name = "str" - - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected.T) - - idx1 = pd.PeriodIndex( - ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], - freq="M", - name="period1", - ) - - idx2 = pd.PeriodIndex( - ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"], - freq="M", - name="period2", - ) - idx = MultiIndex.from_arrays([idx1, idx2]) - s = Series(value, index=idx) - - result1 = s.unstack() - result2 = s.unstack(level=1) - result3 = s.unstack(level=0) - - e_idx = pd.PeriodIndex( - ["2013-01", "2013-02", "2013-03"], freq="M", name="period1" - ) - e_cols = pd.PeriodIndex( - ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"], - freq="M", - name="period2", - ) - expected = DataFrame( - [ - [np.nan, np.nan, np.nan, np.nan, 2, 1], - [np.nan, np.nan, 4, 3, np.nan, np.nan], - [6, 5, np.nan, np.nan, np.nan, np.nan], - ], - index=e_idx, - columns=e_cols, - ) - - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected.T) - - def test_unstack_period_frame(self): - # GH 4342 - idx1 = pd.PeriodIndex( - ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"], - freq="M", - name="period1", - ) - idx2 = pd.PeriodIndex( - ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"], - freq="M", - name="period2", - ) - value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]} - idx = MultiIndex.from_arrays([idx1, idx2]) - df = DataFrame(value, index=idx) - - result1 = df.unstack() - result2 = df.unstack(level=1) - result3 = df.unstack(level=0) - - e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1") - e_2 = pd.PeriodIndex( - ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"], - freq="M", - name="period2", - ) - e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2]) - expected = DataFrame( - [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols - ) - - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - e_1 = pd.PeriodIndex( - 
["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1" - ) - e_2 = pd.PeriodIndex( - ["2013-10", "2013-12", "2014-02"], freq="M", name="period2" - ) - e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1]) - expected = DataFrame( - [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols - ) - - tm.assert_frame_equal(result3, expected) - - def test_stack_multiple_bug(self): - """ bug when some uniques are not present in the data #3170""" - id_col = ([1] * 3) + ([2] * 3) - name = (["a"] * 3) + (["b"] * 3) - date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2) - var1 = np.random.randint(0, 100, 6) - df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1)) - - multi = df.set_index(["DATE", "ID"]) - multi.columns.name = "Params" - unst = multi.unstack("ID") - down = unst.resample("W-THU").mean() - - rs = down.stack("ID") - xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") - xp.columns.name = "Params" - tm.assert_frame_equal(rs, xp) - - def test_stack_dropna(self): - # GH #3997 - df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) - df = df.set_index(["A", "B"]) - - stacked = df.unstack().stack(dropna=False) - assert len(stacked) > len(stacked.dropna()) - - stacked = df.unstack().stack(dropna=True) - tm.assert_frame_equal(stacked, stacked.dropna()) + def test_groupby_transform(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data - def test_unstack_multiple_hierarchical(self): - df = DataFrame( - index=[ - [0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], - [0, 1, 0, 1, 0, 1, 0, 1], - ], - columns=[[0, 0, 1, 1], [0, 1, 0, 1]], - ) - - df.index.names = ["a", "b", "c"] - df.columns.names = ["d", "e"] - - # it works! - df.unstack(["b", "c"]) - - def test_groupby_transform(self): - s = self.frame["A"] + s = frame["A"] grouper = s.index.get_level_values(0) grouped = s.groupby(grouper) @@ -794,109 +106,6 @@ def test_groupby_transform(self): result = applied.reindex(expected.index) tm.assert_series_equal(result, expected, check_names=False) - def test_unstack_sparse_keyspace(self): - # memory problems with naive impl #2278 - # Generate Long File & Test Pivot - NUM_ROWS = 1000 - - df = DataFrame( - { - "A": np.random.randint(100, size=NUM_ROWS), - "B": np.random.randint(300, size=NUM_ROWS), - "C": np.random.randint(-7, 7, size=NUM_ROWS), - "D": np.random.randint(-19, 19, size=NUM_ROWS), - "E": np.random.randint(3000, size=NUM_ROWS), - "F": np.random.randn(NUM_ROWS), - } - ) - - idf = df.set_index(["A", "B", "C", "D", "E"]) - - # it works! 
is sufficient - idf.unstack("E") - - def test_unstack_unobserved_keys(self): - # related to #2278 refactoring - levels = [[0, 1], [0, 1, 2, 3]] - codes = [[0, 0, 1, 1], [0, 2, 0, 2]] - - index = MultiIndex(levels, codes) - - df = DataFrame(np.random.randn(4, 2), index=index) - - result = df.unstack() - assert len(result.columns) == 4 - - recons = result.stack() - tm.assert_frame_equal(recons, df) - - @pytest.mark.slow - def test_unstack_number_of_levels_larger_than_int32(self): - # GH 20601 - df = DataFrame( - np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)] - ) - with pytest.raises(ValueError, match="int32 overflow"): - df.unstack() - - def test_stack_order_with_unsorted_levels(self): - # GH 16323 - - def manual_compare_stacked(df, df_stacked, lev0, lev1): - assert all( - df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]] - for row in df.index - for col in df.columns - ) - - # deep check for 1-row case - for width in [2, 3]: - levels_poss = itertools.product( - itertools.permutations([0, 1, 2], width), repeat=2 - ) - - for levels in levels_poss: - columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - df = DataFrame(columns=columns, data=[range(4)]) - for stack_lev in range(2): - df_stacked = df.stack(stack_lev) - manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev) - - # check multi-row case - mi = MultiIndex( - levels=[["A", "C", "B"], ["B", "A", "C"]], - codes=[np.repeat(range(3), 3), np.tile(range(3), 3)], - ) - df = DataFrame( - columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1) - ) - manual_compare_stacked(df, df.stack(0), 0, 1) - - def test_stack_unstack_unordered_multiindex(self): - # GH 18265 - values = np.arange(5) - data = np.vstack( - [ - [f"b{x}" for x in values], # b0, b1, .. - [f"a{x}" for x in values], # a0, a1, .. - ] - ) - df = pd.DataFrame(data.T, columns=["b", "a"]) - df.columns.name = "first" - second_level_dict = {"x": df} - multi_level_df = pd.concat(second_level_dict, axis=1) - multi_level_df.columns.names = ["second", "first"] - df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1) - result = df.stack(["first", "second"]).unstack(["first", "second"]) - expected = DataFrame( - [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], - index=[0, 1, 2, 3, 4], - columns=MultiIndex.from_tuples( - [("a", "x"), ("b", "x")], names=["first", "second"] - ), - ) - tm.assert_frame_equal(result, expected) - def test_groupby_corner(self): midx = MultiIndex( levels=[["foo"], ["bar"], ["baz"]], @@ -926,49 +135,10 @@ def test_groupby_level_no_obs(self): result = grouped.sum() assert (result.columns == ["f2", "f3"]).all() - def test_join(self): - a = self.frame.loc[self.frame.index[:5], ["A"]] - b = self.frame.loc[self.frame.index[2:], ["B", "C"]] - - joined = a.join(b, how="outer").reindex(self.frame.index) - expected = self.frame.copy() - expected.values[np.isnan(joined.values)] = np.nan - - assert not np.isnan(joined.values).all() - - # TODO what should join do with names ? 
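The removed test_join above exercised DataFrame.join(..., how="outer") aligning two frames that share a MultiIndex. A minimal standalone sketch of that alignment behaviour; the index and column names here are illustrative:

import pandas as pd

idx = pd.MultiIndex.from_product([["foo", "bar"], ["one", "two"]])
a = pd.DataFrame({"A": range(4)}, index=idx).iloc[:3]
b = pd.DataFrame({"B": range(4)}, index=idx).iloc[1:]

# the outer join takes the union of both indexes; unmatched cells become NaN
joined = a.join(b, how="outer")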
- tm.assert_frame_equal(joined, expected, check_names=False) - - def test_swaplevel(self): - swapped = self.frame["A"].swaplevel() - swapped2 = self.frame["A"].swaplevel(0) - swapped3 = self.frame["A"].swaplevel(0, 1) - swapped4 = self.frame["A"].swaplevel("first", "second") - assert not swapped.index.equals(self.frame.index) - tm.assert_series_equal(swapped, swapped2) - tm.assert_series_equal(swapped, swapped3) - tm.assert_series_equal(swapped, swapped4) - - back = swapped.swaplevel() - back2 = swapped.swaplevel(0) - back3 = swapped.swaplevel(0, 1) - back4 = swapped.swaplevel("second", "first") - assert back.index.equals(self.frame.index) - tm.assert_series_equal(back, back2) - tm.assert_series_equal(back, back3) - tm.assert_series_equal(back, back4) - - ft = self.frame.T - swapped = ft.swaplevel("first", "second", axis=1) - exp = self.frame.swaplevel("first", "second").T - tm.assert_frame_equal(swapped, exp) - - msg = "Can only swap levels on a hierarchical axis." - with pytest.raises(TypeError, match=msg): - DataFrame(range(3)).swaplevel() - - def test_insert_index(self): - df = self.ymd[:5].T + def test_insert_index(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + df = ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] assert isinstance(df.columns, MultiIndex) assert (df[2000, 1, 10] == df[2000, 1, 7]).all() @@ -993,45 +163,20 @@ def test_alignment(self): exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) - def test_count(self): - frame = self.frame.copy() - frame.index.names = ["a", "b"] - - result = frame.count(level="b") - expect = self.frame.count(level=1) - tm.assert_frame_equal(result, expect, check_names=False) - - result = frame.count(level="a") - expect = self.frame.count(level=0) - tm.assert_frame_equal(result, expect, check_names=False) - - series = self.series.copy() - series.index.names = ["a", "b"] - - result = series.count(level="b") - expect = self.series.count(level=1).rename_axis("b") - tm.assert_series_equal(result, expect) - - result = series.count(level="a") - expect = self.series.count(level=0).rename_axis("a") - tm.assert_series_equal(result, expect) - - msg = "Level x not found" - with pytest.raises(KeyError, match=msg): - series.count("x") - with pytest.raises(KeyError, match=msg): - frame.count(level="x") - @pytest.mark.parametrize("op", AGG_FUNCTIONS) @pytest.mark.parametrize("level", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) - def test_series_group_min_max(self, op, level, skipna, sort): + def test_series_group_min_max( + self, op, level, skipna, sort, series_with_multilevel_index + ): # GH 17537 - grouped = self.series.groupby(level=level, sort=sort) + ser = series_with_multilevel_index + + grouped = ser.groupby(level=level, sort=sort) # skipna=True leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna)) - rightside = getattr(self.series, op)(level=level, skipna=skipna) + rightside = getattr(ser, op)(level=level, skipna=skipna) if sort: rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) @@ -1041,17 +186,21 @@ def test_series_group_min_max(self, op, level, skipna, sort): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) - def test_frame_group_ops(self, op, level, axis, skipna, sort): + def test_frame_group_ops( + self, op, level, axis, skipna, sort, 
multiindex_dataframe_random_data + ): # GH 17537 - self.frame.iloc[1, [1, 2]] = np.nan - self.frame.iloc[7, [0, 1]] = np.nan + frame = multiindex_dataframe_random_data + + frame.iloc[1, [1, 2]] = np.nan + frame.iloc[7, [0, 1]] = np.nan - level_name = self.frame.index.names[level] + level_name = frame.index.names[level] if axis == 0: - frame = self.frame + frame = frame else: - frame = self.frame.T + frame = frame.T grouped = frame.groupby(level=level, axis=axis, sort=sort) @@ -1075,47 +224,6 @@ def aggf(x): tm.assert_frame_equal(leftside, rightside) - def test_stat_op_corner(self): - obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) - - result = obj.sum(level=0) - expected = Series([10.0], index=[2]) - tm.assert_series_equal(result, expected) - - def test_frame_any_all_group(self): - df = DataFrame( - {"data": [False, False, True, False, True, False, True]}, - index=[ - ["one", "one", "two", "one", "two", "two", "two"], - [0, 1, 0, 2, 1, 2, 3], - ], - ) - - result = df.any(level=0) - ex = DataFrame({"data": [False, True]}, index=["one", "two"]) - tm.assert_frame_equal(result, ex) - - result = df.all(level=0) - ex = DataFrame({"data": [False, False]}, index=["one", "two"]) - tm.assert_frame_equal(result, ex) - - def test_series_any_timedelta(self): - # GH 17667 - df = DataFrame( - { - "a": Series([0, 0]), - "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]), - } - ) - - result = df.any(axis=0) - expected = Series(data=[False, True], index=["a", "t"]) - tm.assert_series_equal(result, expected) - - result = df.any(axis=1) - expected = Series(data=[False, True]) - tm.assert_series_equal(result, expected) - def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays( [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] @@ -1134,28 +242,33 @@ def test_std_var_pass_ddof(self): expected = df.groupby(level=0).agg(alt) tm.assert_frame_equal(result, expected) - def test_frame_series_agg_multiple_levels(self): - result = self.ymd.sum(level=["year", "month"]) - expected = self.ymd.groupby(level=["year", "month"]).sum() - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_agg_multiple_levels( + self, multiindex_year_month_day_dataframe_random_data, klass + ): + ymd = multiindex_year_month_day_dataframe_random_data + if klass is Series: + ymd = ymd["A"] - result = self.ymd["A"].sum(level=["year", "month"]) - expected = self.ymd["A"].groupby(level=["year", "month"]).sum() - tm.assert_series_equal(result, expected) + result = ymd.sum(level=["year", "month"]) + expected = ymd.groupby(level=["year", "month"]).sum() + tm.assert_equal(result, expected) - def test_groupby_multilevel(self): - result = self.ymd.groupby(level=[0, 1]).mean() + def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data - k1 = self.ymd.index.get_level_values(0) - k2 = self.ymd.index.get_level_values(1) + result = ymd.groupby(level=[0, 1]).mean() - expected = self.ymd.groupby([k1, k2]).mean() + k1 = ymd.index.get_level_values(0) + k2 = ymd.index.get_level_values(1) + + expected = ymd.groupby([k1, k2]).mean() # TODO groupby with level_values drops names tm.assert_frame_equal(result, expected, check_names=False) - assert result.index.names == self.ymd.index.names[:2] + assert result.index.names == ymd.index.names[:2] - result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() + result2 = ymd.groupby(level=ymd.index.names[:2]).mean() tm.assert_frame_equal(result, 
result2) def test_groupby_multilevel_with_transform(self): @@ -1169,69 +282,6 @@ def test_multilevel_consolidate(self): df["Totals", ""] = df.sum(1) df = df._consolidate() - def test_loc_preserve_names(self): - result = self.ymd.loc[2000] - result2 = self.ymd["A"].loc[2000] - assert result.index.names == self.ymd.index.names[1:] - assert result2.index.names == self.ymd.index.names[1:] - - result = self.ymd.loc[2000, 2] - result2 = self.ymd["A"].loc[2000, 2] - assert result.index.name == self.ymd.index.names[2] - assert result2.index.name == self.ymd.index.names[2] - - def test_unstack_preserve_types(self): - # GH #403 - self.ymd["E"] = "foo" - self.ymd["F"] = 2 - - unstacked = self.ymd.unstack("month") - assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ - assert unstacked["F", 1].dtype == np.float64 - - def test_unstack_group_index_overflow(self): - codes = np.tile(np.arange(500), 2) - level = np.arange(500) - - index = MultiIndex( - levels=[level] * 8 + [[0, 1]], - codes=[codes] * 8 + [np.arange(2).repeat(500)], - ) - - s = Series(np.arange(1000), index=index) - result = s.unstack() - assert result.shape == (500, 2) - - # test roundtrip - stacked = result.stack() - tm.assert_series_equal(s, stacked.reindex(s.index)) - - # put it at beginning - index = MultiIndex( - levels=[[0, 1]] + [level] * 8, - codes=[np.arange(2).repeat(500)] + [codes] * 8, - ) - - s = Series(np.arange(1000), index=index) - result = s.unstack(0) - assert result.shape == (500, 2) - - # put it in middle - index = MultiIndex( - levels=[level] * 4 + [[0, 1]] + [level] * 4, - codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4), - ) - - s = Series(np.arange(1000), index=index) - result = s.unstack(4) - assert result.shape == (500, 2) - - def test_to_html(self): - self.ymd.columns.name = "foo" - self.ymd.to_html() - self.ymd.T.to_html() - def test_level_with_tuples(self): index = MultiIndex( levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], @@ -1280,46 +330,23 @@ def test_level_with_tuples(self): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - def test_mixed_depth_pop(self): - arrays = [ - ["a", "top", "top", "routine1", "routine1", "routine2"], - ["", "OD", "OD", "result1", "result2", "result1"], - ["", "wx", "wy", "", "", ""], - ] + def test_reindex_level_partial_selection(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - df1 = df.copy() - df2 = df.copy() - result = df1.pop("a") - expected = df2.pop(("a", "", "")) - tm.assert_series_equal(expected, result, check_names=False) - tm.assert_frame_equal(df1, df2) - assert result.name == "a" - - expected = df1["top"] - df1 = df1.drop(["top"], axis=1) - result = df2.pop("top") - tm.assert_frame_equal(expected, result) - tm.assert_frame_equal(df1, df2) - - def test_reindex_level_partial_selection(self): - result = self.frame.reindex(["foo", "qux"], level=0) - expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]] + result = frame.reindex(["foo", "qux"], level=0) + expected = frame.iloc[[0, 1, 2, 7, 8, 9]] tm.assert_frame_equal(result, expected) - result = self.frame.T.reindex(["foo", "qux"], axis=1, level=0) + result = frame.T.reindex(["foo", "qux"], axis=1, level=0) tm.assert_frame_equal(result, expected.T) - result = self.frame.loc[["foo", "qux"]] + result = frame.loc[["foo", "qux"]] tm.assert_frame_equal(result, 
expected) - result = self.frame["A"].loc[["foo", "qux"]] + result = frame["A"].loc[["foo", "qux"]] tm.assert_series_equal(result, expected["A"]) - result = self.frame.T.loc[:, ["foo", "qux"]] + result = frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) def test_unicode_repr_level_names(self): @@ -1330,29 +357,6 @@ def test_unicode_repr_level_names(self): repr(s) repr(df) - def test_join_segfault(self): - # 1532 - df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) - df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) - df1 = df1.set_index(["a", "b"]) - df2 = df2.set_index(["a", "b"]) - # it works! - for how in ["left", "right", "outer"]: - df1.join(df2, how=how) - - def test_frame_dict_constructor_empty_series(self): - s1 = Series( - [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) - ) - s2 = Series( - [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) - ) - s3 = Series(dtype=object) - - # it works! - DataFrame({"foo": s1, "bar": s2, "baz": s3}) - DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) - @pytest.mark.parametrize("d", [4, "d"]) def test_empty_frame_groupby_dtypes_consistency(self, d): # GH 20888 @@ -1367,37 +371,6 @@ def test_empty_frame_groupby_dtypes_consistency(self, d): tm.assert_index_equal(result, expected) - def test_multiindex_na_repr(self): - # only an issue with long columns - df3 = DataFrame( - { - "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, - "B" * 30: {("A", "A0006000", "nuit"): np.nan}, - "C" * 30: {("A", "A0006000", "nuit"): np.nan}, - "D" * 30: {("A", "A0006000", "nuit"): np.nan}, - "E" * 30: {("A", "A0006000", "nuit"): "A"}, - "F" * 30: {("A", "A0006000", "nuit"): np.nan}, - } - ) - - idf = df3.set_index(["A" * 30, "C" * 30]) - repr(idf) - - def test_assign_index_sequences(self): - # #2200 - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( - ["a", "b"] - ) - index = list(df.index) - index[0] = ("faz", "boo") - df.index = index - repr(df) - - # this travels an improper code path - index[0] = ["faz", "boo"] - df.index = index - repr(df) - def test_duplicate_groupby_issues(self): idx_tp = [ ("600809", "20061231"), @@ -1435,300 +408,6 @@ def test_duplicate_mi(self): result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) - def test_multiindex_set_index(self): - # segfault in #3308 - d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} - df = DataFrame(d) - tuples = [(0, 1), (0, 2), (1, 2)] - df["tuples"] = tuples - - index = MultiIndex.from_tuples(df["tuples"]) - # it works! 
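The removed test_multiindex_set_index guards against a segfault (GH 3308) when attaching a MultiIndex built from a column of tuples. A standalone sketch of the construction, mirroring the removed test's data:

import pandas as pd

df = pd.DataFrame({"t1": [2, 2.5, 3], "t2": [4, 5, 6]})
df["tuples"] = [(0, 1), (0, 2), (1, 2)]

# build a MultiIndex from the tuple column and attach it as the index
index = pd.MultiIndex.from_tuples(df["tuples"])
result = df.set_index(index)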
- df.set_index(index) - - def test_set_index_datetime(self): - # GH 3950 - df = DataFrame( - { - "label": ["a", "a", "a", "b", "b", "b"], - "datetime": [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ], - "value": range(6), - } - ) - df.index = pd.to_datetime(df.pop("datetime"), utc=True) - df.index = df.index.tz_convert("US/Pacific") - - expected = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - name="datetime", - ) - expected = expected.tz_localize("UTC").tz_convert("US/Pacific") - - df = df.set_index("label", append=True) - tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) - assert df.index.names == ["datetime", "label"] - - df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) - tm.assert_index_equal(df.index.levels[1], expected) - assert df.index.names == ["label", "datetime"] - - df = DataFrame(np.random.random(6)) - idx1 = pd.DatetimeIndex( - [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ], - tz="US/Eastern", - ) - idx2 = pd.DatetimeIndex( - [ - "2012-04-01 09:00", - "2012-04-01 09:00", - "2012-04-01 09:00", - "2012-04-02 09:00", - "2012-04-02 09:00", - "2012-04-02 09:00", - ], - tz="US/Eastern", - ) - idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") - idx3 = idx3._with_freq(None) - - df = df.set_index(idx1) - df = df.set_index(idx2, append=True) - df = df.set_index(idx3, append=True) - - expected1 = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - tz="US/Eastern", - ) - expected2 = pd.DatetimeIndex( - ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" - ) - - tm.assert_index_equal(df.index.levels[0], expected1) - tm.assert_index_equal(df.index.levels[1], expected2) - tm.assert_index_equal(df.index.levels[2], idx3) - - # GH 7092 - tm.assert_index_equal(df.index.get_level_values(0), idx1) - tm.assert_index_equal(df.index.get_level_values(1), idx2) - tm.assert_index_equal(df.index.get_level_values(2), idx3) - - def test_reset_index_datetime(self): - # GH 3950 - for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]: - idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") - idx2 = Index(range(5), name="idx2", dtype="int64") - idx = MultiIndex.from_arrays([idx1, idx2]) - df = DataFrame( - {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, - index=idx, - ) - - expected = DataFrame( - { - "idx1": [ - datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - datetime.datetime(2011, 1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5), - ], - "idx2": np.arange(5, dtype="int64"), - "a": np.arange(5, dtype="int64"), - "b": ["A", "B", "C", "D", "E"], - }, - columns=["idx1", "idx2", "a", "b"], - ) - expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) - - tm.assert_frame_equal(df.reset_index(), expected) - - idx3 = pd.date_range( - "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" - ) - idx = MultiIndex.from_arrays([idx1, idx2, idx3]) - df = DataFrame( - {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, - index=idx, - ) - - expected = DataFrame( - { - "idx1": [ - datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - 
datetime.datetime(2011, 1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5), - ], - "idx2": np.arange(5, dtype="int64"), - "idx3": [ - datetime.datetime(2012, 1, 1), - datetime.datetime(2012, 2, 1), - datetime.datetime(2012, 3, 1), - datetime.datetime(2012, 4, 1), - datetime.datetime(2012, 5, 1), - ], - "a": np.arange(5, dtype="int64"), - "b": ["A", "B", "C", "D", "E"], - }, - columns=["idx1", "idx2", "idx3", "a", "b"], - ) - expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) - expected["idx3"] = expected["idx3"].apply( - lambda d: Timestamp(d, tz="Europe/Paris") - ) - tm.assert_frame_equal(df.reset_index(), expected) - - # GH 7793 - idx = MultiIndex.from_product( - [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)] - ) - df = DataFrame( - np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx - ) - - expected = DataFrame( - { - "level_0": "a a a b b b".split(), - "level_1": [ - datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 2), - datetime.datetime(2013, 1, 3), - ] - * 2, - "a": np.arange(6, dtype="int64"), - }, - columns=["level_0", "level_1", "a"], - ) - expected["level_1"] = expected["level_1"].apply( - lambda d: Timestamp(d, freq="D", tz=tz) - ) - tm.assert_frame_equal(df.reset_index(), expected) - - def test_reset_index_period(self): - # GH 7746 - idx = MultiIndex.from_product( - [pd.period_range("20130101", periods=3, freq="M"), list("abc")], - names=["month", "feature"], - ) - - df = DataFrame( - np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] - ) - expected = DataFrame( - { - "month": ( - [pd.Period("2013-01", freq="M")] * 3 - + [pd.Period("2013-02", freq="M")] * 3 - + [pd.Period("2013-03", freq="M")] * 3 - ), - "feature": ["a", "b", "c"] * 3, - "a": np.arange(9, dtype="int64"), - }, - columns=["month", "feature", "a"], - ) - tm.assert_frame_equal(df.reset_index(), expected) - - def test_reset_index_multiindex_columns(self): - levels = [["A", ""], ["B", "b"]] - df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) - result = df[["B"]].rename_axis("A").reset_index() - tm.assert_frame_equal(result, df) - - # gh-16120: already existing column - msg = r"cannot insert \('A', ''\), already exists" - with pytest.raises(ValueError, match=msg): - df.rename_axis("A").reset_index() - - # gh-16164: multiindex (tuple) full key - result = df.set_index([("A", "")]).reset_index() - tm.assert_frame_equal(result, df) - - # with additional (unnamed) index level - idx_col = DataFrame( - [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) - ) - expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) - result = df.set_index([("B", "b")], append=True).reset_index() - tm.assert_frame_equal(result, expected) - - # with index name which is a too long tuple... - msg = "Item must have length equal to number of levels." - with pytest.raises(ValueError, match=msg): - df.rename_axis([("C", "c", "i")]).reset_index() - - # or too short... - levels = [["A", "a", ""], ["B", "b", "i"]] - df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) - idx_col = DataFrame( - [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) - ) - expected = pd.concat([idx_col, df2], axis=1) - result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") - tm.assert_frame_equal(result, expected) - - # ... 
which is incompatible with col_fill=None - with pytest.raises( - ValueError, - match=( - "col_fill=None is incompatible with " - r"incomplete column name \('C', 'c'\)" - ), - ): - df2.rename_axis([("C", "c")]).reset_index(col_fill=None) - - # with col_level != 0 - result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") - tm.assert_frame_equal(result, expected) - - def test_set_index_period(self): - # GH 6631 - df = DataFrame(np.random.random(6)) - idx1 = pd.period_range("2011-01-01", periods=3, freq="M") - idx1 = idx1.append(idx1) - idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") - idx2 = idx2.append(idx2).append(idx2) - idx3 = pd.period_range("2005", periods=6, freq="A") - - df = df.set_index(idx1) - df = df.set_index(idx2, append=True) - df = df.set_index(idx3, append=True) - - expected1 = pd.period_range("2011-01-01", periods=3, freq="M") - expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") - - tm.assert_index_equal(df.index.levels[0], expected1) - tm.assert_index_equal(df.index.levels[1], expected2) - tm.assert_index_equal(df.index.levels[2], idx3) - - tm.assert_index_equal(df.index.get_level_values(0), idx1) - tm.assert_index_equal(df.index.get_level_values(1), idx2) - tm.assert_index_equal(df.index.get_level_values(2), idx3) - - def test_repeat(self): - # GH 9361 - # fixed by # GH 7891 - m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) - data = ["a", "b", "c", "d"] - m_df = Series(data, index=m_idx) - assert m_df.repeat(3).shape == (3 * len(data),) - def test_subsets_multiindex_dtype(self): # GH 20757 data = [["x", 1]] @@ -1739,70 +418,9 @@ def test_subsets_multiindex_dtype(self): tm.assert_series_equal(result, expected) -class TestSorted(Base): +class TestSorted: """ everything you wanted to test about sorting """ - def test_sort_index_preserve_levels(self): - result = self.frame.sort_index() - assert result.index.names == self.frame.index.names - - def test_sorting_repr_8017(self): - - np.random.seed(0) - data = np.random.randn(3, 4) - - for gen, extra in [ - ([1.0, 3.0, 2.0, 5.0], 4.0), - ([1, 3, 2, 5], 4), - ( - [ - Timestamp("20130101"), - Timestamp("20130103"), - Timestamp("20130102"), - Timestamp("20130105"), - ], - Timestamp("20130104"), - ), - (["1one", "3one", "2one", "5one"], "4one"), - ]: - columns = MultiIndex.from_tuples([("red", i) for i in gen]) - df = DataFrame(data, index=list("def"), columns=columns) - df2 = pd.concat( - [ - df, - DataFrame( - "world", - index=list("def"), - columns=MultiIndex.from_tuples([("red", extra)]), - ), - ], - axis=1, - ) - - # check that the repr is good - # make sure that we have a correct sparsified repr - # e.g. 
only 1 header of read - assert str(df2).splitlines()[0].split() == ["red"] - - # GH 8017 - # sorting fails after columns added - - # construct single-dtype then sort - result = df.copy().sort_index(axis=1) - expected = df.iloc[:, [0, 2, 1, 3]] - tm.assert_frame_equal(result, expected) - - result = df2.sort_index(axis=1) - expected = df2.iloc[:, [0, 2, 1, 4, 3]] - tm.assert_frame_equal(result, expected) - - # setitem then sort - result = df.copy() - result[("red", extra)] = "world" - - result = result.sort_index(axis=1) - tm.assert_frame_equal(result, expected) - def test_sort_non_lexsorted(self): # degenerate case where we sort but don't # have a satisfying result :< @@ -1828,29 +446,3 @@ def test_sort_non_lexsorted(self): ) result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "keys, expected", - [ - (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), - (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), - ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), - ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), - ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), - ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), - ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), - ], - ) - @pytest.mark.parametrize("dim", ["index", "columns"]) - def test_multilevel_index_loc_order(self, dim, keys, expected): - # GH 22797 - # Try to respect order of keys given for MultiIndex.loc - kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} - df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs) - exp_index = MultiIndex.from_arrays(expected) - if dim == "index": - res = df.loc[keys, :] - tm.assert_index_equal(res.index, exp_index) - elif dim == "columns": - res = df.loc[:, keys] - tm.assert_index_equal(res.columns, exp_index) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index c45e4508c6153..458fba2e13b0f 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1005,6 +1005,23 @@ def test_nanmean(self, tz): result = nanops.nanmean(obj) assert result == expected + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_nanmean_skipna_false(self, dtype): + arr = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3) + + arr[-1, -1] = "NaT" + + result = nanops.nanmean(arr, skipna=False) + assert result is pd.NaT + + result = nanops.nanmean(arr, axis=0, skipna=False) + expected = np.array([4, 5, "NaT"], dtype=arr.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = nanops.nanmean(arr, axis=1, skipna=False) + expected = np.array([arr[0, 1], arr[1, 1], arr[2, 1], arr[-1, -1]]) + tm.assert_numpy_array_equal(result, expected) + def test_use_bottleneck(): @@ -1036,7 +1053,7 @@ def test_use_bottleneck(): ) def test_numpy_ops(numpy_op, expected): # GH8383 - result = numpy_op(pd.Series([1, 2, 3, 4])) + result = numpy_op(Series([1, 2, 3, 4])) assert result == expected @@ -1062,7 +1079,7 @@ def test_numpy_ops(numpy_op, expected): ) def test_nanops_independent_of_mask_param(operation): # GH22764 - s = pd.Series([1, 2, np.nan, 3, np.nan, 4]) + s = Series([1, 2, np.nan, 3, np.nan, 4]) mask = s.isna() median_expected = operation(s) median_result = operation(s, mask=mask) diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index d839936f731a3..6e224245076ee 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -4,6 +4,22 @@ import pandas as pd 
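# Editor's sketch (not part of the patch) for the test_dirname_mixin addition
# just below: a bare class-level annotation ("y: int") never creates an
# attribute, so it must not surface in dir() output, while class and instance
# attributes do.
class X:
    x = 1
    y: int  # annotation only; no attribute is actually created

    def __init__(self):
        self.z = 3

print([n for n in dir(X()) if not n.startswith("_")])  # ['x', 'z']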
import pandas._testing as tm +from pandas.core import accessor + + +def test_dirname_mixin(): + # GH37173 + + class X(accessor.DirNamesMixin): + x = 1 + y: int + + def __init__(self): + self.z = 3 + + result = [attr_name for attr_name in dir(X()) if not attr_name.startswith("_")] + + assert result == ["x", "z"] @contextlib.contextmanager diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 61df5d4d5fdd6..9be5abb9dda65 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -132,7 +132,7 @@ def any_string_method(request): Examples -------- >>> def test_something(any_string_method): - ... s = pd.Series(['a', 'b', np.nan, 'd']) + ... s = Series(['a', 'b', np.nan, 'd']) ... ... method_name, args, kwargs = any_string_method ... method = getattr(s.str, method_name) @@ -183,7 +183,7 @@ def any_allowed_skipna_inferred_dtype(request): ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype ... ... # constructor for .str-accessor will also pass - ... pd.Series(values).str + ... Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -2546,8 +2546,8 @@ def test_split(self): @pytest.mark.parametrize("dtype", [object, "string"]) @pytest.mark.parametrize("method", ["split", "rsplit"]) def test_split_n(self, dtype, method): - s = pd.Series(["a b", pd.NA, "b c"], dtype=dtype) - expected = pd.Series([["a", "b"], pd.NA, ["b", "c"]]) + s = Series(["a b", pd.NA, "b c"], dtype=dtype) + expected = Series([["a", "b"], pd.NA, ["b", "c"]]) result = getattr(s.str, method)(" ", n=None) tm.assert_series_equal(result, expected) @@ -3653,14 +3653,14 @@ def test_string_array_extract(): @pytest.mark.parametrize("klass", [tuple, list, np.array, pd.Series, pd.Index]) def test_cat_different_classes(klass): # https://github.com/pandas-dev/pandas/issues/33425 - s = pd.Series(["a", "b", "c"]) + s = Series(["a", "b", "c"]) result = s.str.cat(klass(["x", "y", "z"])) - expected = pd.Series(["ax", "by", "cz"]) + expected = Series(["ax", "by", "cz"]) tm.assert_series_equal(result, expected) def test_str_get_stringarray_multiple_nans(): - s = pd.Series(pd.array(["a", "ab", pd.NA, "abc"])) + s = Series(pd.array(["a", "ab", pd.NA, "abc"])) result = s.str.get(2) - expected = pd.Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) + expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 63f9a2532fa73..13c8987c66977 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -320,7 +320,7 @@ def test_to_datetime_format_weeks(self, cache): def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # GH 13486 result = pd.to_datetime(dates, format=fmt) - expected = pd.Index(expected_dates) + expected = Index(expected_dates) tm.assert_equal(result, expected) def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): @@ -357,7 +357,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset): def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 fmt = "%Y-%m-%d %H:%M:%S %z" - arg = pd.Index(["2010-01-01 12:00:00 Z"], name="foo") + arg = Index(["2010-01-01 12:00:00 Z"], name="foo") result = pd.to_datetime(arg, format=fmt) expected = pd.DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") tm.assert_index_equal(result, expected) @@ -613,7 +613,7 @@ def 
test_to_datetime_array_of_dt64s(self, cache, unit): # numpy is either a python datetime.datetime or datetime.date tm.assert_index_equal( pd.to_datetime(dts_with_oob, errors="ignore", cache=cache), - pd.Index([dt.item() for dt in dts_with_oob]), + Index([dt.item() for dt in dts_with_oob]), ) @pytest.mark.parametrize("cache", [True, False]) @@ -650,7 +650,7 @@ def test_to_datetime_different_offsets(self, cache): ts_string_1 = "March 1, 2018 12:00:00+0400" ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 - expected = pd.Index([parse(x) for x in arr]) + expected = Index([parse(x) for x in arr]) result = pd.to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) @@ -714,17 +714,17 @@ def test_to_datetime_utc_true( def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 - result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(ts, tz="utc")]) + result = pd.to_datetime(Series([ts]), utc=True, cache=cache) + expected = Series([pd.Timestamp(ts, tz="utc")]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): ts = "2013-01-01 00:00:00-01:00" expected_ts = "2013-01-01 01:00:00" - data = pd.Series([ts] * 3) + data = Series([ts] * 3) result = pd.to_datetime(data, utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(expected_ts, tz="utc")] * 3) + expected = Series([pd.Timestamp(expected_ts, tz="utc")] * 3) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("cache", [True, False]) @@ -736,8 +736,8 @@ def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): ], ) def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): - expected = pd.Series([pd.Timestamp("2013-01-01 01:00:00", tz="UTC")]) - result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, cache=cache) + expected = Series([pd.Timestamp("2013-01-01 01:00:00", tz="UTC")]) + result = pd.to_datetime(Series([date], dtype=dtype), utc=True, cache=cache) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("cache", [True, False]) @@ -876,7 +876,7 @@ def test_datetime_invalid_index(self, values, format, infer): res = pd.to_datetime( values, errors="ignore", format=format, infer_datetime_format=infer ) - tm.assert_index_equal(res, pd.Index(values)) + tm.assert_index_equal(res, Index(values)) res = pd.to_datetime( values, errors="coerce", format=format, infer_datetime_format=infer @@ -895,7 +895,7 @@ def test_datetime_invalid_index(self, values, format, infer): @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) - @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index, deque]) + @pytest.mark.parametrize("constructor", [list, tuple, np.array, Index, deque]) def test_to_datetime_cache(self, utc, format, constructor): date = "20130101 00:00:00" test_dates = [date] * 10 ** 5 @@ -929,7 +929,7 @@ def test_to_datetime_from_deque(self): def test_to_datetime_cache_series(self, utc, format): date = "20130101 00:00:00" test_dates = [date] * 10 ** 5 - data = pd.Series(test_dates) + data = Series(test_dates) result = pd.to_datetime(data, utc=utc, format=format, cache=True) expected = pd.to_datetime(data, utc=utc, format=format, cache=False) tm.assert_series_equal(result, expected) @@ -1049,7 +1049,7 @@ def test_iso8601_strings_mixed_offsets_with_naive(self): 
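# Editor's sketch (not part of the patch) of the utc=True conversion the
# tests in this region exercise: tz-aware strings are normalized to UTC,
# element by element.
import pandas as pd

data = pd.Series(["2013-01-01 00:00:00-01:00"] * 3)
print(pd.to_datetime(data, utc=True))  # dtype: datetime64[ns, UTC]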
def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 - s = pd.Series( + s = Series( [ "nan", pd.Timestamp("1990-01-01"), @@ -1246,7 +1246,7 @@ def test_unit_rounding(self, cache): @pytest.mark.parametrize("cache", [True, False]) def test_unit_ignore_keeps_name(self, cache): # GH 21697 - expected = pd.Index([15e9] * 2, name="name") + expected = Index([15e9] * 2, name="name") result = pd.to_datetime(expected, errors="ignore", unit="s", cache=cache) tm.assert_index_equal(result, expected) @@ -1418,9 +1418,9 @@ def test_dataframe_dtypes(self, cache): def test_dataframe_utc_true(self): # GH 23760 - df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) + df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = pd.to_datetime(df, utc=True) - expected = pd.Series( + expected = Series( np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -1579,7 +1579,7 @@ def test_to_datetime_with_apply(self, cache): result = td.apply(pd.to_datetime, format="%b %y", cache=cache) tm.assert_series_equal(result, expected) - td = pd.Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) + td = Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) msg = r"time data '' does not match format '%b %y' \(match\)" with pytest.raises(ValueError, match=msg): pd.to_datetime(td, format="%b %y", errors="raise", cache=cache) @@ -1818,7 +1818,7 @@ def test_guess_datetime_format_for_array(self): class TestToDatetimeInferFormat: @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_consistent_format(self, cache): - s = pd.Series(pd.date_range("20000101", periods=50, freq="H")) + s = Series(pd.date_range("20000101", periods=50, freq="H")) test_formats = ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] @@ -1842,7 +1842,7 @@ def test_to_datetime_infer_datetime_format_consistent_format(self, cache): @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): - s = pd.Series( + s = Series( np.array( ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ) @@ -1855,7 +1855,7 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): pd.to_datetime(s, infer_datetime_format=True, cache=cache), ) - s = pd.Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) + s = Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) tm.assert_series_equal( pd.to_datetime(s, infer_datetime_format=False, cache=cache), @@ -1864,7 +1864,7 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): - s = pd.Series( + s = Series( np.array(["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan]) ) tm.assert_series_equal( @@ -1874,7 +1874,7 @@ def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): - s = pd.Series( + s = Series( np.array( [ np.nan, @@ -1896,9 +1896,9 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): ) def test_infer_datetime_format_tz_name(self, tz_name, offset): # GH 33133 - s = pd.Series([f"2019-02-02 08:07:13 {tz_name}"]) + s = Series([f"2019-02-02 08:07:13 {tz_name}"]) result = to_datetime(s, infer_datetime_format=True) - 
expected = pd.Series( + expected = Series( [pd.Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] ) tm.assert_series_equal(result, expected) @@ -1906,8 +1906,8 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 - s = pd.Series(["2014-1-1", "2014-2-2", "2015-3-3"]) - expected = pd.Series( + s = Series(["2014-1-1", "2014-2-2", "2015-3-3"]) + expected = Series( [ pd.Timestamp("2014-01-01"), pd.Timestamp("2014-02-02"), @@ -2410,13 +2410,13 @@ def test_should_cache_errors(unique_share, check_count, err_message): def test_nullable_integer_to_datetime(): # Test for #30050 - ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = Series([1, 2, None, 2 ** 61, None]) ser = ser.astype("Int64") ser_copy = ser.copy() res = pd.to_datetime(ser, unit="ns") - expected = pd.Series( + expected = Series( [ np.datetime64("1970-01-01 00:00:00.000000001"), np.datetime64("1970-01-01 00:00:00.000000002"), diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 450076f2824ad..713607d087bc0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -571,8 +571,8 @@ def test_downcast_limits(dtype, downcast, min_max): "ser,expected", [ ( - pd.Series([0, 9223372036854775808]), - pd.Series([0, 9223372036854775808], dtype=np.uint64), + Series([0, 9223372036854775808]), + Series([0, 9223372036854775808], dtype=np.uint64), ) ], ) @@ -647,7 +647,7 @@ def test_failure_to_convert_uint64_string_to_NaN(): assert np.isnan(result) ser = Series([32, 64, np.nan]) - result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") + result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce") tm.assert_series_equal(result, ser) @@ -707,3 +707,21 @@ def test_precision_float_conversion(strrep): result = to_numeric(strrep) assert result == float(strrep) + + +@pytest.mark.parametrize( + "values, expected", + [ + (["1", "2", None], Series([1, 2, np.nan])), + (["1", "2", "3"], Series([1, 2, 3])), + (["1", "2", 3], Series([1, 2, 3])), + (["1", "2", 3.5], Series([1, 2, 3.5])), + (["1", None, 3.5], Series([1, np.nan, 3.5])), + (["1", "2", "3.5"], Series([1, 2, 3.5])), + ], +) +def test_to_numeric_from_nullable_string(values, expected): + # https://github.com/pandas-dev/pandas/issues/37262 + s = Series(values, dtype="string") + result = to_numeric(s) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6a87c05384689..922aff1792227 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -678,6 +678,11 @@ def test_isAnchored_deprecated(self, offset_types): expected = off.is_anchored() assert result == expected + def test_offsets_hashable(self, offset_types): + # GH: 37267 + off = self._get_offset(offset_types) + assert hash(off) is not None + class TestDateOffset(Base): def setup_method(self, method): diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 0fa9081d606b0..8d9b54cf3f0df 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -13,6 +13,7 @@ from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones import 
pytest
+import pytz

 import pandas as pd
 from pandas import Timestamp

@@ -92,7 +93,13 @@ def test_on_offset_implementations(dt, offset):
     # check that the class-specific implementations of is_on_offset match
     # the general case definition:
     #   (dt + offset) - offset == dt
-    compare = (dt + offset) - offset
+    try:
+        compare = (dt + offset) - offset
+    except pytz.NonExistentTimeError:
+        # dt + offset does not exist, assume(False) to indicate
+        # to hypothesis that this is not a valid test case
+        assume(False)
+
     assert offset.is_on_offset(dt) == (compare == dt)
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
index 5174ff005b5fb..6111797d70268 100644
--- a/pandas/tests/util/test_assert_frame_equal.py
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -265,14 +265,14 @@ def test_assert_frame_equal_interval_dtype_mismatch():
 @pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
 def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype):
     # https://github.com/pandas-dev/pandas/issues/35715
-    left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64")
-    right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype)
+    left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    right = DataFrame({"a": [1, 2, 3]}, dtype=right_dtype)
     tm.assert_frame_equal(left, right, check_dtype=False)


 def test_allows_duplicate_labels():
-    left = pd.DataFrame()
-    right = pd.DataFrame().set_flags(allows_duplicate_labels=False)
+    left = DataFrame()
+    right = DataFrame().set_flags(allows_duplicate_labels=False)
     tm.assert_frame_equal(left, left)
     tm.assert_frame_equal(right, right)
     tm.assert_frame_equal(left, right, check_flags=False)
diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py
index 87765c909938d..5f87824713175 100644
--- a/pandas/tests/util/test_assert_produces_warning.py
+++ b/pandas/tests/util/test_assert_produces_warning.py
@@ -1,10 +1,58 @@
+"""
+Test module for testing ``pandas._testing.assert_produces_warning``.
+"""
 import warnings

 import pytest

+from pandas.errors import DtypeWarning, PerformanceWarning
+
 import pandas._testing as tm


+@pytest.fixture(
+    params=[
+        RuntimeWarning,
+        ResourceWarning,
+        UserWarning,
+        FutureWarning,
+        DeprecationWarning,
+        PerformanceWarning,
+        DtypeWarning,
+    ],
+)
+def category(request):
+    """
+    Return unique warning.
+
+    Useful for testing behavior of tm.assert_produces_warning with various categories.
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (RuntimeWarning, UserWarning),
+        (UserWarning, FutureWarning),
+        (FutureWarning, RuntimeWarning),
+        (DeprecationWarning, PerformanceWarning),
+        (PerformanceWarning, FutureWarning),
+        (DtypeWarning, DeprecationWarning),
+        (ResourceWarning, DeprecationWarning),
+        (FutureWarning, DeprecationWarning),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def pair_different_warnings(request):
+    """
+    Return pair of different warnings.
+
+    Useful for testing how several different warnings are handled
+    in tm.assert_produces_warning.
+ """ + return request.param + + def f(): warnings.warn("f1", FutureWarning) warnings.warn("f2", RuntimeWarning) @@ -20,3 +68,87 @@ def test_assert_produces_warning_honors_filter(): with tm.assert_produces_warning(RuntimeWarning, raise_on_extra_warnings=False): f() + + +@pytest.mark.parametrize( + "message, match", + [ + ("", None), + ("", ""), + ("Warning message", r".*"), + ("Warning message", "War"), + ("Warning message", r"[Ww]arning"), + ("Warning message", "age"), + ("Warning message", r"age$"), + ("Message 12-234 with numbers", r"\d{2}-\d{3}"), + ("Message 12-234 with numbers", r"^Mes.*\d{2}-\d{3}"), + ("Message 12-234 with numbers", r"\d{2}-\d{3}\s\S+"), + ("Message, which we do not match", None), + ], +) +def test_catch_warning_category_and_match(category, message, match): + with tm.assert_produces_warning(category, match=match): + warnings.warn(message, category) + + +@pytest.mark.parametrize( + "message, match", + [ + ("Warning message", "Not this message"), + ("Warning message", "warning"), + ("Warning message", r"\d+"), + ], +) +def test_fail_to_match(category, message, match): + msg = f"Did not see warning {repr(category.__name__)} matching" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(category, match=match): + warnings.warn(message, category) + + +def test_fail_to_catch_actual_warning(pair_different_warnings): + expected_category, actual_category = pair_different_warnings + match = "Did not see expected warning of class" + with pytest.raises(AssertionError, match=match): + with tm.assert_produces_warning(expected_category): + warnings.warn("warning message", actual_category) + + +def test_ignore_extra_warning(pair_different_warnings): + expected_category, extra_category = pair_different_warnings + with tm.assert_produces_warning(expected_category, raise_on_extra_warnings=False): + warnings.warn("Expected warning", expected_category) + warnings.warn("Unexpected warning OK", extra_category) + + +def test_raise_on_extra_warning(pair_different_warnings): + expected_category, extra_category = pair_different_warnings + match = r"Caused unexpected warning\(s\)" + with pytest.raises(AssertionError, match=match): + with tm.assert_produces_warning(expected_category): + warnings.warn("Expected warning", expected_category) + warnings.warn("Unexpected warning NOT OK", extra_category) + + +def test_same_category_different_messages_first_match(): + category = UserWarning + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Match this", category) + warnings.warn("Do not match that", category) + warnings.warn("Do not match that either", category) + + +def test_same_category_different_messages_last_match(): + category = DeprecationWarning + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Do not match that", category) + warnings.warn("Do not match that either", category) + warnings.warn("Match this", category) + + +def test_right_category_wrong_match_raises(pair_different_warnings): + target_category, other_category = pair_different_warnings + with pytest.raises(AssertionError, match="Did not see warning.*matching"): + with tm.assert_produces_warning(target_category, match=r"^Match this"): + warnings.warn("Do not match it", target_category) + warnings.warn("Match this", other_category) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 53746aa048663..0f56fb0b93642 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ 
b/pandas/tests/util/test_assert_series_equal.py @@ -301,14 +301,14 @@ def test_series_equal_exact_for_nonnumeric(): @pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): # https://github.com/pandas-dev/pandas/issues/35715 - left = pd.Series([1, 2, 3], dtype="Int64") - right = pd.Series([1, 2, 3], dtype=right_dtype) + left = Series([1, 2, 3], dtype="Int64") + right = Series([1, 2, 3], dtype=right_dtype) tm.assert_series_equal(left, right, check_dtype=False) def test_allows_duplicate_labels(): - left = pd.Series([1]) - right = pd.Series([1]).set_flags(allows_duplicate_labels=False) + left = Series([1]) + right = Series([1]).set_flags(allows_duplicate_labels=False) tm.assert_series_equal(left, left) tm.assert_series_equal(right, right) tm.assert_series_equal(left, right, check_flags=False) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index ff29df39e1871..cf618f7c828aa 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -300,25 +300,25 @@ def test_hash_with_tuple(): # GH#28969 array containing a tuple raises on call to arr.astype(str) # apparently a numpy bug github.com/numpy/numpy/issues/9441 - df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + df = DataFrame({"data": [tuple("1"), tuple("2")]}) result = hash_pandas_object(df) - expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + expected = Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) tm.assert_series_equal(result, expected) - df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + df2 = DataFrame({"data": [tuple([1]), tuple([2])]}) result = hash_pandas_object(df2) - expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + expected = Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) tm.assert_series_equal(result, expected) # require that the elements of such tuples are themselves hashable - df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + df3 = DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) with pytest.raises(TypeError, match="unhashable type: 'list'"): hash_pandas_object(df3) def test_hash_object_none_key(): # https://github.com/pandas-dev/pandas/issues/30887 - result = pd.util.hash_pandas_object(pd.Series(["a", "b"]), hash_key=None) - expected = pd.Series([4578374827886788867, 17338122309987883691], dtype="uint64") + result = pd.util.hash_pandas_object(Series(["a", "b"]), hash_key=None) + expected = Series([4578374827886788867, 17338122309987883691], dtype="uint64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/moments/__init__.py b/pandas/tests/window/moments/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 96ad83f6b40b1..6ab53c8e2ec0d 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -143,8 +143,8 @@ def test_rolling_apply_consistency( @pytest.mark.parametrize("window", range(7)) def test_rolling_corr_with_zero_variance(window): # GH 18430 - s = pd.Series(np.zeros(20)) - other = pd.Series(np.arange(20)) + s = Series(np.zeros(20)) + other = Series(np.arange(20)) assert s.rolling(window=window).corr(other=other).isna().all() diff --git 
a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py index 287cd7ebba536..605b85344ba76 100644 --- a/pandas/tests/window/moments/test_moments_ewm.py +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -2,7 +2,6 @@ from numpy.random import randn import pytest -import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm @@ -20,7 +19,7 @@ def test_ewma_frame(frame, name): def test_ewma_adjust(): - vals = pd.Series(np.zeros(1000)) + vals = Series(np.zeros(1000)) vals[5] = 1 result = vals.ewm(span=100, adjust=False).mean().sum() assert np.abs(result - 1) < 1e-2 @@ -295,7 +294,7 @@ def test_ewm_domain_checks(arr): @pytest.mark.parametrize("method", ["mean", "vol", "var"]) def test_ew_empty_series(method): - vals = pd.Series([], dtype=np.float64) + vals = Series([], dtype=np.float64) ewm = vals.ewm(3) result = getattr(ewm, method)() diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 488306d0585c5..2f622c2bc3e60 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -74,17 +74,17 @@ def test_cmov_window(): def test_cmov_window_corner(): # GH 8238 # all nan - vals = pd.Series([np.nan] * 10) + vals = Series([np.nan] * 10) result = vals.rolling(5, center=True, win_type="boxcar").mean() assert np.isnan(result).all() # empty - vals = pd.Series([], dtype=object) + vals = Series([], dtype=object) result = vals.rolling(5, center=True, win_type="boxcar").mean() assert len(result) == 0 # shorter than window - vals = pd.Series(np.random.randn(5)) + vals = Series(np.random.randn(5)) result = vals.rolling(10, win_type="boxcar").mean() assert np.isnan(result).all() assert len(result) == 5 @@ -523,22 +523,22 @@ def test_cmov_window_special_linear_range(win_types_special): def test_rolling_min_min_periods(): - a = pd.Series([1, 2, 3, 4, 5]) + a = Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1).min() - expected = pd.Series(np.ones(len(a))) + expected = Series(np.ones(len(a))) tm.assert_series_equal(result, expected) msg = "min_periods 5 must be <= window 3" with pytest.raises(ValueError, match=msg): - pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() + Series([1, 2, 3]).rolling(window=3, min_periods=5).min() def test_rolling_max_min_periods(): - a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) + a = Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() tm.assert_almost_equal(a, b) msg = "min_periods 5 must be <= window 3" with pytest.raises(ValueError, match=msg): - pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() + Series([1, 2, 3]).rolling(window=3, min_periods=5).max() def test_rolling_quantile_np_percentile(): @@ -610,17 +610,17 @@ def test_rolling_quantile_param(): def test_rolling_std_1obs(): - vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) + vals = Series([1.0, 2.0, 3.0, 4.0, 5.0]) result = vals.rolling(1, min_periods=1).std() - expected = pd.Series([np.nan] * 5) + expected = Series([np.nan] * 5) tm.assert_series_equal(result, expected) result = vals.rolling(1, min_periods=1).std(ddof=0) - expected = pd.Series([0.0] * 5) + expected = Series([0.0] * 5) tm.assert_series_equal(result, expected) - result = pd.Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() + result = Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() assert np.isnan(result[2]) @@ -629,7 +629,7 @@ def 
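# Editor's sketch (not part of the patch) of the validation the rolling
# min/max tests above rely on: min_periods may not exceed the window length.
import pandas as pd

try:
    pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min()
except ValueError as err:
    print(err)  # min_periods 5 must be <= window 3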
test_rolling_std_neg_sqrt(): # Test move_nanstd for neg sqrt. - a = pd.Series( + a = Series( [ 0.0011448196318903589, 0.00028718669878572767, diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index eb14ecfba1f51..6e5d7b4df00e1 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -305,7 +305,7 @@ def test_preserve_metadata(): ) def test_multiple_agg_funcs(func, window_size, expected_vals): # GH 15072 - df = pd.DataFrame( + df = DataFrame( [ ["A", 10, 20], ["A", 20, 30], @@ -331,7 +331,7 @@ def test_multiple_agg_funcs(func, window_size, expected_vals): columns = pd.MultiIndex.from_tuples( [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] ) - expected = pd.DataFrame(expected_vals, index=index, columns=columns) + expected = DataFrame(expected_vals, index=index, columns=columns) result = window.agg(dict((("low", ["mean", "max"]), ("high", ["mean", "min"])))) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b06a506281047..183d2814920e4 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -137,6 +137,14 @@ def test_expanding_count_default_min_periods_with_null_values(constructor): tm.assert_equal(result, expected) +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_expanding_count_with_min_periods_exceeding_series_length(constructor): + # GH 25857 + result = constructor(range(5)).expanding(min_periods=6).count() + expected = constructor([np.nan, np.nan, np.nan, np.nan, np.nan]) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( "df,expected,min_periods", [ @@ -227,7 +235,7 @@ def test_iter_expanding_series(ser, expected, min_periods): def test_center_deprecate_warning(): # GH 20647 - df = pd.DataFrame() + df = DataFrame() with tm.assert_produces_warning(FutureWarning): df.expanding(center=True) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 034f941462bb5..101d65c885c9b 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -131,7 +131,7 @@ def test_rolling_apply(self, raw): def test_rolling_apply_mutability(self): # GH 14013 - df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) + df = DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) g = df.groupby("A") mi = pd.MultiIndex.from_tuples( @@ -140,7 +140,7 @@ def test_rolling_apply_mutability(self): mi.names = ["A", None] # Grouped column should not be a part of the output - expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) + expected = DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) result = g.rolling(window=2).sum() tm.assert_frame_equal(result, expected) @@ -221,7 +221,7 @@ def test_groupby_rolling(self, expected_value, raw_value): def foo(x): return int(isinstance(x, np.ndarray)) - df = pd.DataFrame({"id": [1, 1, 1], "value": [1, 2, 3]}) + df = DataFrame({"id": [1, 1, 1], "value": [1, 2, 3]}) result = df.groupby("id").value.rolling(1).apply(foo, raw=raw_value) expected = Series( [expected_value] * 3, @@ -250,9 +250,9 @@ def test_groupby_rolling_center_center(self): ) tm.assert_series_equal(result, expected) - df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)}) + df = DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)}) result = df.groupby("a").rolling(center=True, window=3).mean() - expected = pd.DataFrame( + expected = DataFrame( [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan], 
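# Editor's sketch (not part of the patch) of the new expanding-count edge
# case (GH 25857) exercised above: a min_periods longer than the series
# yields all-NaN output instead of raising.
import pandas as pd

print(pd.Series(range(5)).expanding(min_periods=6).count())  # all NaN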
index=pd.MultiIndex.from_tuples( ( @@ -274,9 +274,9 @@ def test_groupby_rolling_center_center(self): ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)}) + df = DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)}) result = df.groupby("a").rolling(center=True, window=3).mean() - expected = pd.DataFrame( + expected = DataFrame( [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan], index=pd.MultiIndex.from_tuples( ( @@ -297,10 +297,45 @@ def test_groupby_rolling_center_center(self): ) tm.assert_frame_equal(result, expected) + def test_groupby_rolling_center_on(self): + # GH 37141 + df = DataFrame( + data={ + "Date": pd.date_range("2020-01-01", "2020-01-10"), + "gb": ["group_1"] * 6 + ["group_2"] * 4, + "value": range(10), + } + ) + result = ( + df.groupby("gb") + .rolling(6, on="Date", center=True, min_periods=1) + .value.mean() + ) + expected = Series( + [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5], + name="value", + index=pd.MultiIndex.from_tuples( + ( + ("group_1", pd.Timestamp("2020-01-01")), + ("group_1", pd.Timestamp("2020-01-02")), + ("group_1", pd.Timestamp("2020-01-03")), + ("group_1", pd.Timestamp("2020-01-04")), + ("group_1", pd.Timestamp("2020-01-05")), + ("group_1", pd.Timestamp("2020-01-06")), + ("group_2", pd.Timestamp("2020-01-07")), + ("group_2", pd.Timestamp("2020-01-08")), + ("group_2", pd.Timestamp("2020-01-09")), + ("group_2", pd.Timestamp("2020-01-10")), + ), + names=["gb", "Date"], + ), + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("min_periods", [5, 4, 3]) def test_groupby_rolling_center_min_periods(self, min_periods): # GH 36040 - df = pd.DataFrame({"group": ["A"] * 10 + ["B"] * 10, "data": range(20)}) + df = DataFrame({"group": ["A"] * 10 + ["B"] * 10, "data": range(20)}) window_size = 5 result = ( @@ -318,7 +353,7 @@ def test_groupby_rolling_center_min_periods(self, min_periods): grp_A_expected = nans + grp_A_mean[num_nans : 10 - num_nans] + nans grp_B_expected = nans + grp_B_mean[num_nans : 10 - num_nans] + nans - expected = pd.DataFrame( + expected = DataFrame( {"group": ["A"] * 10 + ["B"] * 10, "data": grp_A_expected + grp_B_expected} ) @@ -361,7 +396,7 @@ def get_window_bounds( start[start < 0] = min_periods return start, end - df = pd.DataFrame( + df = DataFrame( {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5 ) result = ( @@ -374,7 +409,7 @@ def get_window_bounds( def test_groupby_rolling_subset_with_closed(self): # GH 35549 - df = pd.DataFrame( + df = DataFrame( { "column1": range(6), "column2": range(6), @@ -398,7 +433,7 @@ def test_groupby_rolling_subset_with_closed(self): def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 - df = pd.DataFrame( + df = DataFrame( { "column1": range(6), "column2": range(6), @@ -446,19 +481,19 @@ def test_groupby_rolling_index_changed(self, func): def test_groupby_rolling_empty_frame(self): # GH 36197 - expected = pd.DataFrame({"s1": []}) + expected = DataFrame({"s1": []}) result = expected.groupby("s1").rolling(window=1).sum() expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) tm.assert_frame_equal(result, expected) - expected = pd.DataFrame({"s1": [], "s2": []}) + expected = DataFrame({"s1": [], "s2": []}) result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) tm.assert_frame_equal(result, expected) def test_groupby_rolling_string_index(self): # GH: 36727 - df = pd.DataFrame( + df = DataFrame( [ 
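# Editor's sketch (not part of the patch) of the center= with on= groupby
# rolling combination that test_groupby_rolling_center_on (GH 37141) above
# covers: each group gets a centered six-row window anchored on "Date".
import pandas as pd

df = pd.DataFrame(
    {
        "Date": pd.date_range("2020-01-01", "2020-01-10"),
        "gb": ["group_1"] * 6 + ["group_2"] * 4,
        "value": range(10),
    }
)
res = df.groupby("gb").rolling(6, on="Date", center=True, min_periods=1).value.mean()
print(res)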
["A", "group_1", pd.Timestamp(2019, 1, 1, 9)], ["B", "group_1", pd.Timestamp(2019, 1, 2, 9)], @@ -473,7 +508,7 @@ def test_groupby_rolling_string_index(self): df["count_to_date"] = groups.cumcount() rolling_groups = groups.rolling("10d", on="eventTime") result = rolling_groups.apply(lambda df: df.shape[0]) - expected = pd.DataFrame( + expected = DataFrame( [ ["A", "group_1", pd.Timestamp(2019, 1, 1, 9), 1.0], ["B", "group_1", pd.Timestamp(2019, 1, 2, 9), 2.0], @@ -488,12 +523,12 @@ def test_groupby_rolling_string_index(self): def test_groupby_rolling_no_sort(self): # GH 36889 result = ( - pd.DataFrame({"foo": [2, 1], "bar": [2, 1]}) + DataFrame({"foo": [2, 1], "bar": [2, 1]}) .groupby("foo", sort=False) .rolling(1) .min() ) - expected = pd.DataFrame( + expected = DataFrame( np.array([[2.0, 2.0], [1.0, 1.0]]), columns=["foo", "bar"], index=pd.MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]), @@ -502,7 +537,7 @@ def test_groupby_rolling_no_sort(self): def test_groupby_rolling_count_closed_on(self): # GH 35869 - df = pd.DataFrame( + df = DataFrame( { "column1": range(6), "column2": range(6), @@ -515,7 +550,7 @@ def test_groupby_rolling_count_closed_on(self): .rolling("3d", on="date", closed="left")["column1"] .count() ) - expected = pd.Series( + expected = Series( [np.nan, 1.0, 1.0, np.nan, 1.0, 1.0], name="column1", index=pd.MultiIndex.from_tuples( @@ -538,14 +573,31 @@ def test_groupby_rolling_count_closed_on(self): ) def test_groupby_rolling_sem(self, func, kwargs): # GH: 26476 - df = pd.DataFrame( + df = DataFrame( [["a", 1], ["a", 2], ["b", 1], ["b", 2], ["b", 3]], columns=["a", "b"] ) result = getattr(df.groupby("a"), func)(**kwargs).sem() - expected = pd.DataFrame( + expected = DataFrame( {"a": [np.nan] * 5, "b": [np.nan, 0.70711, np.nan, 0.70711, 0.70711]}, index=pd.MultiIndex.from_tuples( [("a", 0), ("a", 1), ("b", 2), ("b", 3), ("b", 4)], names=["a", None] ), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + ("rollings", "key"), [({"on": "a"}, "a"), ({"on": None}, "index")] + ) + def test_groupby_rolling_nans_in_index(self, rollings, key): + # GH: 34617 + df = DataFrame( + { + "a": pd.to_datetime(["2020-06-01 12:00", "2020-06-01 14:00", np.nan]), + "b": [1, 2, 3], + "c": [1, 1, 1], + } + ) + if key == "index": + df = df.set_index("a") + with pytest.raises(ValueError, match=f"{key} must be monotonic"): + df.groupby("c").rolling("60min", **rollings) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 3d59f6cdd4996..2c8439aae75e5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -122,42 +122,63 @@ def test_numpy_compat(method): getattr(r, method)(dtype=np.float64) -def test_closed(): - df = DataFrame({"A": [0, 1, 2, 3, 4]}) - # closed only allowed for datetimelike +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +def test_closed_fixed(closed, arithmetic_win_operators): + # GH 34315 + func_name = arithmetic_win_operators + df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]}) + df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5)) - msg = "closed only implemented for datetimelike and offset based windows" + result = getattr(df_fixed.rolling(2, closed=closed, min_periods=1), func_name)() + expected = getattr(df_time.rolling("2D", closed=closed), func_name)().reset_index( + drop=True + ) + + tm.assert_frame_equal(result, expected) - with pytest.raises(ValueError, match=msg): - df.rolling(window=3, closed="neither") + +def 
test_closed_fixed_binary_col(): + # GH 34315 + data = [0, 1, 1, 0, 0, 1, 0, 1] + df = DataFrame( + {"binary_col": data}, + index=pd.date_range(start="2020-01-01", freq="min", periods=len(data)), + ) + + rolling = df.rolling(window=len(df), closed="left", min_periods=1) + result = rolling.mean() + expected = DataFrame( + [np.nan, 0, 0.5, 2 / 3, 0.5, 0.4, 0.5, 0.428571], + columns=["binary_col"], + index=pd.date_range(start="2020-01-01", freq="min", periods=len(data)), + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("closed", ["neither", "left"]) def test_closed_empty(closed, arithmetic_win_operators): # GH 26005 func_name = arithmetic_win_operators - ser = pd.Series( - data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") - ) + ser = Series(data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D")) roll = ser.rolling("1D", closed=closed) result = getattr(roll, func_name)() - expected = pd.Series([np.nan] * 5, index=ser.index) + expected = Series([np.nan] * 5, index=ser.index) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry(func): # GH24718 - ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) + ser = Series(data=[2], index=pd.date_range("2000", periods=1)) result = getattr(ser.rolling("10D", closed="left"), func)() - tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) + tm.assert_series_equal(result, Series([np.nan], index=ser.index)) @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry_groupby(func): # GH24718 - ser = pd.DataFrame( + ser = DataFrame( data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=pd.date_range("2000", periods=3) ) result = getattr( @@ -166,7 +187,7 @@ def test_closed_one_entry_groupby(func): exp_idx = pd.MultiIndex.from_arrays( arrays=[[1, 1, 2], ser.index], names=("A", None) ) - expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B") + expected = Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B") tm.assert_series_equal(result, expected) @@ -186,23 +207,23 @@ def test_closed_one_entry_groupby(func): ) def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 - ser = pd.Series( + ser = Series( data=np.arange(10).astype(input_dtype), index=pd.date_range("2000", periods=10) ) result = getattr(ser.rolling("3D", closed=closed), func)() - expected = pd.Series(expected, index=ser.index) + expected = Series(expected, index=ser.index) tm.assert_series_equal(result, expected) def test_closed_uneven(): # see gh-21704 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + ser = Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) # uneven ser = ser.drop(index=ser.index[[1, 5]]) result = ser.rolling("3D", closed="left").min() - expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) + expected = Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) tm.assert_series_equal(result, expected) @@ -221,10 +242,10 @@ def test_closed_uneven(): ) def test_closed_min_max_minp(func, closed, expected): # see gh-21704 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + ser = Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) ser[ser.index[-3:]] = np.nan result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() - expected = pd.Series(expected, index=ser.index) + expected = Series(expected, index=ser.index) tm.assert_series_equal(result, expected) @@ -239,9 +260,9 @@ def 
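# Editor's sketch (not part of the patch) of the headline change in this
# file (GH 34315): closed= is now honored for fixed integer windows, not
# only offset-based ones, so this no longer raises; the first left-closed
# window is empty and yields NaN.
import pandas as pd

s = pd.Series([0, 1, 2, 3, 4])
print(s.rolling(2, closed="left", min_periods=1).sum())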
test_closed_min_max_minp(func, closed, expected): ) def test_closed_median_quantile(closed, expected): # GH 26005 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + ser = Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) roll = ser.rolling("3D", closed=closed) - expected = pd.Series(expected, index=ser.index) + expected = Series(expected, index=ser.index) result = roll.median() tm.assert_series_equal(result, expected) @@ -267,8 +288,8 @@ def tests_empty_df_rolling(roller): def test_empty_window_median_quantile(): # GH 26005 - expected = pd.Series([np.nan, np.nan, np.nan]) - roll = pd.Series(np.arange(3)).rolling(0) + expected = Series([np.nan, np.nan, np.nan]) + roll = Series(np.arange(3)).rolling(0) result = roll.median() tm.assert_series_equal(result, expected) @@ -280,27 +301,27 @@ def test_empty_window_median_quantile(): def test_missing_minp_zero(): # https://github.com/pandas-dev/pandas/pull/18921 # minp=0 - x = pd.Series([np.nan]) + x = Series([np.nan]) result = x.rolling(1, min_periods=0).sum() - expected = pd.Series([0.0]) + expected = Series([0.0]) tm.assert_series_equal(result, expected) # minp=1 result = x.rolling(1, min_periods=1).sum() - expected = pd.Series([np.nan]) + expected = Series([np.nan]) tm.assert_series_equal(result, expected) def test_missing_minp_zero_variable(): # https://github.com/pandas-dev/pandas/pull/18921 - x = pd.Series( + x = Series( [np.nan] * 4, index=pd.DatetimeIndex( ["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"] ), ) result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() - expected = pd.Series(0.0, index=x.index) + expected = Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -349,22 +370,22 @@ def test_readonly_array(): # GH-27766 arr = np.array([1, 3, np.nan, 3, 5]) arr.setflags(write=False) - result = pd.Series(arr).rolling(2).mean() - expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) + result = Series(arr).rolling(2).mean() + expected = Series([np.nan, 2, np.nan, np.nan, 4]) tm.assert_series_equal(result, expected) def test_rolling_datetime(axis_frame, tz_naive_fixture): # GH-28192 tz = tz_naive_fixture - df = pd.DataFrame( + df = DataFrame( {i: [1] * 2 for i in pd.date_range("2019-8-01", "2019-08-03", freq="D", tz=tz)} ) if axis_frame in [0, "index"]: result = df.T.rolling("2D", axis=axis_frame).sum().T else: result = df.rolling("2D", axis=axis_frame).sum() - expected = pd.DataFrame( + expected = DataFrame( { **{ i: [1.0] * 2 @@ -440,9 +461,9 @@ def test_rolling_window_as_string(): def test_min_periods1(): # GH#6795 - df = pd.DataFrame([0, 1, 2, 1, 0], columns=["a"]) + df = DataFrame([0, 1, 2, 1, 0], columns=["a"]) result = df["a"].rolling(3, center=True, min_periods=1).max() - expected = pd.Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a") + expected = Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a") tm.assert_series_equal(result, expected) @@ -498,7 +519,7 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): ({"A": [2, 3], "B": [5, 6]}, [1, 2]), ], 2, - 3, + 2, ), ( DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), @@ -518,7 +539,7 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): ({"A": [3], "B": [6]}, [2]), ], 1, - 2, + 0, ), (DataFrame({"A": [1], "B": [4]}), [], 2, None), (DataFrame({"A": [1], "B": [4]}), [], 2, 1), @@ -605,9 +626,9 @@ def test_iter_rolling_on_dataframe(expected, window): 1, ), (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1), - (Series([1, 2, 3]), [([1], 
[0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 3), + (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 2), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0), - (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 2), + (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1), (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0), (Series([], dtype="int64"), [], 2, 1), ], @@ -708,7 +729,7 @@ def scaled_sum(*args): @pytest.mark.parametrize("add", [0.0, 2.0]) def test_rolling_numerical_accuracy_kahan_mean(add): # GH: 36031 implementing kahan summation - df = pd.DataFrame( + df = DataFrame( {"A": [3002399751580331.0 + add, -0.0, -0.0]}, index=[ pd.Timestamp("19700101 09:00:00"), @@ -720,7 +741,7 @@ def test_rolling_numerical_accuracy_kahan_mean(add): df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() ) dates = pd.date_range("19700101 09:00:00", periods=7, freq="S") - expected = pd.DataFrame( + expected = DataFrame( { "A": [ np.nan, @@ -739,9 +760,9 @@ def test_rolling_numerical_accuracy_kahan_mean(add): def test_rolling_numerical_accuracy_kahan_sum(): # GH: 13254 - df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"]) + df = DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"]) result = df["x"].rolling(3).sum() - expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x") + expected = Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x") tm.assert_series_equal(result, expected) @@ -752,7 +773,7 @@ def test_rolling_numerical_accuracy_jump(): ) data = np.random.rand(len(index)) - df = pd.DataFrame({"data": data}, index=index) + df = DataFrame({"data": data}, index=index) result = df.rolling("60s").mean() tm.assert_frame_equal(result, df[["data"]]) @@ -770,10 +791,10 @@ def test_rolling_numerical_accuracy_small_values(): def test_rolling_numerical_too_large_numbers(): # GH: 11645 dates = pd.date_range("2015-01-01", periods=10, freq="D") - ds = pd.Series(data=range(10), index=dates, dtype=np.float64) + ds = Series(data=range(10), index=dates, dtype=np.float64) ds[2] = -9e33 result = ds.rolling(5).mean() - expected = pd.Series( + expected = Series( [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 5.0, 6.0, 7.0], index=dates, ) @@ -786,10 +807,10 @@ def test_rolling_numerical_too_large_numbers(): ) def test_rolling_mixed_dtypes_axis_1(func, value): # GH: 20649 - df = pd.DataFrame(1, index=[1, 2], columns=["a", "b", "c"]) + df = DataFrame(1, index=[1, 2], columns=["a", "b", "c"]) df["c"] = 1.0 result = getattr(df.rolling(window=2, min_periods=1, axis=1), func)() - expected = pd.DataFrame( + expected = DataFrame( {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, index=[1, 2] ) tm.assert_frame_equal(result, expected) @@ -797,7 +818,7 @@ def test_rolling_mixed_dtypes_axis_1(func, value): def test_rolling_axis_one_with_nan(): # GH: 35596 - df = pd.DataFrame( + df = DataFrame( [ [0, 1, 2, 4, np.nan, np.nan, np.nan], [0, 1, 2, np.nan, np.nan, np.nan, np.nan], @@ -805,7 +826,7 @@ def test_rolling_axis_one_with_nan(): ] ) result = df.rolling(window=7, min_periods=1, axis="columns").sum() - expected = pd.DataFrame( + expected = DataFrame( [ [0.0, 1.0, 3.0, 7.0, 7.0, 7.0, 7.0], [0.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0], @@ -821,17 +842,17 @@ def test_rolling_axis_one_with_nan(): ) def test_rolling_axis_1_non_numeric_dtypes(value): # GH: 20649 - df = pd.DataFrame({"a": [1, 2]}) + df = DataFrame({"a": [1, 2]}) df["b"] = value result = df.rolling(window=2, min_periods=1, 
axis=1).sum() - expected = pd.DataFrame({"a": [1.0, 2.0]}) + expected = DataFrame({"a": [1.0, 2.0]}) tm.assert_frame_equal(result, expected) def test_rolling_on_df_transposed(): # GH: 32724 - df = pd.DataFrame({"A": [1, None], "B": [4, 5], "C": [7, 8]}) - expected = pd.DataFrame({"A": [1.0, np.nan], "B": [5.0, 5.0], "C": [11.0, 13.0]}) + df = DataFrame({"A": [1, None], "B": [4, 5], "C": [7, 8]}) + expected = DataFrame({"A": [1.0, np.nan], "B": [5.0, 5.0], "C": [11.0, 13.0]}) result = df.rolling(min_periods=1, window=2, axis=1).sum() tm.assert_frame_equal(result, expected) @@ -864,9 +885,9 @@ def test_rolling_on_df_transposed(): ) def test_rolling_period_index(index, window, func, values): # GH: 34225 - ds = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8], index=index) + ds = Series([0, 1, 2, 3, 4, 5, 6, 7, 8], index=index) result = getattr(ds.rolling(window, closed="left"), func)() - expected = pd.Series(values, index=index) + expected = Series(values, index=index) tm.assert_series_equal(result, expected) @@ -876,8 +897,8 @@ def test_rolling_sem(constructor): obj = getattr(pd, constructor)([0, 1, 2]) result = obj.rolling(2, min_periods=1).sem() if isinstance(result, DataFrame): - result = pd.Series(result[0].values) - expected = pd.Series([np.nan] + [0.707107] * 2) + result = Series(result[0].values) + expected = Series([np.nan] + [0.707107] * 2) tm.assert_series_equal(result, expected) @@ -892,7 +913,177 @@ def test_rolling_sem(constructor): ) def test_rolling_var_numerical_issues(func, third_value, values): # GH: 37051 - ds = pd.Series([99999999999999999, 1, third_value, 2, 3, 1, 1]) + ds = Series([99999999999999999, 1, third_value, 2, 3, 1, 1]) result = getattr(ds.rolling(2), func)() - expected = pd.Series([np.nan] + values) + expected = Series([np.nan] + values) tm.assert_series_equal(result, expected) + + +def test_timeoffset_as_window_parameter_for_corr(): + # GH: 28266 + exp = DataFrame( + { + "B": [ + np.nan, + np.nan, + 0.9999999999999998, + -1.0, + 1.0, + -0.3273268353539892, + 0.9999999999999998, + 1.0, + 0.9999999999999998, + 1.0, + ], + "A": [ + np.nan, + np.nan, + -1.0, + 1.0000000000000002, + -0.3273268353539892, + 0.9999999999999966, + 1.0, + 1.0000000000000002, + 1.0, + 1.0000000000000002, + ], + }, + index=pd.MultiIndex.from_tuples( + [ + (pd.Timestamp("20130101 09:00:00"), "B"), + (pd.Timestamp("20130101 09:00:00"), "A"), + (pd.Timestamp("20130102 09:00:02"), "B"), + (pd.Timestamp("20130102 09:00:02"), "A"), + (pd.Timestamp("20130103 09:00:03"), "B"), + (pd.Timestamp("20130103 09:00:03"), "A"), + (pd.Timestamp("20130105 09:00:05"), "B"), + (pd.Timestamp("20130105 09:00:05"), "A"), + (pd.Timestamp("20130106 09:00:06"), "B"), + (pd.Timestamp("20130106 09:00:06"), "A"), + ] + ), + ) + + df = DataFrame( + {"B": [0, 1, 2, 4, 3], "A": [7, 4, 6, 9, 3]}, + index=[ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130102 09:00:02"), + pd.Timestamp("20130103 09:00:03"), + pd.Timestamp("20130105 09:00:05"), + pd.Timestamp("20130106 09:00:06"), + ], + ) + + res = df.rolling(window="3d").corr() + + tm.assert_frame_equal(exp, res) + + +@pytest.mark.parametrize("method", ["var", "sum", "mean", "skew", "kurt", "min", "max"]) +def test_rolling_decreasing_indices(method): + """ + Make sure that decreasing indices give the same results as increasing indices. 
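# Editor's sketch (not part of the patch) for the custom-indexer machinery
# used by the non-monotonic rolling test that follows: any BaseIndexer
# subclass, such as the built-in FixedForwardWindowIndexer shown here, can
# supply the window bounds that rolling() consumes.
import numpy as np
import pandas as pd
from pandas.api.indexers import FixedForwardWindowIndexer

df = pd.DataFrame({"values": np.arange(5)})
indexer = FixedForwardWindowIndexer(window_size=2)
print(df.rolling(indexer, min_periods=1).sum())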
+
+    GH 36933
+    """
+    df = DataFrame({"values": np.arange(-15, 10) ** 2})
+    df_reverse = DataFrame({"values": df["values"][::-1]}, index=df.index[::-1])
+
+    increasing = getattr(df.rolling(window=5), method)()
+    decreasing = getattr(df_reverse.rolling(window=5), method)()
+
+    assert np.abs(decreasing.values[::-1][:-4] - increasing.values[4:]).max() < 1e-12
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        (
+            "var",
+            [
+                float("nan"),
+                43.0,
+                float("nan"),
+                136.333333,
+                43.5,
+                94.966667,
+                182.0,
+                318.0,
+            ],
+        ),
+        ("mean", [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5]),
+        ("sum", [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0]),
+        (
+            "skew",
+            [
+                float("nan"),
+                0.709296,
+                float("nan"),
+                0.407073,
+                0.984656,
+                0.919184,
+                0.874674,
+                0.842418,
+            ],
+        ),
+        (
+            "kurt",
+            [
+                float("nan"),
+                -0.5916711736073559,
+                float("nan"),
+                -1.0028993131317954,
+                -0.06103844629409494,
+                -0.254143227116194,
+                -0.37362637362637585,
+                -0.45439658241367054,
+            ],
+        ),
+    ],
+)
+def test_rolling_non_monotonic(method, expected):
+    """
+    Make sure the (rare) branch of non-monotonic indices is covered by a test.
+
+    Output from 1.1.3 is assumed to be the expected output. Output of sum/mean has
+    been manually verified.
+
+    GH 36933.
+    """
+    # Based on an example found in computation.rst
+    use_expanding = [True, False, True, False, True, True, True, True]
+    df = DataFrame({"values": np.arange(len(use_expanding)) ** 2})
+
+    class CustomIndexer(pd.api.indexers.BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+    indexer = CustomIndexer(window_size=4, use_expanding=use_expanding)
+
+    result = getattr(df.rolling(indexer), method)()
+    expected = DataFrame({"values": expected})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("index", "window"),
+    [([0, 1, 2, 3, 4], 2), (pd.date_range("2001-01-01", freq="D", periods=5), "2D")],
+)
+def test_rolling_corr_timedelta_index(index, window):
+    # GH: 31286
+    x = Series([1, 2, 3, 4, 5], index=index)
+    y = x.copy()
+    x[0:2] = 0.0
+    result = x.rolling(window).corr(y)
+    expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
+    tm.assert_almost_equal(result, expected)
diff --git a/pyproject.toml b/pyproject.toml
index 8161e8ad752da..2b78147e9294d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,9 +6,10 @@ requires = [
     "wheel",
     "Cython>=0.29.21,<3",  # Note: sync with setup.py
     "numpy==1.16.5; python_version=='3.7' and platform_system!='AIX'",
-    "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'",
+    "numpy==1.17.3; python_version=='3.8' and platform_system!='AIX'",
    "numpy==1.16.5; python_version=='3.7' and platform_system=='AIX'",
-    "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'",
+    "numpy==1.17.3; python_version=='3.8' and platform_system=='AIX'",
+    "numpy; python_version>='3.9'",
 ]
 
 [tool.black]
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index b6ffab1482bbc..7b648a589bc61 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -12,11 +12,10 @@
 
 import argparse
 import ast
-import os
 import sys
 import token
 import tokenize
-from typing import IO, Callable, FrozenSet, Iterable, List, Set, Tuple
+from typing import IO, Callable, Iterable, List, Set, Tuple
 
 PRIVATE_IMPORTS_TO_IGNORE: Set[str] = {
     "_extension_array_shared_docs",
@@ -33,9 +32,7 @@
     "_get_version",
     "__main__",
     "_transform_template",
-    "_arith_doc_FRAME",
     "_flex_comp_doc_FRAME",
-    "_make_flex_doc",
     "_op_descriptions",
     "_IntegerDtype",
     "_use_inf_as_na",
@@ -405,8 +402,6 @@ def main(
     function: Callable[[IO[str]], Iterable[Tuple[int, str]]],
     source_path: str,
     output_format: str,
-    file_extensions_to_check: str,
-    excluded_file_paths: str,
 ) -> bool:
     """
     Main entry point of the script.
@@ -434,20 +429,10 @@ def main(
     ValueError
         If the `source_path` is not pointing to existing file/directory.
     """
-    if not os.path.exists(source_path):
-        raise ValueError("Please enter a valid path, pointing to a file/directory.")
-
     is_failed: bool = False
-    file_path: str = ""
-
-    FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset(
-        file_extensions_to_check.split(",")
-    )
-    PATHS_TO_IGNORE = frozenset(excluded_file_paths.split(","))
 
-    if os.path.isfile(source_path):
-        file_path = source_path
-        with open(file_path) as file_obj:
+    for file_path in source_path:
+        with open(file_path, encoding="utf-8") as file_obj:
             for line_number, msg in function(file_obj):
                 is_failed = True
                 print(
@@ -456,25 +441,6 @@ def main(
                     )
                 )
 
-    for subdir, _, files in os.walk(source_path):
-        if any(path in subdir for path in PATHS_TO_IGNORE):
-            continue
-        for file_name in files:
-            if not any(
-                file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK
-            ):
-                continue
-
-            file_path = os.path.join(subdir, file_name)
-            with open(file_path) as file_obj:
-                for line_number, msg in function(file_obj):
-                    is_failed = True
-                    print(
-                        output_format.format(
-                            source_path=file_path, line_number=line_number, msg=msg
-                        )
-                    )
-
     return is_failed
@@ -489,9 +455,7 @@ def main(
 
     parser = argparse.ArgumentParser(description="Unwanted patterns checker.")
 
-    parser.add_argument(
-        "path", nargs="?", default=".", help="Source path of file/directory to check."
-    )
+    parser.add_argument("paths", nargs="*", help="Source paths of files to check.")
     parser.add_argument(
         "--format",
         "-f",
@@ -505,25 +469,13 @@ def main(
         required=True,
         help="Validation test case to check.",
     )
-    parser.add_argument(
-        "--included-file-extensions",
-        default="py,pyx,pxd,pxi",
-        help="Comma separated file extensions to check.",
-    )
-    parser.add_argument(
-        "--excluded-file-paths",
-        default="asv_bench/env",
-        help="Comma separated file paths to exclude.",
-    )
 
     args = parser.parse_args()
 
     sys.exit(
         main(
             function=globals().get(args.validation_type),  # type: ignore
-            source_path=args.path,
+            source_path=args.paths,
             output_format=args.format,
-            file_extensions_to_check=args.included_file_extensions,
-            excluded_file_paths=args.excluded_file_paths,
         )
     )
diff --git a/setup.cfg b/setup.cfg
index 447adc9188951..d8525c12cf13e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -54,7 +54,7 @@ exclude =
 
 [tool:pytest]
 # sync minversion with setup.cfg & install.rst
-minversion = 4.0.2
+minversion = 5.0.1
 testpaths = pandas
 doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS
 addopts = --strict-data-files
@@ -181,30 +181,15 @@ check_untyped_defs=False
 [mypy-pandas.core.indexes.extension]
 check_untyped_defs=False
 
-[mypy-pandas.core.indexes.interval]
-check_untyped_defs=False
-
 [mypy-pandas.core.indexes.multi]
 check_untyped_defs=False
 
 [mypy-pandas.core.resample]
 check_untyped_defs=False
 
-[mypy-pandas.core.reshape.concat]
-check_untyped_defs=False
-
 [mypy-pandas.core.reshape.merge]
 check_untyped_defs=False
 
-[mypy-pandas.core.window.common]
-check_untyped_defs=False
-
-[mypy-pandas.core.window.expanding]
-check_untyped_defs=False
-
-[mypy-pandas.core.window.rolling]
-check_untyped_defs=False
-
 [mypy-pandas.io.clipboard]
 check_untyped_defs=False
 
@@ -223,15 +208,6 @@ check_untyped_defs=False
 [mypy-pandas.io.parsers]
 check_untyped_defs=False
 
-[mypy-pandas.io.pytables]
-check_untyped_defs=False
-
-[mypy-pandas.io.stata]
-check_untyped_defs=False
-
-[mypy-pandas.plotting._matplotlib.converter]
-check_untyped_defs=False
-
 [mypy-pandas.plotting._matplotlib.core]
 check_untyped_defs=False
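
A note on the Kahan tests in the rolling hunks above (GH 36031, GH 13254): the windowed mean/sum carries a compensation term so that floating-point rounding error stays bounded as the window slides instead of accumulating. The snippet below is a minimal standalone sketch of plain Kahan summation, independent of the Cython implementation the tests exercise; the helper name kahan_sum is ours, for illustration only.

def kahan_sum(values):
    """Compensated (Kahan) summation.

    The rounding error of each addition is captured in `comp` and fed
    back into the next addition, keeping the total error bounded
    independently of the number of terms.
    """
    total = 0.0
    comp = 0.0  # running compensation for lost low-order bits
    for x in values:
        y = x - comp  # re-inject the error lost on the previous step
        t = total + y  # big + small: low-order bits of y are rounded away
        comp = (t - total) - y  # algebraically 0; numerically the lost bits
        total = t
    return total


vals = [0.1] * 10
print(sum(vals))        # 0.9999999999999999 -- naive error accumulates
print(kahan_sum(vals))  # 1.0 -- the compensated sum is correctly rounded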
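
test_rolling_non_monotonic drives its windows through the pandas.api.indexers.BaseIndexer protocol: get_window_bounds returns two integer arrays of equal length, and output row i aggregates input positions [start[i], end[i]). For orientation only (this is not part of the patch), the built-in FixedForwardWindowIndexer exercises the same protocol from the caller's side:

import pandas as pd

s = pd.Series(range(5))

# Row i aggregates the forward-looking slice [i, i + 2).
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
print(s.rolling(indexer, min_periods=1).sum())
# 0    1.0   <- 0 + 1
# 1    3.0   <- 1 + 2
# 2    5.0   <- 2 + 3
# 3    7.0   <- 3 + 4
# 4    4.0   <- only position 4 is left in the window
# dtype: float64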
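
On the scripts/validate_unwanted_patterns.py hunks: the os.walk traversal and the extension/exclusion flags are dropped because pre-commit already resolves which files to check and passes them as positional arguments, so main() only loops over explicit paths and looks the requested checker up by name. Below is a minimal standalone sketch of that files-from-argv plus globals() dispatch pattern, using a hypothetical checker called no_tabs rather than the validations the real script ships:

import argparse
from typing import IO, Iterable, Tuple


def no_tabs(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
    # Hypothetical checker: yield (line_number, message) per violation.
    for line_number, line in enumerate(file_obj, start=1):
        if "\t" in line:
            yield line_number, "tab character found"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Toy unwanted-patterns checker.")
    parser.add_argument("paths", nargs="*", help="Files supplied by pre-commit.")
    parser.add_argument("--validation-type", required=True)
    args = parser.parse_args()

    # The same lookup-by-name dispatch the pandas script uses.
    check = globals()[args.validation_type]

    is_failed = False
    for path in args.paths:
        with open(path, encoding="utf-8") as file_obj:
            for line_number, msg in check(file_obj):
                is_failed = True
                print(f"{path}:{line_number}: {msg}")

    raise SystemExit(is_failed)

Keeping file selection in pre-commit means the include and exclude rules live in one place instead of being duplicated inside each script.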