diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md deleted file mode 100644 index 9b48918ff2f6d..0000000000000 --- a/.github/ISSUE_TEMPLATE/submit_question.md +++ /dev/null @@ -1,24 +0,0 @@ ---- - -name: Submit Question -about: Ask a general question about pandas -title: "QST:" -labels: "Usage Question, Needs Triage" - ---- - -- [ ] I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) on StackOverflow for similar questions. - -- [ ] I have asked my usage related question on [StackOverflow](https://stackoverflow.com). - ---- - -#### Question about pandas - -**Note**: If you'd still like to submit a question, please read [this guide]( -https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. - -```python -# Your code here, if applicable - -``` diff --git a/.github/ISSUE_TEMPLATE/submit_question.yml b/.github/ISSUE_TEMPLATE/submit_question.yml new file mode 100644 index 0000000000000..b227c9970f29e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/submit_question.yml @@ -0,0 +1,43 @@ +name: Submit Question +description: Ask a general question about pandas +title: "QST: " +labels: [Usage Question, Needs Triage] + +body: + - type: markdown + attributes: + value: > + Since [StackOverflow](https://stackoverflow.com) is better suited towards answering + usage questions, we ask that all usage questions are first asked on StackOverflow. + - type: checkboxes + attributes: + options: + - label: > + I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) + on StackOverflow for similar questions. + required: true + - label: > + I have asked my usage related question on [StackOverflow](https://stackoverflow.com). + required: true + - type: input + id: question-link + attributes: + label: Link to question on StackOverflow + validations: + required: true + - type: markdown + attributes: + value: --- + - type: textarea + id: question + attributes: + label: Question about pandas + description: > + **Note**: If you'd still like to submit a question, please read [this guide]( + https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing + how to provide the necessary information for us to reproduce your question. 
+ placeholder: | + ```python + # Your code here, if applicable + + ``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9fe5461aff37..e7a26a7905799 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,10 +48,6 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas - - name: Linting - run: ci/code_checks.sh lint - if: always() - - name: Checks on imported code run: ci/code_checks.sh code if: always() diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index adeae4c4f09dc..7429155fe5023 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -41,7 +41,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install git+https://github.com/numpy/numpy.git + pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy pip install git+https://github.com/pytest-dev/pytest.git pip install git+https://github.com/nedbat/coveragepy.git pip install cython python-dateutil pytz hypothesis pytest-xdist pytest-cov diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d72364b66e0e7..892b8d6dfa0c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -107,6 +107,11 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) types_or: [python, cython, rst] + - id: cython-casting + name: Check Cython casting is `<type>obj`, not `<type> obj` + language: pygrep + entry: '[a-zA-Z0-9*]> ' + files: (\.pyx|\.pxi.in)$ - id: incorrect-backticks name: Check for backticks incorrectly rendering because of missing spaces language: pygrep diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e5834f311d259..e4c4122f9ff43 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -538,8 +538,12 @@ class Interpolate: def setup(self, downcast): N = 10000 # this is the worst case, where every column has NaNs. - self.df = DataFrame(np.random.randn(N, 100)) - self.df.values[::2] = np.nan + arr = np.random.randn(N, 100) + # NB: we need to set values in array, not in df.values, otherwise + # the benchmark will be misleading for ArrayManager + arr[::2] = np.nan + + self.df = DataFrame(arr) self.df2 = DataFrame( { diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0d05d50bd7dd6..f9860de3b2bb3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -3,21 +3,18 @@ # Run checks related to code quality. # # This script is intended for both the CI and to check locally that code standards are -# respected. We are currently linting (PEP-8 and similar), looking for patterns of -# common mistakes (sphinx directives with missing blank lines, old style classes, -# unwanted imports...), we run doctests here (currently some files only), and we +# respected. We run doctests here (currently some files only), and we # validate formatting error in docstrings. # # Usage: # $ ./ci/code_checks.sh # run all checks -# $ ./ci/code_checks.sh lint # run linting only # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors # $ ./ci/code_checks.sh typing # run static type analysis -[[ -z "$1" || "$1" == "lint" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ - { echo "Unknown command $1. 
Usage: $0 [lint|code|doctests|docstrings|typing]"; exit 9999; } +[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|typing]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -40,23 +37,6 @@ if [[ "$GITHUB_ACTIONS" == "true" ]]; then INVGREP_PREPEND="##[error]" fi -### LINTING ### -if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - - # Check that cython casting is of the form `<type>obj` as opposed to `<type> obj`; - # it doesn't make a difference, but we want to be internally consistent. - # Note: this grep pattern is (intended to be) equivalent to the python - # regex r'(?<![ ->])> ' - MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG - invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # readability/casting: Warnings about C casting instead of C++ casting - # runtime/int: Warnings about using C number types instead of C++ ones - # build/include_subdir: Warnings about prefacing included header files with directory - -fi - ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then diff --git a/ci/deps/actions-39-numpydev.yaml b/ci/deps/actions-39-numpydev.yaml index 466ca6215f46a..03181a9d71d1d 100644 --- a/ci/deps/actions-39-numpydev.yaml +++ b/ci/deps/actions-39-numpydev.yaml @@ -11,11 +11,11 @@ dependencies: - hypothesis>=5.5.3 # pandas dependencies + - python-dateutil - pytz - pip - pip: - cython==0.29.21 # GH#34014 - - "git+git://github.com/dateutil/dateutil.git" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" - "numpy" diff --git a/doc/source/_static/style/appmaphead1.png b/doc/source/_static/style/appmaphead1.png new file mode 100644 index 0000000000000..905bcaa63e900 Binary files /dev/null and b/doc/source/_static/style/appmaphead1.png differ diff --git a/doc/source/_static/style/appmaphead2.png b/doc/source/_static/style/appmaphead2.png new file mode 100644 index 0000000000000..9adde61908378 Binary files /dev/null and b/doc/source/_static/style/appmaphead2.png differ diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 721b1af126709..53e66eeff51e6 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -23,11 +23,10 @@ contributing them to the project:: ./ci/code_checks.sh -The script verifies the linting of code files, it looks for common mistake patterns -(like missing spaces around sphinx directives that make the documentation not -being rendered properly) and it also validates the doctests. It is possible to -run the checks independently by using the parameters ``lint``, ``patterns`` and -``doctests`` (e.g. ``./ci/code_checks.sh lint``). +The script validates the doctests, formatting in docstrings, static typing, and +imported modules. It is possible to run the checks independently by using the +parameters ``docstrings``, ``code``, ``typing``, and ``doctests`` +(e.g. ``./ci/code_checks.sh doctests``). 
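The removed `lint` step above enforced a single spacing style for Cython casts. For illustration, a minimal Python sketch of the same check, built from the regex quoted in the removed comment (the sample lines are hypothetical):

```python
import re

# Equivalent of the grep pattern '[a-zA-Z0-9*]> ' (python regex r"(?<![ ->])> "):
# flag casts written as `<int64_t> obj` so that `<int64_t>obj` remains the only
# accepted form. The lookbehind keeps `->` return annotations and spaced
# comparisons like `a > b` unflagged.
pattern = re.compile(r"(?<![ ->])> ")

lines = [
    "out = <int64_t>blkno",   # consistent: no space after the cast
    "out = <int64_t> blkno",  # flagged: space between cast and object
]
for line in lines:
    if pattern.search(line):
        print(f"cython-casting violation: {line!r}")
```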
In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d5b45f5953453..f7beff2dd81cc 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -106,7 +106,7 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``. See the `extension dtype source`_ for interface definition. -:class:`pandas.api.extension.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. +:class:`pandas.api.extensions.ExtensionDtype` can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate ``Series`` and ``.astype()`` with a registered string name, for example ``'category'`` is a registered string accessor for the ``CategoricalDtype``. @@ -125,7 +125,7 @@ data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for ``Categorical``). They may be backed by none, one, or many NumPy arrays. For example, -``pandas.Categorical`` is an extension array backed by two arrays, +:class:`pandas.Categorical` is an extension array backed by two arrays, one for codes and one for categories. An array of IPv6 addresses may be backed by a NumPy structured array with two fields, one for the lower 64 bits and one for the upper 64 bits. Or they may be backed diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 1a8c21a2c1a74..6e58f487d5f4a 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -170,6 +170,7 @@ Numeric Index :toctree: api/ :template: autosummary/class_without_autosummary.rst + NumericIndex RangeIndex Int64Index UInt64Index diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 7b790daea37ff..ac4fc314c6c07 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -36,6 +36,8 @@ Style application Styler.apply Styler.applymap + Styler.apply_index + Styler.applymap_index Styler.format Styler.hide_index Styler.hide_columns diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 3b33ebe701037..535b503e4372c 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -7,7 +7,7 @@ MultiIndex / advanced indexing ****************************** This section covers :ref:`indexing with a MultiIndex <advanced.hierarchical>` -and :ref:`other advanced indexing features <indexing.index_types>`. +and :ref:`other advanced indexing features <advanced.index_types>`. See the :ref:`Indexing and Selecting Data <indexing>` for general indexing documentation. @@ -738,7 +738,7 @@ faster than fancy indexing. %timeit ser.iloc[indexer] %timeit ser.take(indexer) -.. _indexing.index_types: +.. _advanced.index_types: Index types ----------- @@ -749,7 +749,14 @@ and documentation about ``TimedeltaIndex`` is found :ref:`here <timedeltas.index>` Int64Index and RangeIndex ~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + + In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types + instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types + will be removed. See :ref:`here <advanced.numericindex>` for more. + ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + :class:`Int64Index` is a fundamental basic index in pandas. This is an immutable array implementing an ordered, sliceable set. :class:`RangeIndex` is a sub-class of ``Int64Index`` that provides the default index for all ``NDFrame`` objects. ``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to Python `range types <https://docs.python.org/3/library/stdtypes.html#typesseq-range>`__. -.. _indexing.float64index: +.. 
_advanced.float64index: Float64Index ~~~~~~~~~~~~ +.. note:: + + In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types + instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types + will be removed. See :ref:`here <advanced.numericindex>` for more. + ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + By default a :class:`Float64Index` will be automatically created when passing floating, or mixed-integer-floating values in index creation. This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the same. @@ -956,6 +970,38 @@ If you need integer based selection, you should use ``iloc``: dfir.iloc[0:5] + +.. _advanced.numericindex: + +NumericIndex +~~~~~~~~~~~~ + +.. versionadded:: 1.4.0 + +.. note:: + + In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types + instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types + will be removed. + ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + +:class:`NumericIndex` is an index type that can hold data of any numpy int/uint/float dtype. For example: + +.. ipython:: python + + idx = pd.NumericIndex([1, 2, 4, 5], dtype="int8") + idx + ser = pd.Series(range(4), index=idx) + ser + +``NumericIndex`` works the same way as the existing ``Int64Index``, ``Float64Index`` and +``UInt64Index`` except that it can hold any numpy int, uint or float dtype. + +Until pandas 2.0, you will have to call ``NumericIndex`` explicitly in order to use it, like in the example above. +In pandas 2.0, ``NumericIndex`` will become the default pandas numeric index type and will automatically be used where appropriate. + +Please notice that ``NumericIndex`` *cannot* hold pandas numeric dtypes (:class:`Int64Dtype`, :class:`Int32Dtype` etc.). + .. _advanced.intervalindex: IntervalIndex diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6f9d8eb3474c2..0105cf99193dd 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1141,7 +1141,7 @@ Categorical index ``CategoricalIndex`` is a type of index that is useful for supporting indexing with duplicates. This is a container around a ``Categorical`` and allows efficient indexing and storage of an index with a large number of duplicated elements. -See the :ref:`advanced indexing docs <indexing.categoricalindex>` for a more detailed +See the :ref:`advanced indexing docs <advanced.categoricalindex>` for a more detailed explanation. Setting the index will create a ``CategoricalIndex``: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 7d1d03fe020a6..e74272c825e46 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -474,7 +474,15 @@ rows and columns: .. ipython:: python - df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) + table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) + table + +Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame +as having a multi-level index: + +.. ipython:: python + + table.stack() .. 
_reshaping.crosstabulations: diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 52d99533c1f60..b2b3678e48534 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -294,7 +294,7 @@ To convert back to sparse SciPy matrix in COO format, you can use the :meth:`Dat sdf.sparse.to_coo() -meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sparse values indexed by a :class:`MultiIndex` to a :class:`scipy.sparse.coo_matrix`. +:meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sparse values indexed by a :class:`MultiIndex` to a :class:`scipy.sparse.coo_matrix`. The method requires a ``MultiIndex`` with two or more levels. diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f77d134d75988..10ef65a68eefa 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -225,13 +225,15 @@ "\n", "- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n", "- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n", - "- Using the [.apply()][apply] and [.applymap()][applymap] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). These cannot be used on column header rows or indexes, but only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.applymap()][dfapplymap].\n", + "- Using the [.apply()][apply] and [.applymap()][applymap] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.applymap_index()][applymapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.applymap()][dfapplymap].\n", "\n", "[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n", "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n", "[td_class]: ../reference/api/pandas.io.formats.style.Styler.set_td_classes.rst\n", "[apply]: ../reference/api/pandas.io.formats.style.Styler.apply.rst\n", "[applymap]: ../reference/api/pandas.io.formats.style.Styler.applymap.rst\n", + "[applyindex]: ../reference/api/pandas.io.formats.style.Styler.apply_index.rst\n", + "[applymapindex]: ../reference/api/pandas.io.formats.style.Styler.applymap_index.rst\n", "[dfapply]: ../reference/api/pandas.DataFrame.apply.rst\n", "[dfapplymap]: ../reference/api/pandas.DataFrame.applymap.rst" ] @@ -432,6 +434,8 @@ "source": [ "## Styler Functions\n", "\n", + "### Acting on Data\n", + "\n", "We use the following methods to pass your style functions. 
Both of those methods take a function (and some other keyword arguments) and apply it to the DataFrame in a certain way, rendering CSS styles.\n", "\n", "- [.applymap()][applymap] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", @@ -533,6 +537,18 @@ " .apply(highlight_max, props='color:white;background-color:purple', axis=None)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Hidden cell to avoid CSS clashes and later code overriding previous formatting \n", + "s2.set_uuid('after_apply_again')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -548,6 +564,33 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Acting on the Index and Column Headers\n", + "\n", + "Similar application is achieved for headers by using:\n", + " \n", + "- [.applymap_index()][applymapindex] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", + "- [.apply_index()][applyindex] (level-wise): accepts a function that takes a Series and returns a Series, or numpy array with an identical shape where each element is a string with a CSS attribute-value pair. This method passes each level of your Index one-at-a-time. To style the index use `axis=0` and to style the column headers use `axis=1`.\n", + "\n", + "You can select a `level` of a `MultiIndex` but currently no similar `subset` application is available for these methods.\n", + "\n", + "[applyindex]: ../reference/api/pandas.io.formats.style.Styler.apply_index.rst\n", + "[applymapindex]: ../reference/api/pandas.io.formats.style.Styler.applymap_index.rst" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s2.applymap_index(lambda v: \"color:pink;\" if v>4 else \"color:darkblue;\", axis=0)\n", + "s2.apply_index(lambda s: np.where(s.isin([\"A\", \"B\"]), \"color:pink;\", \"color:darkblue;\"), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, @@ -1931,6 +1974,7 @@ } ], "metadata": { + "celltoolbar": "Edit Metadata", "kernelspec": { "display_name": "Python 3", "language": "python", diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 3c6b70fb21383..b2596358d0c9d 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -310,7 +310,7 @@ Float64Index API change - Added a new index type, ``Float64Index``. This will be automatically created when passing floating values in index creation. This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the - same. See :ref:`the docs<indexing.float64index>`, (:issue:`263`) + same. See :ref:`the docs<advanced.float64index>`, (:issue:`263`) Construction is by default for floating type values. diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 269854111373f..cbf5b7703bd79 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -168,7 +168,7 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. ordered=False, name='B', dtype='category') -See the :ref:`documentation <indexing.categoricalindex>` for more. (:issue:`7629`, :issue:`10038`, :issue:`10039`) +See the :ref:`documentation <advanced.categoricalindex>` for more. (:issue:`7629`, :issue:`10038`, :issue:`10039`) .. 
_whatsnew_0161.enhancements.sample: diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index f54cea744f4d2..a94eab960418b 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -15,14 +15,17 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`) -- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`) +- Regression in updating values of :class:`Series` using boolean index, created by using :meth:`DataFrame.pop` (:issue:`42530`) - Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`) -- Fixed regression in :meth:`DataFrame.shift` where TypeError occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) +- Fixed regression in :meth:`DataFrame.shift` where ``TypeError`` occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) - Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`) - Regression in :meth:`DataFrame.drop` does nothing if :class:`MultiIndex` has duplicates and indexer is a tuple or list of tuples (:issue:`42771`) -- Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to None (:issue:`42387`) +- Fixed regression where :func:`read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to ``None`` (:issue:`42387`) - Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`) - Fixed regression in :meth:`.Styler.highlight_min` and :meth:`.Styler.highlight_max` where ``pandas.NA`` was not successfully ignored (:issue:`42650`) +- Fixed regression in :func:`concat` where ``copy=False`` was not honored in ``axis=1`` Series concatenation (:issue:`42501`) +- Regression in :meth:`Series.nlargest` and :meth:`Series.nsmallest` with nullable integer or float dtype (:issue:`42816`) +- Fixed regression in :meth:`Series.quantile` with :class:`Int64Dtype` (:issue:`42626`) .. --------------------------------------------------------------------------- @@ -30,7 +33,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :meth:`pandas.read_excel` modifies the dtypes dictionary when reading a file with duplicate columns (:issue:`42462`) +- Bug in :func:`read_excel` modifies the dtypes dictionary when reading a file with duplicate columns (:issue:`42462`) - 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and ``window`` is an offset that covers all the rows (:issue:`42753`) - :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`) @@ -44,7 +47,6 @@ Bug fixes Other ~~~~~ -- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1. - .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8754286ee7d11..f0af60f80edd5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -15,10 +15,53 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_140.enhancements.enhancement1: +.. _whatsnew_140.enhancements.numeric_index: -enhancement1 -^^^^^^^^^^^^ +More flexible numeric dtypes for indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Until now, it has only been possible to create numeric indexes with int64/float64/uint64 dtypes. +It is now possible to create an index of any numpy int/uint/float dtype using the new :class:`NumericIndex` index type (:issue:`41153`): + +.. ipython:: python + + pd.NumericIndex([1, 2, 3], dtype="int8") + pd.NumericIndex([1, 2, 3], dtype="uint32") + pd.NumericIndex([1, 2, 3], dtype="float32") + +In order to maintain backwards compatibility, calls to the base :class:`Index` will in +pandas 1.x return :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index`. +For example, the code below returns an ``Int64Index`` with dtype ``int64``: + +.. code-block:: ipython + + In [1]: pd.Index([1, 2, 3], dtype="int8") + Int64Index([1, 2, 3], dtype='int64') + +For the duration of pandas 1.x, in order to maintain backwards compatibility, all +operations that until now have returned :class:`Int64Index`, :class:`UInt64Index` and +:class:`Float64Index` will continue to do so. This means that in order to use +``NumericIndex``, you will have to call ``NumericIndex`` explicitly. For example, the series below +will have an ``Int64Index``: + +.. code-block:: ipython + + In [2]: ser = pd.Series([1, 2, 3], index=[1, 2, 3]) + In [3]: ser.index + Int64Index([1, 2, 3], dtype='int64') + +Instead, if you want to use a ``NumericIndex``, you should do: + +.. ipython:: python + + idx = pd.NumericIndex([1, 2, 3], dtype="int8") + ser = pd.Series([1, 2, 3], index=idx) + ser.index + +In pandas 2.0, :class:`NumericIndex` will become the default numeric index type and +``Int64Index``, ``UInt64Index`` and ``Float64Index`` will be removed. + +See :ref:`here <advanced.numericindex>` for more. .. _whatsnew_140.enhancements.enhancement2: @@ -38,6 +81,7 @@ Other enhancements - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. 
See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`) - Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`) - Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`) +- :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index` added to allow conditional styling of index and column header values for HTML and LaTeX (:issue:`41893`) - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - @@ -162,6 +206,7 @@ Deprecations - Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`) - Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`) - Deprecated the 'kind' argument in :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer`, :meth:`Index.slice_locs`; in a future version passing 'kind' will raise (:issue:`42857`) +- Deprecated dropping of nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`EWM` aggregations (:issue:`42738`) - Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`) - @@ -176,6 +221,7 @@ Performance improvements - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) - Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) - Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) +- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`) - .. 
--------------------------------------------------------------------------- @@ -241,6 +287,7 @@ Indexing - Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`) - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`) - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) +- Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Missing @@ -260,6 +307,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) +- Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) - Period diff --git a/pandas/__init__.py b/pandas/__init__.py index 43f05617584cc..d8df7a42911ab 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -75,6 +75,7 @@ UInt64Index, RangeIndex, Float64Index, + NumericIndex, MultiIndex, IntervalIndex, TimedeltaIndex, diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 3feefe7ac8ff4..c589985a6d4b1 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -33,6 +33,8 @@ class BlockPlacement: @property def as_array(self) -> np.ndarray: ... @property + def as_slice(self) -> slice: ... + @property def is_slice_like(self) -> bool: ... @overload def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ... 
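The :issue:`42881` entry above pairs with the ``Index.drop`` change later in this diff, where the missing-label mask is rendered as a Python list. A small sketch of the behavior the fix targets; the frame and labels are hypothetical and the printed message is indicative only:

```python
import pandas as pd

# GH#42881: missing labels in the KeyError raised by DataFrame.drop are
# rendered as a Python list, so multiple labels are separated by commas.
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
try:
    df.drop(columns=["x", "y"])
except KeyError as err:
    print(err)  # roughly: "['x', 'y'] not found in axis"
```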
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index ba59c50142550..2b498260d94ee 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -395,7 +395,7 @@ def get_blkno_indexers( cdef: int64_t cur_blkno Py_ssize_t i, start, stop, n, diff, tot_len - object blkno + int64_t blkno object group_dict = defaultdict(list) n = blknos.shape[0] diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index d730084692dd4..4d3bdde357e88 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -15,10 +15,7 @@ from numpy cimport ( cnp.import_array() -from pandas._libs.util cimport ( - is_array, - set_array_not_contiguous, -) +from pandas._libs.util cimport is_array from pandas._libs.lib import is_scalar @@ -350,151 +347,3 @@ cdef class Slider: cdef reset(self): self.buf.data = self.orig_data self.buf.shape[0] = 0 - - -def apply_frame_axis0(object frame, object f, object names, - const int64_t[:] starts, const int64_t[:] ends): - cdef: - BlockSlider slider - Py_ssize_t i, n = len(starts) - list results - object piece - dict item_cache - - # We have already checked that we don't have a MultiIndex before calling - assert frame.index.nlevels == 1 - - results = [] - - slider = BlockSlider(frame) - - mutated = False - item_cache = slider.dummy._item_cache - try: - for i in range(n): - slider.move(starts[i], ends[i]) - - item_cache.clear() # ugh - chunk = slider.dummy - object.__setattr__(chunk, 'name', names[i]) - - piece = f(chunk) - - # Need to infer if low level index slider will cause segfaults - require_slow_apply = i == 0 and piece is chunk - try: - if piece.index is not chunk.index: - mutated = True - except AttributeError: - # `piece` might not have an index, could be e.g. an int - pass - - if not is_scalar(piece): - # Need to copy data to avoid appending references - try: - piece = piece.copy(deep="all") - except (TypeError, AttributeError): - pass - - results.append(piece) - - # If the data was modified inplace we need to - # take the slow path to not risk segfaults - # we have already computed the first piece - if require_slow_apply: - break - finally: - slider.reset() - - return results, mutated - - -cdef class BlockSlider: - """ - Only capable of sliding on axis=0 - """ - cdef: - object frame, dummy, index, block - list blocks, blk_values - ndarray orig_blklocs, orig_blknos - ndarray values - Slider idx_slider - char **base_ptrs - int nblocks - Py_ssize_t i - - def __init__(self, object frame): - self.frame = frame - self.dummy = frame[:0] - self.index = self.dummy.index - - # GH#35417 attributes we need to restore at each step in case - # the function modified them. - mgr = self.dummy._mgr - self.orig_blklocs = mgr.blklocs - self.orig_blknos = mgr.blknos - self.blocks = [x for x in self.dummy._mgr.blocks] - - self.blk_values = [block.values for block in self.dummy._mgr.blocks] - - for values in self.blk_values: - set_array_not_contiguous(values) - - self.nblocks = len(self.blk_values) - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference to a 1-d - # ndarray like datetime / timedelta / period. 
- self.idx_slider = Slider( - self.frame.index._index_data, self.dummy.index._index_data) - - self.base_ptrs = <char**>malloc(sizeof(char*) * self.nblocks) - for i, block in enumerate(self.blk_values): - self.base_ptrs[i] = (<ndarray>block).data - - def __dealloc__(self): - free(self.base_ptrs) - - cdef move(self, int start, int end): - cdef: - ndarray arr - Py_ssize_t i - - self._restore_blocks() - - # move blocks - for i in range(self.nblocks): - arr = self.blk_values[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] + arr.strides[1] * start - arr.shape[1] = end - start - - # move and set the index - self.idx_slider.move(start, end) - - object.__setattr__(self.index, '_index_data', self.idx_slider.buf) - self.index._engine.clear_mapping() - self.index._cache.clear() # e.g. inferred_freq must go - - cdef reset(self): - cdef: - ndarray arr - Py_ssize_t i - - self._restore_blocks() - - for i in range(self.nblocks): - arr = self.blk_values[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] - arr.shape[1] = 0 - - cdef _restore_blocks(self): - """ - Ensure that we have the original blocks, blknos, and blklocs. - """ - mgr = self.dummy._mgr - mgr.blocks = tuple(self.blocks) - mgr._blklocs = self.orig_blklocs - mgr._blknos = self.orig_blknos diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cf530c8c07440..06c3d823f72d2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -927,7 +927,6 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->itemName = itemName; GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index++; itemName = attr; break; diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 7a55dd69ba7b7..f11d9c10ea85f 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -50,6 +50,7 @@ Int64Index, IntervalIndex, MultiIndex, + NumericIndex, RangeIndex, Series, UInt64Index, @@ -105,7 +106,6 @@ use_numexpr, with_csv_dialect, ) -from pandas.core.api import NumericIndex from pandas.core.arrays import ( DatetimeArray, PandasArray, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 52c5790e1fc24..4f9dd61b8e0da 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ Literal, Union, cast, + final, ) from warnings import warn @@ -1209,12 +1210,15 @@ def __init__(self, obj, n: int, keep: str): def compute(self, method: str) -> DataFrame | Series: raise NotImplementedError + @final def nlargest(self): return self.compute("nlargest") + @final def nsmallest(self): return self.compute("nsmallest") + @final + @staticmethod def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: """ @@ -1253,6 +1257,18 @@ def compute(self, method: str) -> Series: dropped = self.obj.dropna() + if is_extension_array_dtype(dropped.dtype): + # GH#41816 bc we have dropped NAs above, MaskedArrays can use the + # numpy logic. 
+ from pandas.core.arrays import BaseMaskedArray + + arr = dropped._values + if isinstance(arr, BaseMaskedArray): + ser = type(dropped)(arr._data, index=dropped.index, name=dropped.name) + + result = type(self)(ser, n=self.n, keep=self.keep).compute(method) + return result.astype(arr.dtype) + # slow method if n >= len(self.obj): ascending = method == "nsmallest" diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 32c50ed38eba0..c5e96f32e261f 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -181,5 +181,10 @@ def _quantile_ea_fallback( assert res.ndim == 2 assert res.shape[0] == 1 res = res[0] - out = type(values)._from_sequence(res, dtype=values.dtype) + try: + out = type(values)._from_sequence(res, dtype=values.dtype) + except TypeError: + # GH#42626: not able to safely cast Int64 + # for floating point output + out = np.atleast_2d(np.asarray(res, dtype=np.float64)) return out diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b2a4fecbf084d..860d4f6a5dcc2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -211,7 +211,6 @@ class NDFrame(PandasObject, indexing.IndexingMixin): "_is_copy", "_subtyp", "_name", - "_index", "_default_kind", "_default_fill_value", "_metadata", diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6d8881d12dbb7..f9ba34e916a04 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -85,17 +85,13 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import ( - base, - grouper, -) +from pandas.core.groupby import grouper from pandas.core.indexes.api import ( CategoricalIndex, Index, MultiIndex, ensure_index, ) -from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -718,60 +714,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() - result_values = None - - if data.ndim == 2 and any( - isinstance(x, ExtensionArray) for x in data._iter_column_arrays() - ): - # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 - # if we pass EA instead of ndarray - # TODO: can we have a workaround for EAs backed by ndarray? - pass - - elif isinstance(data._mgr, ArrayManager): - # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0 - # for now -> relies on BlockManager internals - pass - elif ( - com.get_callable_name(f) not in base.plotting_methods - and isinstance(splitter, FrameSplitter) - and axis == 0 - # fast_apply/libreduction doesn't allow non-numpy backed indexes - and not data.index._has_complex_internals - ): - try: - sdata = splitter.sorted_data - result_values, mutated = splitter.fast_apply(f, sdata, group_keys) - - except IndexError: - # This is a rare case in which re-running in python-space may - # make a difference, see test_apply_mutate.test_mutate_groups - pass - - else: - # If the fast apply path could be used we can return here. - # Otherwise we need to fall back to the slow implementation. - if len(result_values) == len(group_keys): - return group_keys, result_values, mutated - - if result_values is None: - # result_values is None if fast apply path wasn't taken - # or fast apply aborted with an unexpected exception. 
- # In either case, initialize the result list and perform - # the slow iteration. - result_values = [] - skip_first = False - else: - # If result_values is not None we're in the case that the - # fast apply loop was broken prematurely but we have - # already the result for the first group which we can reuse. - skip_first = True + result_values = [] # This calls DataSplitter.__iter__ zipped = zip(group_keys, splitter) - if skip_first: - # pop the first item from the front of the iterator - next(zipped) for key, group in zipped: object.__setattr__(group, "name", key) @@ -1290,11 +1236,6 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series: class FrameSplitter(DataSplitter): - def fast_apply(self, f: F, sdata: FrameOrSeries, names): - # must return keys::list, values::list, mutated::bool - starts, ends = lib.generate_slices(self.slabels, self.ngroups) - return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # Fastpath equivalent to: # if self.axis == 0: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 08946bf5a8033..b67bb2cd3cf37 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import ( algos as libalgos, index as libindex, @@ -156,7 +158,6 @@ from pandas.io.formats.printing import ( PrettyDict, default_pprint, - format_object_attrs, format_object_summary, pprint_thing, ) @@ -1214,7 +1215,20 @@ def _format_attrs(self) -> list[tuple[str_t, str_t | int]]: """ Return a list of tuples of the (attr,formatted_value). """ - return format_object_attrs(self, include_dtype=not self._is_multi) + attrs: list[tuple[str_t, str_t | int]] = [] + + if not self._is_multi: + attrs.append(("dtype", f"'{self.dtype}'")) + + if self.name is not None: + attrs.append(("name", default_pprint(self.name))) + elif self._is_multi and any(x is not None for x in self.names): + attrs.append(("names", default_pprint(self.names))) + + max_seq_items = get_option("display.max_seq_items") or len(self) + if len(self) > max_seq_items: + attrs.append(("length", len(self))) + return attrs @final def _mpl_repr(self) -> np.ndarray: @@ -6290,7 +6304,7 @@ def drop(self, labels, errors: str_t = "raise") -> Index: mask = indexer == -1 if mask.any(): if errors != "ignore": - raise KeyError(f"{labels[mask]} not found in axis") + raise KeyError(f"{list(labels[mask])} not found in axis") indexer = indexer[~mask] return self.delete(indexer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index bce49f2e923e9..1a6a5f08841b9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -283,7 +283,7 @@ def _is_dtype_compat(self, other) -> Categorical: @doc(Index.astype) def astype(self, dtype: Dtype, copy: bool = True) -> Index: - from pandas.core.api import NumericIndex + from pandas import NumericIndex dtype = pandas_dtype(dtype) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 60e66036731a6..4d80480468adb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3568,7 +3568,7 @@ def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) if ( any(-1 in code for code in self.codes) - and any(-1 in code for code in self.codes) + and any(-1 in code for code in other.codes) or self.has_duplicates or other.has_duplicates ): diff --git 
a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4b43ed92441a1..1802a4d58a34a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,7 +29,6 @@ is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_sparse, ) from pandas.core.dtypes.concat import ( cast_to_common_type, @@ -46,6 +45,7 @@ DatetimeArray, ExtensionArray, ) +from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( ArrayManager, @@ -260,7 +260,10 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra mgr_shape_list[ax] = len(indexer) mgr_shape = tuple(mgr_shape_list) + has_column_indexer = False + if 0 in indexers: + has_column_indexer = True ax0_indexer = indexers.pop(0) blknos = algos.take_nd(mgr.blknos, ax0_indexer, fill_value=-1) blklocs = algos.take_nd(mgr.blklocs, ax0_indexer, fill_value=-1) @@ -270,9 +273,6 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] - # error: Incompatible types in assignment (expression has type "None", variable - # has type "ndarray") - ax0_indexer = None # type: ignore[assignment] blknos = mgr.blknos blklocs = mgr.blklocs @@ -288,6 +288,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra shape = tuple(shape_list) if blkno == -1: + # only reachable in the `0 in indexers` case unit = JoinUnit(None, shape) else: blk = mgr.blocks[blkno] @@ -302,7 +303,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra # placement was sequential before. ( ( - ax0_indexer is None + not has_column_indexer and blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1 ) @@ -330,6 +331,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra class JoinUnit: def __init__(self, block, shape: Shape, indexers=None): # Passing shape explicitly is required for cases when block is None. + # Note: block is None implies indexers is None, but not vice-versa if indexers is None: indexers = {} self.block = block @@ -358,7 +360,7 @@ def dtype(self): return blk.dtype return ensure_dtype_can_hold_na(blk.dtype) - def is_valid_na_for(self, dtype: DtypeObj) -> bool: + def _is_valid_na_for(self, dtype: DtypeObj) -> bool: """ Check that we are all-NA of a type/dtype that is compatible with this dtype. Augments `self.is_na` with an additional check of the type of NA values. @@ -389,11 +391,8 @@ def is_na(self) -> bool: if not self.block._can_hold_na: return False - # Usually it's enough to check but a small fraction of values to see if - # a block is NOT null, chunks should help in such cases. 1000 value - # was chosen rather arbitrarily. 
values = self.block.values - if is_sparse(self.block.values.dtype): + if isinstance(self.block.values.dtype, SparseDtype): return False elif self.block.is_extension: # TODO(EA2D): no need for special case with 2D EAs @@ -411,7 +410,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: else: fill_value = upcasted_na - if self.is_valid_na_for(empty_dtype): + if self._is_valid_na_for(empty_dtype): + # note: always holds when self.block is None blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype("object"): @@ -592,13 +592,16 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: _concatenate_join_units (which uses `concat_compat`). """ + first = join_units[0].block + if first is None: + return False return ( - # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa + # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64 + all(type(ju.block) is type(first) for ju in join_units) and # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform all( - is_dtype_equal(ju.block.dtype, join_units[0].block.dtype) + is_dtype_equal(ju.block.dtype, first.dtype) # GH#42092 we only want the dtype_equal check for non-numeric blocks # (for now, may change but that would need a deprecation) or ju.block.dtype.kind in ["b", "i", "u"] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7c8b289e6eb87..c71c3c3912812 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1178,8 +1178,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: warnings.warn( "DataFrame is highly fragmented. This is usually the result " "of calling `frame.insert` many times, which has poor performance. " - "Consider using pd.concat instead. To get a de-fragmented frame, " - "use `newframe = frame.copy()`", + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. To get a de-fragmented frame, use `newframe = frame.copy()`", PerformanceWarning, stacklevel=5, ) @@ -1714,6 +1714,13 @@ def array_values(self): """The array that Series.array returns""" return self._block.array_values + def get_numeric_data(self, copy: bool = False): + if self._block.is_numeric: + if copy: + return self.copy() + return self + return self.make_empty() + @property def _can_hold_na(self) -> bool: return self._block._can_hold_na diff --git a/pandas/core/missing.py b/pandas/core/missing.py index f144821220e4b..1a07b5614eb38 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -215,7 +215,7 @@ def interpolate_array_2d( **kwargs, ): """ - Wrapper to dispatch to either interpolate_2d or interpolate_2d_with_fill. + Wrapper to dispatch to either interpolate_2d or _interpolate_2d_with_fill. """ try: m = clean_fill_method(method) @@ -237,7 +237,7 @@ def interpolate_array_2d( else: assert index is not None # for mypy - interp_values = interpolate_2d_with_fill( + interp_values = _interpolate_2d_with_fill( data=data, index=index, axis=axis, @@ -251,7 +251,7 @@ def interpolate_array_2d( return interp_values -def interpolate_2d_with_fill( +def _interpolate_2d_with_fill( data: np.ndarray, # floating dtype index: Index, axis: int, @@ -263,11 +263,11 @@ def interpolate_2d_with_fill( **kwargs, ) -> np.ndarray: """ - Column-wise application of interpolate_1d. + Column-wise application of _interpolate_1d. 
Notes ----- - The signature does differs from interpolate_1d because it only + The signature differs from _interpolate_1d because it only includes what is needed for Block.interpolate. """ # validate the interp method @@ -276,13 +276,44 @@ if is_valid_na_for_dtype(fill_value, data.dtype): fill_value = na_value_for_dtype(data.dtype, compat=False) + if method == "time": + if not needs_i8_conversion(index.dtype): + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a " + "DatetimeIndex" + ) + method = "values" + + valid_limit_directions = ["forward", "backward", "both"] + limit_direction = limit_direction.lower() + if limit_direction not in valid_limit_directions: + raise ValueError( + "Invalid limit_direction: expecting one of " + f"{valid_limit_directions}, got '{limit_direction}'." + ) + + if limit_area is not None: + valid_limit_areas = ["inside", "outside"] + limit_area = limit_area.lower() + if limit_area not in valid_limit_areas: + raise ValueError( + f"Invalid limit_area: expecting one of {valid_limit_areas}, got " + f"{limit_area}." + ) + + # default limit is unlimited GH #16282 + limit = algos.validate_limit(nobs=None, limit=limit) + + indices = _index_to_interp_indices(index, method) + def func(yvalues: np.ndarray) -> np.ndarray: # process 1-d slices in the axis direction, returning it # should the axis argument be handled below in apply_along_axis? - # i.e. not an arg to interpolate_1d - return interpolate_1d( - xvalues=index, + # i.e. not an arg to _interpolate_1d + return _interpolate_1d( + indices=indices, yvalues=yvalues, method=method, limit=limit, @@ -297,8 +328,30 @@ def func(yvalues: np.ndarray) -> np.ndarray: return np.apply_along_axis(func, axis, data) -def interpolate_1d( - xvalues: Index, +def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: + """ + Convert Index to ndarray of indices to pass to NumPy/SciPy. + """ + xarr = index._values + if needs_i8_conversion(xarr.dtype): + # GH#1646 for dt64tz + xarr = xarr.view("i8") + + if method == "linear": + inds = xarr + inds = cast(np.ndarray, inds) + else: + inds = np.asarray(xarr) + + if method in ("values", "index"): + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + + return inds + + +def _interpolate_1d( + indices: np.ndarray, + yvalues: np.ndarray, method: str | None = "linear", limit: int | None = None, @@ -311,51 +364,23 @@ ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs - xvalues and yvalues will each be 1-d arrays of the same length. + indices and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ + invalid = isna(yvalues) valid = ~invalid if not valid.any(): - result = np.empty(xvalues.shape, dtype=np.float64) + result = np.empty(indices.shape, dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues - if method == "time": - if not needs_i8_conversion(xvalues.dtype): - raise ValueError( - "time-weighted interpolation only works " - "on Series or DataFrames with a " - "DatetimeIndex" - ) - method = "values" - - valid_limit_directions = ["forward", "backward", "both"] - limit_direction = limit_direction.lower() - if limit_direction not in valid_limit_directions: - raise ValueError( - "Invalid limit_direction: expecting one of " - f"{valid_limit_directions}, got '{limit_direction}'." 
- ) - - if limit_area is not None: - valid_limit_areas = ["inside", "outside"] - limit_area = limit_area.lower() - if limit_area not in valid_limit_areas: - raise ValueError( - f"Invalid limit_area: expecting one of {valid_limit_areas}, got " - f"{limit_area}." - ) - - # default limit is unlimited GH #16282 - limit = algos.validate_limit(nobs=None, limit=limit) - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) @@ -369,8 +394,6 @@ def interpolate_1d( last_valid_index = len(yvalues) end_nans = set(range(1 + last_valid_index, len(valid))) - mid_nans = all_nans - start_nans - end_nans - # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. @@ -396,6 +419,7 @@ def interpolate_1d( preserve_nans |= start_nans | end_nans elif limit_area == "outside": # preserve NaNs on the inside + mid_nans = all_nans - start_nans - end_nans preserve_nans |= mid_nans # sort preserve_nans and convert to list @@ -403,37 +427,18 @@ def interpolate_1d( result = yvalues.copy() - # xarr to pass to NumPy/SciPy - xarr = xvalues._values - if needs_i8_conversion(xarr.dtype): - # GH#1646 for dt64tz - xarr = xarr.view("i8") - - if method == "linear": - inds = xarr - else: - inds = np.asarray(xarr) - - if method in ("values", "index"): - if inds.dtype == np.object_: - inds = lib.maybe_convert_objects(inds) - if method in NP_METHODS: # np.interp requires sorted X values, #21037 - # error: Argument 1 to "argsort" has incompatible type "Union[ExtensionArray, - # Any]"; expected "Union[Union[int, float, complex, str, bytes, generic], - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - indexer = np.argsort(inds[valid]) # type: ignore[arg-type] + indexer = np.argsort(indices[valid]) result[invalid] = np.interp( - inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + indices[invalid], indices[valid][indexer], yvalues[valid][indexer] ) else: result[invalid] = _interpolate_scipy_wrapper( - inds[valid], + indices[valid], yvalues[valid], - inds[invalid], + indices[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 560735b593cd1..3be1a04d9e2a4 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -504,7 +504,7 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - df = cons(data, index=index) + df = cons(data, index=index, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/core/series.py b/pandas/core/series.py index 6efd1f65c2264..7766a7ea362eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -519,8 +519,6 @@ def _constructor_expanddim(self) -> type[DataFrame]: def _can_hold_na(self) -> bool: return self._mgr._can_hold_na - _index: Index | None = None - def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: """ Override generic, we want to set the _typ here. 
@@ -549,7 +547,6 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: # or not be a DatetimeIndex pass - object.__setattr__(self, "_index", labels) if not fastpath: # The ensure_index call above ensures we have an Index object self._mgr.set_axis(axis, labels) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 01e11ff4b008d..d4c0eb946505d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -32,6 +32,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_float64, @@ -436,6 +437,18 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: new_mgr = mgr.apply_2d(hfunc2d, ignore_failures=True) else: new_mgr = mgr.apply(hfunc, ignore_failures=True) + + if 0 != len(new_mgr.items) != len(mgr.items): + # GH#42738 ignore_failures dropped nuisance columns + dropped = mgr.items.difference(new_mgr.items) + warnings.warn( + "Dropping of nuisance columns in rolling operations " + "is deprecated; in a future version this will raise TypeError. " + "Select only valid columns before calling the operation. " + f"Dropped columns were {dropped}", + FutureWarning, + stacklevel=find_stack_level(), + ) out = obj._constructor(new_mgr) return self._resolve_output(out, obj) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index ac81fffcf353a..77431533e703a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -11,7 +11,6 @@ Iterable, Mapping, Sequence, - Sized, TypeVar, Union, ) @@ -505,44 +504,6 @@ def _justify( return head, tail # type: ignore[return-value] -def format_object_attrs( - obj: Sized, include_dtype: bool = True -) -> list[tuple[str, str | int]]: - """ - Return a list of tuples of the (attr, formatted_value) - for common attrs, including dtype, name, length - - Parameters - ---------- - obj : object - Must be sized. - include_dtype : bool - If False, dtype won't be in the returned list - - Returns - ------- - list of 2-tuple - - """ - attrs: list[tuple[str, str | int]] = [] - if hasattr(obj, "dtype") and include_dtype: - # error: "Sized" has no attribute "dtype" - attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] - if getattr(obj, "name", None) is not None: - # error: "Sized" has no attribute "name" - attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined] - # error: "Sized" has no attribute "names" - elif getattr(obj, "names", None) is not None and any( - obj.names # type: ignore[attr-defined] - ): - # error: "Sized" has no attribute "names" - attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined] - max_seq_items = get_option("display.max_seq_items") or len(obj) - if len(obj) > max_seq_items: - attrs.append(("length", len(obj))) - return attrs - - class PrettyDict(Dict[_KT, _VT]): """Dict extension to support abbreviated __repr__""" diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 1a891d76a376c..a72de753d6a8a 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1012,6 +1012,32 @@ def _update_ctx(self, attrs: DataFrame) -> None: i, j = self.index.get_loc(rn), self.columns.get_loc(cn) self.ctx[(i, j)].extend(css_list) + def _update_ctx_header(self, attrs: DataFrame, axis: str) -> None: + """ + Update the state of the ``Styler`` for header cells. 
+
+        Collects a mapping of {index_label: [('attribute', 'value'), ..]}.
+
+        Parameters
+        ----------
+        attrs : DataFrame
+            Should contain strings of 'attribute: value; attribute2: value2', and an
+            integer index.
+            Whitespace shouldn't matter and the final trailing ';' shouldn't
+            matter.
+        axis : str
+            Identifies whether the ctx object being updated is the index or columns.
+        """
+        for j in attrs.columns:
+            for i, c in attrs[[j]].itertuples():
+                if not c:
+                    continue
+                css_list = maybe_convert_css_to_tuples(c)
+                if axis == "index":
+                    self.ctx_index[(i, j)].extend(css_list)
+                else:
+                    self.ctx_columns[(j, i)].extend(css_list)
+
     def _copy(self, deepcopy: bool = False) -> Styler:
         """
         Copies a Styler, allowing for deepcopy or shallow copy
@@ -1051,6 +1077,8 @@ def _copy(self, deepcopy: bool = False) -> Styler:
             "hidden_rows",
             "hidden_columns",
             "ctx",
+            "ctx_index",
+            "ctx_columns",
             "cell_context",
             "_todo",
             "table_styles",
@@ -1172,6 +1200,8 @@ def apply(

         See Also
         --------
+        Styler.applymap_index: Apply a CSS-styling function to headers elementwise.
+        Styler.apply_index: Apply a CSS-styling function to headers level-wise.
         Styler.applymap: Apply a CSS-styling function elementwise.

         Notes
@@ -1215,6 +1245,149 @@ def apply(
         )
         return self

+    def _apply_index(
+        self,
+        func: Callable[..., Styler],
+        axis: int | str = 0,
+        level: Level | list[Level] | None = None,
+        method: str = "apply",
+        **kwargs,
+    ) -> Styler:
+        if axis in [0, "index"]:
+            obj, axis = self.index, "index"
+        elif axis in [1, "columns"]:
+            obj, axis = self.columns, "columns"
+        else:
+            raise ValueError(
+                f"`axis` must be one of 0, 1, 'index', 'columns', got {axis}"
+            )
+
+        levels_ = _refactor_levels(level, obj)
+        data = DataFrame(obj.to_list()).loc[:, levels_]
+
+        if method == "apply":
+            result = data.apply(func, axis=0, **kwargs)
+        elif method == "applymap":
+            result = data.applymap(func, **kwargs)
+
+        self._update_ctx_header(result, axis)
+        return self
+
+    @doc(
+        this="apply",
+        wise="level-wise",
+        alt="applymap",
+        altwise="elementwise",
+        func="take a Series and return a string array of the same length",
+        axis='{0, 1, "index", "columns"}',
+        input_note="the index as a Series, if an Index, or a level of a MultiIndex",
+        output_note="an identically sized array of CSS styles as strings",
+        var="s",
+        ret='np.where(s == "B", "background-color: yellow;", "")',
+        ret2='["background-color: yellow;" if "x" in v else "" for v in s]',
+    )
+    def apply_index(
+        self,
+        func: Callable[..., Styler],
+        axis: int | str = 0,
+        level: Level | list[Level] | None = None,
+        **kwargs,
+    ) -> Styler:
+        """
+        Apply a CSS-styling function to the index or column headers, {wise}.
+
+        Updates the HTML representation with the result.
+
+        .. versionadded:: 1.4.0
+
+        Parameters
+        ----------
+        func : function
+            ``func`` should {func}.
+        axis : {axis}
+            The headers over which to apply the function.
+        level : int, str, list, optional
+            If the index is a MultiIndex, the level(s) over which to apply the function.
+        **kwargs : dict
+            Pass along to ``func``.
+
+        Returns
+        -------
+        self : Styler
+
+        See Also
+        --------
+        Styler.{alt}_index: Apply a CSS-styling function to headers {altwise}.
+        Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise.
+        Styler.applymap: Apply a CSS-styling function elementwise.
+
+        Notes
+        -----
+        Each input to ``func`` will be {input_note}. The output of ``func`` should be
+        {output_note}, in the format 'attribute: value; attribute2: value2; ...'
+        or, if nothing is to be applied to that element, an empty string or ``None``.
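A short, concrete usage sketch of the `apply_index` API defined above (the frame and CSS are illustrative); on rendering, the matching header `<th>` cells receive ids and a CSS rule, analogous to data cells:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], index=["A", "B"], columns=["x", "y"])

# Level-wise over the row labels: func sees the index as a Series and
# returns one CSS string per label.
styler = df.style.apply_index(
    lambda s: np.where(s == "B", "background-color: yellow;", ""),
    axis="index",
)
html = styler.to_html()
```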
+ + Examples + -------- + Basic usage to conditionally highlight values in the index. + + >>> df = pd.DataFrame([[1,2], [3,4]], index=["A", "B"]) + >>> def color_b(s): + ... return {ret} + >>> df.style.{this}_index(color_b) # doctest: +SKIP + + .. figure:: ../../_static/style/appmaphead1.png + + Selectively applying to specific levels of MultiIndex columns. + + >>> midx = pd.MultiIndex.from_product([['ix', 'jy'], [0, 1], ['x3', 'z4']]) + >>> df = pd.DataFrame([np.arange(8)], columns=midx) + >>> def highlight_x({var}): + ... return {ret2} + >>> df.style.{this}_index(highlight_x, axis="columns", level=[0, 2]) + ... # doctest: +SKIP + + .. figure:: ../../_static/style/appmaphead2.png + """ + self._todo.append( + ( + lambda instance: getattr(instance, "_apply_index"), + (func, axis, level, "apply"), + kwargs, + ) + ) + return self + + @doc( + apply_index, + this="applymap", + wise="elementwise", + alt="apply", + altwise="level-wise", + func="take a scalar and return a string", + axis='{0, 1, "index", "columns"}', + input_note="an index value, if an Index, or a level value of a MultiIndex", + output_note="CSS styles as a string", + var="v", + ret='"background-color: yellow;" if v == "B" else None', + ret2='"background-color: yellow;" if "x" in v else None', + ) + def applymap_index( + self, + func: Callable[..., Styler], + axis: int | str = 0, + level: Level | list[Level] | None = None, + **kwargs, + ) -> Styler: + self._todo.append( + ( + lambda instance: getattr(instance, "_apply_index"), + (func, axis, level, "applymap"), + kwargs, + ) + ) + return self + def _applymap( self, func: Callable, subset: Subset | None = None, **kwargs ) -> Styler: @@ -1237,7 +1410,7 @@ def applymap( Parameters ---------- func : function - ``func`` should take a scalar and return a scalar. + ``func`` should take a scalar and return a string. subset : label, array-like, IndexSlice, optional A valid 2d input to `DataFrame.loc[]`, or, in the case of a 1d input or single key, to `DataFrame.loc[:, ]` where the columns are @@ -1251,6 +1424,8 @@ def applymap( See Also -------- + Styler.applymap_index: Apply a CSS-styling function to headers elementwise. + Styler.apply_index: Apply a CSS-styling function to headers level-wise. Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. 
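And the elementwise variant, `applymap_index`, on MultiIndex columns, restricted to selected levels via ``level`` (again an illustrative frame):

```python
import numpy as np
import pandas as pd

midx = pd.MultiIndex.from_product([["ix", "jy"], [0, 1], ["x3", "z4"]])
df = pd.DataFrame([np.arange(8)], columns=midx)

# Only levels 0 and 2 are passed to func; level 1 (integers) is untouched.
styler = df.style.applymap_index(
    lambda v: "background-color: yellow;" if "x" in v else "",
    axis="columns",
    level=[0, 2],
)
```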
Notes diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index e89d4519543c6..aa58b3abbd06c 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -102,6 +102,8 @@ def __init__( self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols self.hidden_columns: Sequence[int] = [] self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) + self.ctx_index: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) + self.ctx_columns: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) self.cell_context: DefaultDict[tuple[int, int], str] = defaultdict(str) self._todo: list[tuple[Callable, tuple, dict]] = [] self.tooltips: Tooltips | None = None @@ -152,6 +154,8 @@ def _compute(self): (application method, *args, **kwargs) """ self.ctx.clear() + self.ctx_index.clear() + self.ctx_columns.clear() r = self for func, args, kwargs in self._todo: r = func(self)(*args, **kwargs) @@ -201,6 +205,9 @@ def _translate(self, sparse_index: bool, sparse_cols: bool, blank: str = "  len(self.data.index), len(self.data.columns), max_elements ) + self.cellstyle_map_columns: DefaultDict[ + tuple[CSSPair, ...], list[str] + ] = defaultdict(list) head = self._translate_header( BLANK_CLASS, BLANK_VALUE, @@ -215,6 +222,9 @@ def _translate(self, sparse_index: bool, sparse_cols: bool, blank: str = "  self.cellstyle_map: DefaultDict[tuple[CSSPair, ...], list[str]] = defaultdict( list ) + self.cellstyle_map_index: DefaultDict[ + tuple[CSSPair, ...], list[str] + ] = defaultdict(list) body = self._translate_body( DATA_CLASS, ROW_HEADING_CLASS, @@ -226,11 +236,17 @@ def _translate(self, sparse_index: bool, sparse_cols: bool, blank: str = "  ) d.update({"body": body}) - cellstyle: list[dict[str, CSSList | list[str]]] = [ - {"props": list(props), "selectors": selectors} - for props, selectors in self.cellstyle_map.items() - ] - d.update({"cellstyle": cellstyle}) + ctx_maps = { + "cellstyle": "cellstyle_map", + "cellstyle_index": "cellstyle_map_index", + "cellstyle_columns": "cellstyle_map_columns", + } # add the cell_ids styles map to the render dictionary in right format + for k, attr in ctx_maps.items(): + map = [ + {"props": list(props), "selectors": selectors} + for props, selectors in getattr(self, attr).items() + ] + d.update({k: map}) table_attr = self.table_attributes use_mathjax = get_option("display.html.use_mathjax") @@ -323,8 +339,9 @@ def _translate_header( ] if clabels: - column_headers = [ - _element( + column_headers = [] + for c, value in enumerate(clabels[r]): + header_element = _element( "th", f"{col_heading_class} level{r} col{c}", value, @@ -335,8 +352,16 @@ def _translate_header( else "" ), ) - for c, value in enumerate(clabels[r]) - ] + + if self.cell_ids: + header_element["id"] = f"level{r}_col{c}" + if (r, c) in self.ctx_columns and self.ctx_columns[r, c]: + header_element["id"] = f"level{r}_col{c}" + self.cellstyle_map_columns[ + tuple(self.ctx_columns[r, c]) + ].append(f"level{r}_col{c}") + + column_headers.append(header_element) if len(self.data.columns) > max_cols: # add an extra column with `...` value to indicate trimming @@ -470,21 +495,30 @@ def _translate_body( body.append(index_headers + data) break - index_headers = [ - _element( + index_headers = [] + for c, value in enumerate(rlabels[r]): + header_element = _element( "th", f"{row_heading_class} level{c} row{r}", value, (_is_visible(r, c, idx_lengths) and not self.hide_index_[c]), - id=f"level{c}_row{r}", attributes=( 
f'rowspan="{idx_lengths.get((c, r), 0)}"' if idx_lengths.get((c, r), 0) > 1 else "" ), ) - for c, value in enumerate(rlabels[r]) - ] + + if self.cell_ids: + header_element["id"] = f"level{c}_row{r}" # id is specified + if (r, c) in self.ctx_index and self.ctx_index[r, c]: + # always add id if a style is specified + header_element["id"] = f"level{c}_row{r}" + self.cellstyle_map_index[tuple(self.ctx_index[r, c])].append( + f"level{c}_row{r}" + ) + + index_headers.append(header_element) data = [] for c, value in enumerate(row_tup[1:]): @@ -514,13 +548,12 @@ def _translate_body( display_value=self._display_funcs[(r, c)](value), ) - # only add an id if the cell has a style - if self.cell_ids or (r, c) in self.ctx: + if self.cell_ids: data_element["id"] = f"row{r}_col{c}" - if (r, c) in self.ctx and self.ctx[r, c]: # only add if non-empty - self.cellstyle_map[tuple(self.ctx[r, c])].append( - f"row{r}_col{c}" - ) + if (r, c) in self.ctx and self.ctx[r, c]: + # always add id if needed due to specified style + data_element["id"] = f"row{r}_col{c}" + self.cellstyle_map[tuple(self.ctx[r, c])].append(f"row{r}_col{c}") data.append(data_element) @@ -537,7 +570,14 @@ def _translate_latex(self, d: dict) -> None: - Remove hidden indexes or reinsert missing th elements if part of multiindex or multirow sparsification (so that \multirow and \multicol work correctly). """ - d["head"] = [[col for col in row if col["is_visible"]] for row in d["head"]] + d["head"] = [ + [ + {**col, "cellstyle": self.ctx_columns[r, c - self.index.nlevels]} + for c, col in enumerate(row) + if col["is_visible"] + ] + for r, row in enumerate(d["head"]) + ] body = [] for r, row in enumerate(d["body"]): if all(self.hide_index_): @@ -549,8 +589,9 @@ def _translate_latex(self, d: dict) -> None: "display_value": col["display_value"] if col["is_visible"] else "", + "cellstyle": self.ctx_index[r, c] if col["is_visible"] else [], } - for col in row + for c, col in enumerate(row) if col["type"] == "th" ] @@ -1345,26 +1386,21 @@ def _parse_latex_header_span( >>> _parse_latex_header_span(cell, 't', 'c') '\\multicolumn{3}{c}{text}' """ + display_val = _parse_latex_cell_styles(cell["cellstyle"], cell["display_value"]) if "attributes" in cell: attrs = cell["attributes"] if 'colspan="' in attrs: colspan = attrs[attrs.find('colspan="') + 9 :] # len('colspan="') = 9 colspan = int(colspan[: colspan.find('"')]) - return ( - f"\\multicolumn{{{colspan}}}{{{multicol_align}}}" - f"{{{cell['display_value']}}}" - ) + return f"\\multicolumn{{{colspan}}}{{{multicol_align}}}{{{display_val}}}" elif 'rowspan="' in attrs: rowspan = attrs[attrs.find('rowspan="') + 9 :] rowspan = int(rowspan[: rowspan.find('"')]) - return ( - f"\\multirow[{multirow_align}]{{{rowspan}}}{{*}}" - f"{{{cell['display_value']}}}" - ) + return f"\\multirow[{multirow_align}]{{{rowspan}}}{{*}}{{{display_val}}}" if wrap: - return f"{{{cell['display_value']}}}" + return f"{{{display_val}}}" else: - return cell["display_value"] + return display_val def _parse_latex_options_strip(value: str | int | float, arg: str) -> str: diff --git a/pandas/io/formats/templates/html_style.tpl b/pandas/io/formats/templates/html_style.tpl index b34893076bedd..5b0e7a2ed882b 100644 --- a/pandas/io/formats/templates/html_style.tpl +++ b/pandas/io/formats/templates/html_style.tpl @@ -12,13 +12,15 @@ {% endblock table_styles %} {% block before_cellstyle %}{% endblock before_cellstyle %} {% block cellstyle %} -{% for s in cellstyle %} +{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %} +{% for s in cs %} 
{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}{{selector}}{% endfor %} { {% for p,val in s.props %} {{p}}: {{val}}; {% endfor %} } {% endfor %} +{% endfor %} {% endblock cellstyle %} {% endblock style %} diff --git a/pandas/io/formats/templates/html_table.tpl b/pandas/io/formats/templates/html_table.tpl index 33153af6f0882..3e3a40b9fdaa6 100644 --- a/pandas/io/formats/templates/html_table.tpl +++ b/pandas/io/formats/templates/html_table.tpl @@ -27,7 +27,7 @@ {% else %} {% for c in r %} {% if c.is_visible != False %} - <{{c.type}} class="{{c.class}}" {{c.attributes}}>{{c.value}} + <{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.value}} {% endif %} {% endfor %} {% endif %} @@ -49,7 +49,7 @@ {% endif %}{% endfor %} {% else %} {% for c in r %}{% if c.is_visible != False %} - <{{c.type}} {% if c.id is defined -%} id="T_{{uuid}}{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}} + <{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}} {% endif %}{% endfor %} {% endif %} diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f0aeeb3e6c893..5671cce1b966d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,20 +309,16 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - parquet_kwargs = {} + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - # Technically works with 0.7.0, but was incorrect - # so lets just require 0.7.1 - if Version(self.api.__version__) >= Version("0.7.1"): - # Need to set even for use_nullable_dtypes = False, - # since our defaults differ - parquet_kwargs["pandas_nulls"] = use_nullable_dtypes - else: - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine for fastparquet versions less than 0.7.1" - ) + if Version(self.api.__version__) >= Version("0.7.0"): + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -478,7 +474,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame. + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional @@ -486,10 +483,6 @@ def read_parquet( .. versionadded:: 1.2.0 - .. versionchanged:: 1.3.2 - ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine - if ``fastparquet`` is version 0.7.1 or higher. - **kwargs Any additional kwargs are passed to the engine. 
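The practical effect of the fastparquet change above, sketched with a hypothetical file path (the pyarrow behaviour is as documented; the fastparquet call now raises unconditionally):

```python
import pandas as pd

df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64")})
df.to_parquet("data.parquet")  # hypothetical path

# pyarrow: integers with missing values come back as pd.NA-backed Int64
result = pd.read_parquet("data.parquet", engine="pyarrow", use_nullable_dtypes=True)
print(result["a"].dtype)  # Int64

# fastparquet: raises regardless of the installed fastparquet version
pd.read_parquet("data.parquet", engine="fastparquet", use_nullable_dtypes=True)
# ValueError: The 'use_nullable_dtypes' argument is not supported for the
# fastparquet engine
```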
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 95dc1d82cb286..7173a43d4c5e6 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -68,6 +68,7 @@ class TestPDApi(Base): "Index", "Int64Index", "MultiIndex", + "NumericIndex", "Period", "PeriodIndex", "RangeIndex", diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 6ca5f2f76861e..9124e3d546123 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -9,8 +9,8 @@ ) import pandas as pd +from pandas import NumericIndex import pandas._testing as tm -from pandas.core.api import NumericIndex from pandas.tests.base.common import allow_na_ops diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7a3f88d0d6c41..99d92a5bbf774 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -261,18 +261,7 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "frame", - [ - pytest.param( - True, - marks=pytest.mark.xfail( - reason="pd.concat call inside NDFrame.astype reverts the dtype" - ), - ), - False, - ], -) +@pytest.mark.parametrize("frame", [True, False]) def test_astype_dispatches(frame): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1f1991214aad0..775a5a38768e6 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -23,6 +23,7 @@ option_context, ) import pandas._testing as tm +from pandas.core.arrays.integer import coerce_to_array def _check_cast(df, v): @@ -726,3 +727,32 @@ def test_astype_categorical_to_string_missing(self): cat = df.astype("category") result = cat.astype(str) tm.assert_frame_equal(result, expected) + + +class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): + # GH 42501 + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) + return IntegerArrayNoCopy(values, mask) + + def copy(self): + assert False + + +class Int16DtypeNoCopy(pd.Int16Dtype): + # GH 42501 + + @classmethod + def construct_array_type(cls): + return IntegerArrayNoCopy + + +def test_frame_astype_no_copy(): + # GH 42501 + df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object) + result = df.astype({"a": Int16DtypeNoCopy()}, copy=False) + + assert result.a.dtype == pd.Int16Dtype() + assert np.shares_memory(df.b.values, result.b.values) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index fa658d87c3ca0..1e2ce38a2fefd 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -130,6 +130,10 @@ def test_drop(self): with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): simple.drop(["A", "C"], axis=1) + # GH 42881 + with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"): + simple.drop(["C", "D", "F"], axis=1) + # errors = 'ignore' tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) tm.assert_frame_equal( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5d00c5cb9740a..25529e65118c8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ 
-7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -86,40 +84,6 @@ def test_apply_trivial_fail(): tm.assert_frame_equal(result, expected) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used -def test_fast_apply(): - # make sure that fast apply is correctly called - # rather than raising any kind of error - # otherwise the python path will be callsed - # which slows things down - N = 1000 - labels = np.random.randint(0, 2000, size=N) - labels2 = np.random.randint(0, 3, size=N) - df = DataFrame( - { - "key": labels, - "key2": labels2, - "value1": np.random.randn(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), - } - ) - - def f(g): - return 1 - - g = df.groupby(["key", "key2"]) - - grouper = g.grouper - - splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) - group_keys = grouper._get_group_keys() - sdata = splitter.sorted_data - - values, mutated = splitter.fast_apply(f, sdata, group_keys) - - assert not mutated - - @pytest.mark.parametrize( "df, group_names", [ @@ -216,8 +180,6 @@ def test_group_apply_once_per_group2(capsys): assert result == expected -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used -@pytest.mark.xfail(reason="GH-34998") def test_apply_fast_slow_identical(): # GH 31613 @@ -237,16 +199,13 @@ def fast(group): tm.assert_frame_equal(fast_df, slow_df) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used @pytest.mark.parametrize( "func", [ lambda x: x, - pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")), + lambda x: x[:], lambda x: x.copy(deep=False), - pytest.param( - lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998") - ), + lambda x: x.copy(deep=True), ], ) def test_groupby_apply_identity_maybecopy_index_identical(func): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 2c4067c347a35..614b565ae1500 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -26,6 +26,7 @@ Int64Index, IntervalIndex, MultiIndex, + NumericIndex, PeriodIndex, RangeIndex, Series, @@ -34,7 +35,6 @@ ) from pandas import UInt64Index # noqa:F401 import pandas._testing as tm -from pandas.core.api import NumericIndex from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 2e948c5aa0211..960412df9e6e2 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -8,11 +8,11 @@ Float64Index, Index, Int64Index, + NumericIndex, Series, UInt64Index, ) import pandas._testing as tm -from pandas.core.api import NumericIndex from pandas.tests.indexes.common import NumericBase diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 8facaf279f2cf..33aa8bbb942d5 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -21,12 +21,12 @@ CategoricalIndex, DatetimeIndex, MultiIndex, + NumericIndex, PeriodIndex, RangeIndex, TimedeltaIndex, ) import pandas._testing as tm -from pandas.core.api import NumericIndex class TestCommon: diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 80ba0c53fb9c4..3e88dbafdb7f5 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -5,11 
+5,11 @@ DatetimeIndex, Float64Index, Index, + NumericIndex, PeriodIndex, TimedeltaIndex, ) import pandas._testing as tm -from pandas.core.api import NumericIndex from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 9983017652919..bcf3c4dbad3a8 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ b/pandas/tests/io/formats/style/test_html.py @@ -108,7 +108,7 @@ def test_w3_html_format(styler):   - A + A @@ -138,10 +138,7 @@ def test_rowspan_w3(): # GH 38533 df = DataFrame(data=[[1, 2]], index=[["l0", "l0"], ["l1a", "l1b"]]) styler = Styler(df, uuid="_", cell_ids=False) - assert ( - 'l0' in styler.render() - ) + assert 'l0' in styler.render() def test_styles(styler): @@ -165,7 +162,7 @@ def test_styles(styler):   - A + A @@ -400,3 +397,36 @@ def test_sparse_options(sparse_index, sparse_columns): assert (html1 == default_html) is (sparse_index and sparse_columns) html2 = styler.to_html(sparse_index=sparse_index, sparse_columns=sparse_columns) assert html1 == html2 + + +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("columns", [True, False]) +def test_applymap_header_cell_ids(styler, index, columns): + # GH 41893 + func = lambda v: "attr: val;" + styler.uuid, styler.cell_ids = "", False + if index: + styler.applymap_index(func, axis="index") + if columns: + styler.applymap_index(func, axis="columns") + + result = styler.to_html() + + # test no data cell ids + assert '2.610000' in result + assert '2.690000' in result + + # test index header ids where needed and css styles + assert ( + 'a' in result + ) is index + assert ( + 'b' in result + ) is index + assert ("#T_level0_row0, #T_level0_row1 {\n attr: val;\n}" in result) is index + + # test column header ids where needed and css styles + assert ( + 'A' in result + ) is columns + assert ("#T_level0_col0 {\n attr: val;\n}" in result) is columns diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 3c042e130981c..6cc4b889d369a 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -52,6 +52,8 @@ def mi_styler_comp(mi_styler): mi_styler.set_table_attributes('class="box"') mi_styler.format(na_rep="MISSING", precision=3) mi_styler.highlight_max(axis=None) + mi_styler.applymap_index(lambda x: "color: white;", axis=0) + mi_styler.applymap_index(lambda x: "color: black;", axis=1) mi_styler.set_td_classes( DataFrame( [["a", "b"], ["a", "c"]], index=mi_styler.index, columns=mi_styler.columns @@ -198,7 +200,14 @@ def test_copy(comprehensive, render, deepcopy, mi_styler, mi_styler_comp): if render: styler.to_html() - excl = ["na_rep", "precision", "uuid", "cellstyle_map"] # deprecated or special var + excl = [ + "na_rep", # deprecated + "precision", # deprecated + "uuid", # special + "cellstyle_map", # render time vars.. 
+ "cellstyle_map_columns", + "cellstyle_map_index", + ] if not deepcopy: # check memory locations are equal for all included attributes for attr in [a for a in styler.__dict__ if (not callable(a) and a not in excl)]: assert id(getattr(s2, attr)) == id(getattr(styler, attr)) @@ -245,6 +254,8 @@ def test_clear(mi_styler_comp): "uuid_len", "cell_ids", "cellstyle_map", # execution time only + "cellstyle_map_columns", # execution time only + "cellstyle_map_index", # execution time only "precision", # deprecated "na_rep", # deprecated ] @@ -296,6 +307,48 @@ def test_hide_columns_level(mi_styler, level, names): assert len(ctx["head"]) == (2 if names else 1) +@pytest.mark.parametrize("method", ["applymap", "apply"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +def test_apply_map_header(method, axis): + # GH 41893 + df = DataFrame({"A": [0, 0], "B": [1, 1]}, index=["C", "D"]) + func = { + "apply": lambda s: ["attr: val" if ("A" in v or "C" in v) else "" for v in s], + "applymap": lambda v: "attr: val" if ("A" in v or "C" in v) else "", + } + + # test execution added to todo + result = getattr(df.style, f"{method}_index")(func[method], axis=axis) + assert len(result._todo) == 1 + assert len(getattr(result, f"ctx_{axis}")) == 0 + + # test ctx object on compute + result._compute() + expected = { + (0, 0): [("attr", "val")], + } + assert getattr(result, f"ctx_{axis}") == expected + + +@pytest.mark.parametrize("method", ["apply", "applymap"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +def test_apply_map_header_mi(mi_styler, method, axis): + # GH 41893 + func = { + "apply": lambda s: ["attr: val;" if "b" in v else "" for v in s], + "applymap": lambda v: "attr: val" if "b" in v else "", + } + result = getattr(mi_styler, f"{method}_index")(func[method], axis=axis)._compute() + expected = {(1, 1): [("attr", "val")]} + assert getattr(result, f"ctx_{axis}") == expected + + +def test_apply_map_header_raises(mi_styler): + # GH 41893 + with pytest.raises(ValueError, match="`axis` must be one of 0, 1, 'index', 'col"): + mi_styler.applymap_index(lambda v: "attr: val;", axis="bad-axis")._compute() + + class TestStyler: def setup_method(self, method): np.random.seed(24) @@ -410,161 +463,58 @@ def test_empty_index_name_doesnt_display(self): # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.style._translate(True, True) - - expected = [ - [ - { - "class": "blank level0", - "type": "th", - "value": self.blank_value, - "is_visible": True, - "display_value": self.blank_value, - }, - { - "class": "col_heading level0 col0", - "display_value": "A", - "type": "th", - "value": "A", - "is_visible": True, - "attributes": "", - }, - { - "class": "col_heading level0 col1", - "display_value": "B", - "type": "th", - "value": "B", - "is_visible": True, - "attributes": "", - }, - { - "class": "col_heading level0 col2", - "display_value": "C", - "type": "th", - "value": "C", - "is_visible": True, - "attributes": "", - }, - ] - ] - - assert result["head"] == expected + assert len(result["head"]) == 1 + expected = { + "class": "blank level0", + "type": "th", + "value": self.blank_value, + "is_visible": True, + "display_value": self.blank_value, + } + assert expected.items() <= result["head"][0][0].items() def test_index_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - # TODO: this test can be minimised to address the test more directly df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 
6]}) result = df.set_index("A").style._translate(True, True) - - expected = [ - [ - { - "class": "blank level0", - "type": "th", - "value": self.blank_value, - "display_value": self.blank_value, - "is_visible": True, - }, - { - "class": "col_heading level0 col0", - "type": "th", - "value": "B", - "display_value": "B", - "is_visible": True, - "attributes": "", - }, - { - "class": "col_heading level0 col1", - "type": "th", - "value": "C", - "display_value": "C", - "is_visible": True, - "attributes": "", - }, - ], - [ - { - "class": "index_name level0", - "type": "th", - "value": "A", - "is_visible": True, - "display_value": "A", - }, - { - "class": "blank col0", - "type": "th", - "value": self.blank_value, - "is_visible": True, - "display_value": self.blank_value, - }, - { - "class": "blank col1", - "type": "th", - "value": self.blank_value, - "is_visible": True, - "display_value": self.blank_value, - }, - ], - ] - - assert result["head"] == expected + expected = { + "class": "index_name level0", + "type": "th", + "value": "A", + "is_visible": True, + "display_value": "A", + } + assert expected.items() <= result["head"][1][0].items() def test_multiindex_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - # TODO: this test can be minimised to address the test more directly df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.set_index(["A", "B"]).style._translate(True, True) expected = [ - [ - { - "class": "blank", - "type": "th", - "value": self.blank_value, - "display_value": self.blank_value, - "is_visible": True, - }, - { - "class": "blank level0", - "type": "th", - "value": self.blank_value, - "display_value": self.blank_value, - "is_visible": True, - }, - { - "class": "col_heading level0 col0", - "type": "th", - "value": "C", - "display_value": "C", - "is_visible": True, - "attributes": "", - }, - ], - [ - { - "class": "index_name level0", - "type": "th", - "value": "A", - "is_visible": True, - "display_value": "A", - }, - { - "class": "index_name level1", - "type": "th", - "value": "B", - "is_visible": True, - "display_value": "B", - }, - { - "class": "blank col0", - "type": "th", - "value": self.blank_value, - "is_visible": True, - "display_value": self.blank_value, - }, - ], + { + "class": "index_name level0", + "type": "th", + "value": "A", + "is_visible": True, + "display_value": "A", + }, + { + "class": "index_name level1", + "type": "th", + "value": "B", + "is_visible": True, + "display_value": "B", + }, + { + "class": "blank col0", + "type": "th", + "value": self.blank_value, + "is_visible": True, + "display_value": self.blank_value, + }, ] - - assert result["head"] == expected + assert result["head"][1] == expected def test_numeric_columns(self): # https://github.com/pandas-dev/pandas/issues/12125 @@ -1098,7 +1048,6 @@ def test_mi_sparse_index_names(self): assert head == expected def test_mi_sparse_column_names(self): - # TODO this test is verbose - could be minimised df = DataFrame( np.arange(16).reshape(4, 4), index=MultiIndex.from_arrays( @@ -1109,7 +1058,7 @@ def test_mi_sparse_column_names(self): [["C1", "C1", "C2", "C2"], [1, 0, 1, 0]], names=["col_0", "col_1"] ), ) - result = df.style._translate(True, True) + result = Styler(df, cell_ids=False)._translate(True, True) head = result["head"][1] expected = [ { @@ -1320,7 +1269,7 @@ def test_no_cell_ids(self): styler = Styler(df, uuid="_", cell_ids=False) styler.render() s = styler.render() # render twice to ensure ctx is not updated - assert s.find('') != -1 + assert 
s.find('') != -1 @pytest.mark.parametrize( "classes", @@ -1338,10 +1287,10 @@ def test_set_data_classes(self, classes): # GH 36159 df = DataFrame(data=[[0, 1], [2, 3]], columns=["A", "B"], index=["a", "b"]) s = Styler(df, uuid_len=0, cell_ids=False).set_td_classes(classes).render() - assert '0' in s - assert '1' in s - assert '2' in s - assert '3' in s + assert '0' in s + assert '1' in s + assert '2' in s + assert '3' in s # GH 39317 s = Styler(df, uuid_len=0, cell_ids=True).set_td_classes(classes).render() assert '0' in s diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 501d9b43ff106..ac164f2de9fb2 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -414,17 +414,20 @@ def test_parse_latex_cell_styles_braces(wrap_arg, expected): def test_parse_latex_header_span(): - cell = {"attributes": 'colspan="3"', "display_value": "text"} + cell = {"attributes": 'colspan="3"', "display_value": "text", "cellstyle": []} expected = "\\multicolumn{3}{Y}{text}" assert _parse_latex_header_span(cell, "X", "Y") == expected - cell = {"attributes": 'rowspan="5"', "display_value": "text"} + cell = {"attributes": 'rowspan="5"', "display_value": "text", "cellstyle": []} expected = "\\multirow[X]{5}{*}{text}" assert _parse_latex_header_span(cell, "X", "Y") == expected - cell = {"display_value": "text"} + cell = {"display_value": "text", "cellstyle": []} assert _parse_latex_header_span(cell, "X", "Y") == "text" + cell = {"display_value": "text", "cellstyle": [("bfseries", "--rwrap")]} + assert _parse_latex_header_span(cell, "X", "Y") == "\\bfseries{text}" + def test_parse_latex_table_wrapping(styler): styler.set_table_styles( @@ -635,3 +638,40 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): assert expected in styler.to_latex( environment="longtable", caption=caption, label=label ) + + +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("columns", [True, False]) +def test_apply_map_header_render_mi(df, index, columns): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df.loc[2, :] = [2, -2.22, "de"] + df = df.astype({"A": int}) + df.index, df.columns = ridx, cidx + styler = df.style + + func = lambda v: "bfseries: --rwrap" if "A" in v or "Z" in v or "c" in v else None + + if index: + styler.applymap_index(func, axis="index") + if columns: + styler.applymap_index(func, axis="columns") + + result = styler.to_latex() + + expected_index = dedent( + """\ + \\multirow[c]{2}{*}{\\bfseries{A}} & a & 0 & -0.610000 & ab \\\\ + & b & 1 & -1.220000 & cd \\\\ + B & \\bfseries{c} & 2 & -2.220000 & de \\\\ + """ + ) + assert (expected_index in result) is index + + expected_columns = dedent( + """\ + {} & {} & \\multicolumn{2}{r}{\\bfseries{Z}} & {Y} \\\\ + {} & {} & {a} & {b} & {\\bfseries{c}} \\\\ + """ + ) + assert (expected_columns in result) is columns diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d97ba8694818b..77a3394c08ef2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1769,3 +1769,18 @@ def test_to_json_multiindex_escape(self): "\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}" ) assert result == expected + + def test_to_json_series_of_objects(self): + class _TestObject: + def __init__(self, a, b, _c, d): + self.a = a + self.b = b + self._c = _c + self.d = 
d + + def e(self): + return 5 + + # JSON keys should be all non-callable non-underscore attributes, see GH-42768 + series = Series([_TestObject(a=1, b=2, _c=3, d=4)]) + assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}} diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 57a6b214cec84..58ccd31b7c940 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -720,6 +720,21 @@ def my_obj_handler(_): ujson.encode(obj_list, default_handler=str) ) + def test_encode_object(self): + class _TestObject: + def __init__(self, a, b, _c, d): + self.a = a + self.b = b + self._c = _c + self.d = d + + def e(self): + return 5 + + # JSON keys should be all non-callable non-underscore attributes, see GH-42768 + test_object = _TestObject(a=1, b=2, _c=3, d=4) + assert ujson.decode(ujson.encode(test_object)) == {"a": 1, "b": 2, "d": 4} + class TestNumpyJSONTests: @pytest.mark.parametrize("bool_input", [True, False]) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b1f7f15dfa99a..01715ee133e96 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -600,11 +600,9 @@ def test_use_nullable_dtypes(self, engine): import pyarrow.parquet as pq if engine == "fastparquet": - pytest.importorskip( - "fastparquet", - "0.7.1", - reason="fastparquet must be 0.7.1 or higher for nullable dtype support", - ) + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") table = pyarrow.table( { @@ -612,6 +610,8 @@ def test_use_nullable_dtypes(self, engine): "b": pyarrow.array([1, 2, 3, None], "uint8"), "c": pyarrow.array(["a", "b", "c", None]), "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1, 2, 3, 4], "int64"), } ) with tm.ensure_clean() as path: @@ -627,6 +627,7 @@ def test_use_nullable_dtypes(self, engine): "b": pd.array([1, 2, 3, None], dtype="UInt8"), "c": pd.array(["a", "b", "c", None], dtype="string"), "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1, 2, 3, 4], dtype="Int64"), } ) if engine == "fastparquet": @@ -1078,11 +1079,6 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) - # This is supported now in fastparquet 0.7.1 and above actually - # Still need to ensure that this raises in all versions below - import fastparquet as fp - - monkeypatch.setattr(fp, "__version__", "0.4") with tm.ensure_clean() as path: df.to_parquet(path) with pytest.raises(ValueError, match="not supported for the fastparquet"): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e924bcef494b9..99e12ef39c4ee 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -24,6 +24,7 @@ time, ) from io import StringIO +from pathlib import Path import sqlite3 import warnings @@ -72,34 +73,6 @@ SQLALCHEMY_INSTALLED = False SQL_STRINGS = { - "create_iris": { - "sqlite": """CREATE TABLE iris ( - "SepalLength" REAL, - "SepalWidth" REAL, - "PetalLength" REAL, - "PetalWidth" REAL, - "Name" TEXT - )""", - "mysql": """CREATE TABLE iris ( - `SepalLength` DOUBLE, - `SepalWidth` DOUBLE, - `PetalLength` DOUBLE, - `PetalWidth` DOUBLE, - `Name` VARCHAR(200) - )""", - "postgresql": """CREATE TABLE iris ( - "SepalLength" DOUBLE PRECISION, - "SepalWidth" 
DOUBLE PRECISION, - "PetalLength" DOUBLE PRECISION, - "PetalWidth" DOUBLE PRECISION, - "Name" VARCHAR(200) - )""", - }, - "insert_iris": { - "sqlite": """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", - "mysql": """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", - "postgresql": """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);""", - }, "create_test_types": { "sqlite": """CREATE TABLE types_test_data ( "TextCol" TEXT, @@ -192,7 +165,7 @@ }, "read_parameters": { "sqlite": "SELECT * FROM iris WHERE Name=? AND SepalLength=?", - "mysql": 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', + "mysql": "SELECT * FROM iris WHERE `Name`=%s AND `SepalLength`=%s", "postgresql": 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s', }, "read_named_parameters": { @@ -201,7 +174,7 @@ """, "mysql": """ SELECT * FROM iris WHERE - `Name`="%(name)s" AND `SepalLength`=%(length)s + `Name`=%(name)s AND `SepalLength`=%(length)s """, "postgresql": """ SELECT * FROM iris WHERE @@ -213,15 +186,141 @@ "mysql": "SELECT * FROM iris WHERE `Name` LIKE '%'", "postgresql": "SELECT * FROM iris WHERE \"Name\" LIKE '%'", }, - "create_view": { - "sqlite": """ - CREATE VIEW iris_view AS - SELECT * FROM iris - """ - }, } +def iris_table_metadata(dialect: str): + from sqlalchemy import ( + REAL, + Column, + Float, + MetaData, + String, + Table, + ) + + dtype = Float if dialect == "postgresql" else REAL + metadata = MetaData() + iris = Table( + "iris", + metadata, + Column("SepalLength", dtype), + Column("SepalWidth", dtype), + Column("PetalLength", dtype), + Column("PetalWidth", dtype), + Column("Name", String(200)), + ) + return iris + + +def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): + cur = conn.cursor() + stmt = """CREATE TABLE iris ( + "SepalLength" REAL, + "SepalWidth" REAL, + "PetalLength" REAL, + "PetalWidth" REAL, + "Name" TEXT + )""" + cur.execute(stmt) + with iris_file.open(newline=None) as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" + cur.executemany(stmt, reader) + + +def create_and_load_iris(conn, iris_file: Path, dialect: str): + from sqlalchemy import insert + from sqlalchemy.engine import Engine + + iris = iris_table_metadata(dialect) + iris.drop(conn, checkfirst=True) + iris.create(bind=conn) + + with iris_file.open(newline=None) as csvfile: + reader = csv.reader(csvfile) + header = next(reader) + params = [{key: value for key, value in zip(header, row)} for row in reader] + stmt = insert(iris).values(params) + if isinstance(conn, Engine): + with conn.connect() as conn: + conn.execute(stmt) + else: + conn.execute(stmt) + + +def create_and_load_iris_view(conn): + stmt = "CREATE VIEW iris_view AS SELECT * FROM iris" + if isinstance(conn, sqlite3.Connection): + cur = conn.cursor() + cur.execute(stmt) + else: + from sqlalchemy import text + from sqlalchemy.engine import Engine + + stmt = text(stmt) + if isinstance(conn, Engine): + with conn.connect() as conn: + conn.execute(stmt) + else: + conn.execute(stmt) + + +@pytest.fixture +def iris_path(datapath): + iris_path = datapath("io", "data", "csv", "iris.csv") + return Path(iris_path) + + +@pytest.fixture +def test_frame1(): + columns = ["index", "A", "B", "C", "D"] + data = [ + ( + "2000-01-03 00:00:00", + 0.980268513777, + 3.68573087906, + -0.364216805298, + -1.15973806169, + ), + ( + "2000-01-04 00:00:00", + 1.04791624281, + -0.0412318367011, + -0.16181208307, + 0.212549316967, + ), + ( + "2000-01-05 00:00:00", + 0.498580885705, + 0.731167677815, + 
-0.537677223318, + 1.34627041952, + ), + ( + "2000-01-06 00:00:00", + 1.12020151869, + 1.56762092543, + 0.00364077397681, + 0.67525259227, + ), + ] + return DataFrame(data, columns=columns) + + +@pytest.fixture +def test_frame3(): + columns = ["index", "A", "B"] + data = [ + ("2000-01-03 00:00:00", 2 ** 31 - 1, -1.987670), + ("2000-01-04 00:00:00", -29, -0.0412318367011), + ("2000-01-05 00:00:00", 20000, 0.731167677815), + ("2000-01-06 00:00:00", -290867, 1.56762092543), + ] + return DataFrame(data, columns=columns) + + class MixInBase: def teardown_method(self, method): # if setup fails, there may not be a connection to close. @@ -293,28 +392,15 @@ def _get_exec(self): else: return self.conn.cursor() - @pytest.fixture(params=[("io", "data", "csv", "iris.csv")]) - def load_iris_data(self, datapath, request): - - iris_csv_file = datapath(*request.param) - + @pytest.fixture + def load_iris_data(self, iris_path): if not hasattr(self, "conn"): self.setup_connect() - self.drop_table("iris") - self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor]) - - with open(iris_csv_file, newline=None) as iris_csv: - r = csv.reader(iris_csv) - next(r) # skip header row - ins = SQL_STRINGS["insert_iris"][self.flavor] - - for row in r: - self._get_exec().execute(ins, row) - - def _load_iris_view(self): - self.drop_table("iris_view") - self._get_exec().execute(SQL_STRINGS["create_view"][self.flavor]) + if isinstance(self.conn, sqlite3.Connection): + create_and_load_iris_sqlite3(self.conn, iris_path) + else: + create_and_load_iris(self.conn, iris_path, self.flavor) def _check_iris_loaded_frame(self, iris_frame): pytype = iris_frame.dtypes[0].type @@ -323,66 +409,6 @@ def _check_iris_loaded_frame(self, iris_frame): assert issubclass(pytype, np.floating) tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def _load_test1_data(self): - columns = ["index", "A", "B", "C", "D"] - data = [ - ( - "2000-01-03 00:00:00", - 0.980268513777, - 3.68573087906, - -0.364216805298, - -1.15973806169, - ), - ( - "2000-01-04 00:00:00", - 1.04791624281, - -0.0412318367011, - -0.16181208307, - 0.212549316967, - ), - ( - "2000-01-05 00:00:00", - 0.498580885705, - 0.731167677815, - -0.537677223318, - 1.34627041952, - ), - ( - "2000-01-06 00:00:00", - 1.12020151869, - 1.56762092543, - 0.00364077397681, - 0.67525259227, - ), - ] - - self.test_frame1 = DataFrame(data, columns=columns) - - def _load_test2_data(self): - df = DataFrame( - { - "A": [4, 1, 3, 6], - "B": ["asd", "gsq", "ylt", "jkl"], - "C": [1.1, 3.1, 6.9, 5.3], - "D": [False, True, True, False], - "E": ["1990-11-22", "1991-10-26", "1993-11-26", "1995-12-12"], - } - ) - df["E"] = to_datetime(df["E"]) - - self.test_frame2 = df - - def _load_test3_data(self): - columns = ["index", "A", "B"] - data = [ - ("2000-01-03 00:00:00", 2 ** 31 - 1, -1.987670), - ("2000-01-04 00:00:00", -29, -0.0412318367011), - ("2000-01-05 00:00:00", 20000, 0.731167677815), - ("2000-01-06 00:00:00", -290867, 1.56762092543), - ] - - self.test_frame3 = DataFrame(data, columns=columns) - def _load_types_test_data(self, data): def _filter_to_flavor(flavor, df): flavor_dtypes = { @@ -498,66 +524,66 @@ def _read_sql_iris_no_parameter_with_percent(self): iris_frame = self.pandasSQL.read_query(query, params=None) self._check_iris_loaded_frame(iris_frame) - def _to_sql(self, method=None): + def _to_sql(self, test_frame1, method=None): self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=method) + self.pandasSQL.to_sql(test_frame1, "test_frame1", 
method=method) assert self.pandasSQL.has_table("test_frame1") - num_entries = len(self.test_frame1) + num_entries = len(test_frame1) num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table self.drop_table("test_frame1") - def _to_sql_empty(self): + def _to_sql_empty(self, test_frame1): self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1.iloc[:0], "test_frame1") + self.pandasSQL.to_sql(test_frame1.iloc[:0], "test_frame1") - def _to_sql_fail(self): + def _to_sql_fail(self, test_frame1): self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="fail") assert self.pandasSQL.has_table("test_frame1") msg = "Table 'test_frame1' already exists" with pytest.raises(ValueError, match=msg): - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="fail") self.drop_table("test_frame1") - def _to_sql_replace(self): + def _to_sql_replace(self, test_frame1): self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="replace") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="replace") assert self.pandasSQL.has_table("test_frame1") - num_entries = len(self.test_frame1) + num_entries = len(test_frame1) num_rows = self._count_rows("test_frame1") assert num_rows == num_entries self.drop_table("test_frame1") - def _to_sql_append(self): + def _to_sql_append(self, test_frame1): # Nuke table just in case self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="append") + self.pandasSQL.to_sql(test_frame1, "test_frame1", if_exists="append") assert self.pandasSQL.has_table("test_frame1") - num_entries = 2 * len(self.test_frame1) + num_entries = 2 * len(test_frame1) num_rows = self._count_rows("test_frame1") assert num_rows == num_entries self.drop_table("test_frame1") - def _to_sql_method_callable(self): + def _to_sql_method_callable(self, test_frame1): check = [] # used to double check function below is really being used def sample(pd_table, conn, keys, data_iter): @@ -567,36 +593,36 @@ def sample(pd_table, conn, keys, data_iter): self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=sample) + self.pandasSQL.to_sql(test_frame1, "test_frame1", method=sample) assert self.pandasSQL.has_table("test_frame1") assert check == [1] - num_entries = len(self.test_frame1) + num_entries = len(test_frame1) num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table self.drop_table("test_frame1") - def _to_sql_with_sql_engine(self, engine="auto", **engine_kwargs): + def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): """`to_sql` with the `engine` param""" # mostly copied from this class's `_to_sql()` method self.drop_table("test_frame1") self.pandasSQL.to_sql( - self.test_frame1, "test_frame1", engine=engine, **engine_kwargs + test_frame1, "test_frame1", engine=engine, **engine_kwargs ) assert self.pandasSQL.has_table("test_frame1") - num_entries = len(self.test_frame1) + 
num_entries = len(test_frame1) num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table self.drop_table("test_frame1") - def _roundtrip(self): + def _roundtrip(self, test_frame1): self.drop_table("test_frame_roundtrip") - self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") + self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") result.set_index("level_0", inplace=True) @@ -604,7 +630,7 @@ def _roundtrip(self): result.index.name = None - tm.assert_frame_equal(result, self.test_frame1) + tm.assert_frame_equal(result, test_frame1) def _execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done @@ -678,10 +704,7 @@ def setup_method(self, load_iris_data): self.load_test_data_and_sql() def load_test_data_and_sql(self): - self._load_iris_view() - self._load_test1_data() - self._load_test2_data() - self._load_test3_data() + create_and_load_iris_view(self.conn) self._load_raw_sql() def test_read_sql_iris(self): @@ -698,46 +721,46 @@ def test_read_sql_with_chunksize_no_result(self): without_batch = sql.read_sql_query(query, self.conn) tm.assert_frame_equal(concat(with_batch), without_batch) - def test_to_sql(self): - sql.to_sql(self.test_frame1, "test_frame1", self.conn) + def test_to_sql(self, test_frame1): + sql.to_sql(test_frame1, "test_frame1", self.conn) assert sql.has_table("test_frame1", self.conn) - def test_to_sql_fail(self): - sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + def test_to_sql_fail(self, test_frame1): + sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") assert sql.has_table("test_frame2", self.conn) msg = "Table 'test_frame2' already exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - def test_to_sql_replace(self): - sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="fail") + def test_to_sql_replace(self, test_frame1): + sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="replace") + sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace") assert sql.has_table("test_frame3", self.conn) - num_entries = len(self.test_frame1) + num_entries = len(test_frame1) num_rows = self._count_rows("test_frame3") assert num_rows == num_entries - def test_to_sql_append(self): - sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="fail") + def test_to_sql_append(self, test_frame1): + sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="append") + sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") assert sql.has_table("test_frame4", self.conn) - num_entries = 2 * len(self.test_frame1) + num_entries = 2 * len(test_frame1) num_rows = self._count_rows("test_frame4") assert num_rows == num_entries - def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, "test_frame5", self.conn, index=False) + def test_to_sql_type_mapping(self, test_frame3): + sql.to_sql(test_frame3, "test_frame5", self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) - tm.assert_frame_equal(self.test_frame3, result) + tm.assert_frame_equal(test_frame3, result) def test_to_sql_series(self): s = 
Series(np.arange(5, dtype="int64"), name="series") @@ -745,27 +768,27 @@ def test_to_sql_series(self): s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) - def test_roundtrip(self): - sql.to_sql(self.test_frame1, "test_frame_roundtrip", con=self.conn) + def test_roundtrip(self, test_frame1): + sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn) result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) # HACK! - result.index = self.test_frame1.index + result.index = test_frame1.index result.set_index("level_0", inplace=True) result.index.astype(int) result.index.name = None - tm.assert_frame_equal(result, self.test_frame1) + tm.assert_frame_equal(result, test_frame1) - def test_roundtrip_chunksize(self): + def test_roundtrip_chunksize(self, test_frame1): sql.to_sql( - self.test_frame1, + test_frame1, "test_frame_roundtrip", con=self.conn, index=False, chunksize=2, ) result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - tm.assert_frame_equal(result, self.test_frame1) + tm.assert_frame_equal(result, test_frame1) def test_execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done @@ -999,15 +1022,13 @@ def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") - def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, "test", con=self.conn) + def test_get_schema(self, test_frame1): + create_sql = sql.get_schema(test_frame1, "test", con=self.conn) assert "CREATE" in create_sql - def test_get_schema_with_schema(self): + def test_get_schema_with_schema(self, test_frame1): # GH28486 - create_sql = sql.get_schema( - self.test_frame1, "test", con=self.conn, schema="pypi" - ) + create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi") assert "CREATE TABLE pypi." 
in create_sql def test_get_schema_dtypes(self): @@ -1019,16 +1040,14 @@ def test_get_schema_dtypes(self): assert "CREATE" in create_sql assert "INTEGER" in create_sql - def test_get_schema_keys(self): + def test_get_schema_keys(self, test_frame1): frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' assert constraint_sentence in create_sql # multiple columns as key (GH10385) - create_sql = sql.get_schema( - self.test_frame1, "test", con=self.conn, keys=["A", "B"] - ) + create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"]) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' assert constraint_sentence in create_sql @@ -1115,17 +1134,17 @@ class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): def connect(self): return sqlalchemy.create_engine("sqlite:///:memory:") - def test_read_table_columns(self): + def test_read_table_columns(self, test_frame1): # test columns argument in read_table - sql.to_sql(self.test_frame1, "test_frame", self.conn) + sql.to_sql(test_frame1, "test_frame", self.conn) cols = ["A", "B"] result = sql.read_sql_table("test_frame", self.conn, columns=cols) assert result.columns.tolist() == cols - def test_read_table_index_col(self): + def test_read_table_index_col(self, test_frame1): # test index_col argument in read_table - sql.to_sql(self.test_frame1, "test_frame", self.conn) + sql.to_sql(test_frame1, "test_frame", self.conn) result = sql.read_sql_table("test_frame", self.conn, index_col="index") assert result.index.names == ["index"] @@ -1164,7 +1183,7 @@ def test_not_reflect_all_tables(self): # Verify some things assert len(w) == 0 - def test_warning_case_insensitive_table_name(self): + def test_warning_case_insensitive_table_name(self, test_frame1): # see gh-7815 # # We can't test that this warning is triggered, as the database @@ -1174,7 +1193,7 @@ def test_warning_case_insensitive_table_name(self): # Cause all warnings to always be triggered.
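# ("always" also disables duplicate-warning suppression, so any warning # raised below is recorded in w and counted by the assertion that follows)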
warnings.simplefilter("always") # This should not trigger a Warning - self.test_frame1.to_sql("CaseSensitive", self.conn) + test_frame1.to_sql("CaseSensitive", self.conn) # Verify some things assert len(w) == 0 @@ -1236,10 +1255,8 @@ def test_sqlalchemy_integer_overload_mapping(self, integer): ): sql.SQLTable("test_type", db, frame=df) - def test_database_uri_string(self): - + def test_database_uri_string(self, test_frame1): # Test read_sql and .to_sql methods with a database URI (GH10654) - test_frame1 = self.test_frame1 # db_uri = 'sqlite:///:memory:' # raises # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near # "iris": syntax error [SQL: 'iris'] @@ -1269,21 +1286,6 @@ def test_database_uri_string(self): with pytest.raises(ImportError, match="pg8000"): sql.read_sql("select * from table", db_uri) - def _make_iris_table_metadata(self): - sa = sqlalchemy - metadata = sa.MetaData() - iris = sa.Table( - "iris", - metadata, - sa.Column("SepalLength", sa.REAL), - sa.Column("SepalWidth", sa.REAL), - sa.Column("PetalLength", sa.REAL), - sa.Column("PetalWidth", sa.REAL), - sa.Column("Name", sa.TEXT), - ) - - return iris - def test_query_by_text_obj(self): # WIP : GH10846 name_text = sqlalchemy.text("select * from iris where name=:name") @@ -1293,7 +1295,7 @@ def test_query_by_select_obj(self): # WIP : GH10846 - iris = self._make_iris_table_metadata() + iris = iris_table_metadata(self.flavor) name_select = sqlalchemy.select([iris]).where( iris.c.Name == sqlalchemy.bindparam("name") @@ -1353,21 +1355,21 @@ class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): def connect(self, database=":memory:"): return sqlite3.connect(database) - def test_sql_open_close(self): + def test_sql_open_close(self, test_frame3): # Test if the IO in the database still works if the connection is closed # between the writing and reading (as in many real situations).
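# tm.ensure_clean yields a temporary file path and deletes the file on exit, # so the on-disk SQLite database does not outlive the test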
with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, index=False) + sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) conn.close() conn = self.connect(name) result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) conn.close() - tm.assert_frame_equal(self.test_frame3, result) + tm.assert_frame_equal(test_frame3, result) @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") def test_con_string_import_error(self): @@ -1391,9 +1393,9 @@ def test_safe_names_warning(self): with tm.assert_produces_warning(): sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) - def test_get_schema2(self): + def test_get_schema2(self, test_frame1): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, "test") + create_sql = sql.get_schema(test_frame1, "test") assert "CREATE" in create_sql def _get_sqlite_column_type(self, schema, column): @@ -1439,7 +1441,6 @@ def setup_class(cls): def load_test_data_and_sql(self): self._load_raw_sql() - self._load_test1_data() @pytest.fixture(autouse=True) def setup_method(self, load_iris_data): @@ -1477,26 +1478,26 @@ def test_read_sql_parameter(self): def test_read_sql_named_parameter(self): self._read_sql_iris_named_parameter() - def test_to_sql(self): - self._to_sql() + def test_to_sql(self, test_frame1): + self._to_sql(test_frame1) - def test_to_sql_empty(self): - self._to_sql_empty() + def test_to_sql_empty(self, test_frame1): + self._to_sql_empty(test_frame1) - def test_to_sql_fail(self): - self._to_sql_fail() + def test_to_sql_fail(self, test_frame1): + self._to_sql_fail(test_frame1) - def test_to_sql_replace(self): - self._to_sql_replace() + def test_to_sql_replace(self, test_frame1): + self._to_sql_replace(test_frame1) - def test_to_sql_append(self): - self._to_sql_append() + def test_to_sql_append(self, test_frame1): + self._to_sql_append(test_frame1) - def test_to_sql_method_multi(self): - self._to_sql(method="multi") + def test_to_sql_method_multi(self, test_frame1): + self._to_sql(test_frame1, method="multi") - def test_to_sql_method_callable(self): - self._to_sql_method_callable() + def test_to_sql_method_callable(self, test_frame1): + self._to_sql_method_callable(test_frame1) def test_create_table(self): temp_conn = self.connect() @@ -1536,8 +1537,8 @@ def test_drop_table(self): else: assert not temp_conn.has_table("temp_frame") - def test_roundtrip(self): - self._roundtrip() + def test_roundtrip(self, test_frame1): + self._roundtrip(test_frame1) def test_execute_sql(self): self._execute_sql() @@ -1888,15 +1889,14 @@ def test_to_sql_save_index(self): def test_transactions(self): self._transaction_test() - def test_get_schema_create_table(self): + def test_get_schema_create_table(self, test_frame3): # Use a dataframe without a bool column, since MySQL converts bool to # TINYINT (which read_sql_table returns as an int and causes a dtype # mismatch) - self._load_test3_data() tbl = "test_get_schema_create_table" - create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn) - blank_test_df = self.test_frame3.iloc[:0] + create_sql = sql.get_schema(test_frame3, tbl, con=self.conn) + blank_test_df = test_frame3.iloc[:0] self.drop_table(tbl) self.conn.execute(create_sql) @@ -2072,22 +2072,20 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) # -- SQL Engine tests (in the base class for now) - def test_invalid_engine(self): + def test_invalid_engine(self, test_frame1): 
msg = "engine must be one of 'auto', 'sqlalchemy'" with pytest.raises(ValueError, match=msg): - self._to_sql_with_sql_engine("bad_engine") + self._to_sql_with_sql_engine(test_frame1, "bad_engine") - def test_options_sqlalchemy(self): + def test_options_sqlalchemy(self, test_frame1): # use the set option - with pd.option_context("io.sql.engine", "sqlalchemy"): - self._to_sql_with_sql_engine() + self._to_sql_with_sql_engine(test_frame1) - def test_options_auto(self): + def test_options_auto(self, test_frame1): # use the set option - with pd.option_context("io.sql.engine", "auto"): - self._to_sql_with_sql_engine() + self._to_sql_with_sql_engine(test_frame1) def test_options_get_engine(self): assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) @@ -2405,13 +2403,9 @@ def connect(cls): def setup_connect(self): self.conn = self.connect() - def load_test_data_and_sql(self): - self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_test1_data() - @pytest.fixture(autouse=True) def setup_method(self, load_iris_data): - self.load_test_data_and_sql() + self.pandasSQL = sql.SQLiteDatabase(self.conn) def test_read_sql(self): self._read_sql_iris() @@ -2422,24 +2416,24 @@ def test_read_sql_parameter(self): def test_read_sql_named_parameter(self): self._read_sql_iris_named_parameter() - def test_to_sql(self): - self._to_sql() + def test_to_sql(self, test_frame1): + self._to_sql(test_frame1) - def test_to_sql_empty(self): - self._to_sql_empty() + def test_to_sql_empty(self, test_frame1): + self._to_sql_empty(test_frame1) - def test_to_sql_fail(self): - self._to_sql_fail() + def test_to_sql_fail(self, test_frame1): + self._to_sql_fail(test_frame1) - def test_to_sql_replace(self): - self._to_sql_replace() + def test_to_sql_replace(self, test_frame1): + self._to_sql_replace(test_frame1) - def test_to_sql_append(self): - self._to_sql_append() + def test_to_sql_append(self, test_frame1): + self._to_sql_append(test_frame1) - def test_to_sql_method_multi(self): + def test_to_sql_method_multi(self, test_frame1): # GH 29921 - self._to_sql(method="multi") + self._to_sql(test_frame1, method="multi") def test_create_and_drop_table(self): temp_frame = DataFrame( @@ -2454,8 +2448,8 @@ def test_create_and_drop_table(self): assert not self.pandasSQL.has_table("drop_test_frame") - def test_roundtrip(self): - self._roundtrip() + def test_roundtrip(self, test_frame1): + self._roundtrip(test_frame1) def test_execute_sql(self): self._execute_sql() diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 3af06145b9fcd..0efb0663a0327 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -211,3 +211,19 @@ def test_nlargest_boolean(self, data, expected): result = ser.nlargest(1) expected = Series(expected) tm.assert_series_equal(result, expected) + + def test_nlargest_nullable(self, any_nullable_numeric_dtype): + # GH#42816 + dtype = any_nullable_numeric_dtype + arr = np.random.randn(10).astype(dtype.lower(), copy=False) + + ser = Series(arr.copy(), dtype=dtype) + ser[1] = pd.NA + result = ser.nlargest(5) + + expected = ( + Series(np.delete(arr, 1), index=ser.index.delete(1)) + .nlargest(5) + .astype(dtype) + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 461c81bc3b44f..84bfe8524634b 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -217,3 
+217,9 @@ def test_quantile_empty(self): res = s.quantile([0.5]) exp = Series([pd.NaT], index=[0.5]) tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize("dtype", [int, float, "Int64"]) + def test_quantile_dtypes(self, dtype): + result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) + expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tseries/offsets/test_business_hour.py b/pandas/tests/tseries/offsets/test_business_hour.py index 72b939b79c321..ee05eab5ec5ca 100644 --- a/pandas/tests/tseries/offsets/test_business_hour.py +++ b/pandas/tests/tseries/offsets/test_business_hour.py @@ -920,3 +920,451 @@ def test_bday_ignores_timedeltas(self): freq=None, ) tm.assert_index_equal(t1, expected) + + +class TestOpeningTimes: + # opening time should be affected by sign of n, not by n's value and end + opening_time_cases = [ + ( + [ + BusinessHour(), + BusinessHour(n=2), + BusinessHour(n=4), + BusinessHour(end="10:00"), + BusinessHour(n=2, end="4:00"), + BusinessHour(n=4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9), + ), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9), + ), + }, + ), + ( + [ + BusinessHour(start="11:15"), + BusinessHour(n=2, start="11:15"), + BusinessHour(n=3, start="11:15"), + BusinessHour(start="11:15", end="10:00"), + BusinessHour(n=2, start="11:15", end="4:00"), + BusinessHour(n=3, start="11:15", end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15), + ), + 
datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + }, + ), + ( + [ + BusinessHour(-1), + BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end="10:00"), + BusinessHour(n=-2, end="4:00"), + BusinessHour(n=-4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9), + ), + }, + ), + ( + [ + BusinessHour(start="17:00", end="05:00"), + BusinessHour(n=3, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 4, 17): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 17, 1): ( + datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17), + ), + }, + ), + ( + [ + BusinessHour(-1, start="17:00", end="05:00"), + BusinessHour(n=-2, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 
7, 2, 16, 59): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17), + ), + }, + ), + ( + [ + BusinessHour(start=["11:15", "15:00"], end=["13:00", "20:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["12:00", "20:00"]), + BusinessHour(start=["11:15", "15:00"], end=["13:00", "17:00"]), + BusinessHour(n=2, start=["11:15", "15:00"], end=["12:00", "03:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["13:00", "16:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 12): ( + datetime(2014, 7, 7, 15), + datetime(2014, 7, 7, 11, 15), + ), + }, + ), + ( + [ + BusinessHour(n=-1, start=["17:00", "08:00"], end=["05:00", "10:00"]), + BusinessHour(n=-2, start=["08:00", "17:00"], end=["10:00", "03:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 8), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 8), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 
7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 8), + ), + }, + ), + ] + + @pytest.mark.parametrize("case", opening_time_cases) + def test_opening_time(self, case): + _offsets, cases = case + for offset in _offsets: + for dt, (exp_next, exp_prev) in cases.items(): + assert offset._next_opening_time(dt) == exp_next + assert offset._prev_opening_time(dt) == exp_prev diff --git a/pandas/tests/tseries/offsets/test_business_month.py b/pandas/tests/tseries/offsets/test_business_month.py index 3f537fc450764..bb2049fd35489 100644 --- a/pandas/tests/tseries/offsets/test_business_month.py +++ b/pandas/tests/tseries/offsets/test_business_month.py @@ -7,6 +7,7 @@ import pytest +import pandas as pd from pandas.tests.tseries.offsets.common import ( Base, assert_is_on_offset, @@ -19,6 +20,29 @@ ) +@pytest.mark.parametrize("n", [-2, 1]) +@pytest.mark.parametrize( + "cls", + [ + BMonthBegin, + BMonthEnd, + ], +) +def test_apply_index(cls, n): + offset = cls(n=n) + rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") + ser = pd.Series(rng) + + res = rng + offset + assert res.freq is None # not retained + assert res[0] == rng[0] + offset + assert res[-1] == rng[-1] + offset + res2 = ser + offset + # apply_index is only for indexes, not series, so no res2_v2 + assert res2.iloc[0] == ser.iloc[0] + offset + assert res2.iloc[-1] == ser.iloc[-1] + offset + + class TestBMonthBegin(Base): _offset = BMonthBegin diff --git a/pandas/tests/tseries/offsets/test_easter.py b/pandas/tests/tseries/offsets/test_easter.py new file mode 100644 index 0000000000000..90ee7c7f69d5e --- /dev/null +++ b/pandas/tests/tseries/offsets/test_easter.py @@ -0,0 +1,36 @@ +""" +Tests for the following offsets: +- Easter +""" +from __future__ import annotations + +from datetime import datetime + +import pytest + +from pandas.tests.tseries.offsets.common import ( + Base, + assert_offset_equal, +) + +from pandas.tseries.offsets import Easter + + +class TestEaster(Base): + @pytest.mark.parametrize( + "offset,date,expected", + [ + (Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)), + (Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)), + (Easter(2), datetime(2010, 1, 1), datetime(2011, 4, 24)), + (Easter(), datetime(2010, 4, 4), datetime(2011, 4, 24)), + (Easter(2), datetime(2010, 4, 4), datetime(2012, 4, 8)), + (-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)), + (-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)), + (-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)), + (-Easter(), datetime(2010, 4, 4), datetime(2009, 4, 12)), + (-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)), + ], + ) + def test_offset(self, offset, date, expected): + assert_offset_equal(offset, date, expected) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_index.py similarity index 84% rename from pandas/tests/tseries/offsets/test_yqm_offsets.py rename to pandas/tests/tseries/offsets/test_index.py index 39b88074a6883..ad3478b319898 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_index.py @@ -1,9 +1,12 @@ """ -Tests for Year, Quarter, and Month-based DateOffset subclasses +Tests for offset behavior with indices. 
""" import pytest -import pandas as pd +from pandas import ( + Series, + date_range, +) from pandas.tseries.offsets import ( BMonthBegin, @@ -41,8 +44,8 @@ ) def test_apply_index(cls, n): offset = cls(n=n) - rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") - ser = pd.Series(rng) + rng = date_range(start="1/1/2000", periods=100000, freq="T") + ser = Series(rng) res = rng + offset assert res.freq is None # not retained diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 08dbc1345b9d4..f807a1fe729b1 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -36,7 +36,6 @@ from pandas.tests.tseries.offsets.common import ( Base, WeekDay, - assert_offset_equal, ) import pandas.tseries.offsets as offsets @@ -51,7 +50,6 @@ CustomBusinessMonthBegin, CustomBusinessMonthEnd, DateOffset, - Day, Easter, FY5253Quarter, LastWeekOfMonth, @@ -565,22 +563,6 @@ def test_eq(self): assert offset1 != offset2 -def test_Easter(): - assert_offset_equal(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)) - assert_offset_equal(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)) - assert_offset_equal(Easter(2), datetime(2010, 1, 1), datetime(2011, 4, 24)) - - assert_offset_equal(Easter(), datetime(2010, 4, 4), datetime(2011, 4, 24)) - assert_offset_equal(Easter(2), datetime(2010, 4, 4), datetime(2012, 4, 8)) - - assert_offset_equal(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)) - assert_offset_equal(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)) - assert_offset_equal(-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)) - - assert_offset_equal(-Easter(), datetime(2010, 4, 4), datetime(2009, 4, 12)) - assert_offset_equal(-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)) - - class TestOffsetNames: def test_get_offset_name(self): assert BDay().freqstr == "B" @@ -798,65 +780,6 @@ def test_tick_normalize_raises(tick_classes): cls(n=3, normalize=True) -def test_weeks_onoffset(): - # GH#18510 Week with weekday = None, normalize = False should always - # be is_on_offset - offset = Week(n=2, weekday=None) - ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") - fast = offset.is_on_offset(ts) - slow = (ts + offset) - offset == ts - assert fast == slow - - # negative n - offset = Week(n=2, weekday=None) - ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") - fast = offset.is_on_offset(ts) - slow = (ts + offset) - offset == ts - assert fast == slow - - -def test_weekofmonth_onoffset(): - # GH#18864 - # Make sure that nanoseconds don't trip up is_on_offset (and with it apply) - offset = WeekOfMonth(n=2, week=2, weekday=0) - ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") - fast = offset.is_on_offset(ts) - slow = (ts + offset) - offset == ts - assert fast == slow - - # negative n - offset = WeekOfMonth(n=-3, week=1, weekday=0) - ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") - fast = offset.is_on_offset(ts) - slow = (ts + offset) - offset == ts - assert fast == slow - - -def test_last_week_of_month_on_offset(): - # GH#19036, GH#18977 _adjust_dst was incorrect for LastWeekOfMonth - offset = LastWeekOfMonth(n=4, weekday=6) - ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") - slow = (ts + offset) - offset == ts - fast = offset.is_on_offset(ts) - assert fast == slow - - # negative n - offset = LastWeekOfMonth(n=-4, weekday=5) - ts = Timestamp("2005-08-27 
05:01:42.799392561-0500", tz="America/Rainy_River") - slow = (ts + offset) - offset == ts - fast = offset.is_on_offset(ts) - assert fast == slow - - -def test_week_add_invalid(): - # Week with weekday should raise TypeError and _not_ AttributeError - # when adding invalid offset - offset = Week(weekday=1) - other = Day() - with pytest.raises(TypeError, match="Cannot add"): - offset + other - - @pytest.mark.parametrize( "attribute", [ diff --git a/pandas/tests/tseries/offsets/test_opening_times.py b/pandas/tests/tseries/offsets/test_opening_times.py deleted file mode 100644 index 107436e4b3343..0000000000000 --- a/pandas/tests/tseries/offsets/test_opening_times.py +++ /dev/null @@ -1,456 +0,0 @@ -""" -Test offset.BusinessHour._next_opening_time and offset.BusinessHour._prev_opening_time -""" -from datetime import datetime - -import pytest - -from pandas._libs.tslibs.offsets import BusinessHour - - -class TestOpeningTimes: - # opening time should be affected by sign of n, not by n's value and end - opening_time_cases = [ - ( - [ - BusinessHour(), - BusinessHour(n=2), - BusinessHour(n=4), - BusinessHour(end="10:00"), - BusinessHour(n=2, end="4:00"), - BusinessHour(n=4, end="15:00"), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9), - ), - # if timestamp is on opening time, next opening time is - # as it is - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 2, 10): ( - datetime(2014, 7, 3, 9), - datetime(2014, 7, 2, 9), - ), - # 2014-07-05 is saturday - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9), - ), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 8, 9), - datetime(2014, 7, 7, 9), - ), - }, - ), - ( - [ - BusinessHour(start="11:15"), - BusinessHour(n=2, start="11:15"), - BusinessHour(n=3, start="11:15"), - BusinessHour(start="11:15", end="10:00"), - BusinessHour(n=2, start="11:15", end="4:00"), - BusinessHour(n=3, start="11:15", end="15:00"), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 11, 15), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15), - ), - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15), - ), - datetime(2014, 7, 2, 10): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15), - ), - datetime(2014, 7, 2, 11, 15): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15), - ), - datetime(2014, 7, 2, 11, 15, 1): ( - datetime(2014, 7, 3, 11, 15), - datetime(2014, 7, 2, 11, 15), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15), - ), - 
datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 11, 15), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15), - ), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15), - ), - }, - ), - ( - [ - BusinessHour(-1), - BusinessHour(n=-2), - BusinessHour(n=-4), - BusinessHour(n=-1, end="10:00"), - BusinessHour(n=-2, end="4:00"), - BusinessHour(n=-4, end="15:00"), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9), - ), - datetime(2014, 7, 2, 10): ( - datetime(2014, 7, 2, 9), - datetime(2014, 7, 3, 9), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 7, 9): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 7, 9), - ), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 7, 9), - datetime(2014, 7, 8, 9), - ), - }, - ), - ( - [ - BusinessHour(start="17:00", end="05:00"), - BusinessHour(n=3, start="17:00", end="03:00"), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 1, 17), - datetime(2014, 6, 30, 17), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 4, 17): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 3, 17), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 7, 17, 1): ( - datetime(2014, 7, 8, 17), - datetime(2014, 7, 7, 17), - ), - }, - ), - ( - [ - BusinessHour(-1, start="17:00", end="05:00"), - BusinessHour(n=-2, start="17:00", end="03:00"), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17), - ), - 
datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 2, 16, 59): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 3, 17), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17), - ), - datetime(2014, 7, 7, 18): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 17), - ), - }, - ), - ( - [ - BusinessHour(start=["11:15", "15:00"], end=["13:00", "20:00"]), - BusinessHour(n=3, start=["11:15", "15:00"], end=["12:00", "20:00"]), - BusinessHour(start=["11:15", "15:00"], end=["13:00", "17:00"]), - BusinessHour(n=2, start=["11:15", "15:00"], end=["12:00", "03:00"]), - BusinessHour(n=3, start=["11:15", "15:00"], end=["13:00", "16:00"]), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 15), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15), - ), - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15), - ), - datetime(2014, 7, 2, 10): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15), - ), - datetime(2014, 7, 2, 11, 15): ( - datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15), - ), - datetime(2014, 7, 2, 11, 15, 1): ( - datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 11, 15), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 15), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15), - ), - datetime(2014, 7, 6, 10): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15), - ), - datetime(2014, 7, 7, 9, 1): ( - datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15), - ), - datetime(2014, 7, 7, 12): ( - datetime(2014, 7, 7, 15), - datetime(2014, 7, 7, 11, 15), - ), - }, - ), - ( - [ - BusinessHour(n=-1, start=["17:00", "08:00"], end=["05:00", "10:00"]), - BusinessHour(n=-2, start=["08:00", "17:00"], end=["10:00", "03:00"]), - ], - { - datetime(2014, 7, 1, 11): ( - datetime(2014, 7, 1, 8), - datetime(2014, 7, 1, 17), - ), - datetime(2014, 7, 1, 18): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8), - ), - datetime(2014, 7, 1, 23): ( - datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8), - ), - datetime(2014, 7, 2, 8): ( - datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 8), - ), - datetime(2014, 7, 2, 9): ( - datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 2, 16, 59): ( - datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17), - ), - datetime(2014, 7, 5, 10): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8), - ), - datetime(2014, 7, 4, 10): ( - datetime(2014, 7, 4, 8), - datetime(2014, 7, 4, 17), - ), - datetime(2014, 7, 4, 23): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8), - ), - datetime(2014, 7, 6, 10): ( - 
datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8), - ), - datetime(2014, 7, 7, 5): ( - datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8), - ), - datetime(2014, 7, 7, 18): ( - datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 8), - ), - }, - ), - ] - - @pytest.mark.parametrize("case", opening_time_cases) - def test_opening_time(self, case): - _offsets, cases = case - for offset in _offsets: - for dt, (exp_next, exp_prev) in cases.items(): - assert offset._next_opening_time(dt) == exp_next - assert offset._prev_opening_time(dt) == exp_prev diff --git a/pandas/tests/tseries/offsets/test_week.py b/pandas/tests/tseries/offsets/test_week.py index b46a36e00f2da..be574fd963eff 100644 --- a/pandas/tests/tseries/offsets/test_week.py +++ b/pandas/tests/tseries/offsets/test_week.py @@ -1,5 +1,8 @@ """ -Tests for offset.Week, offset.WeekofMonth and offset.LastWeekofMonth +Tests for the following offsets: +- Week +- WeekOfMonth +- LastWeekOfMonth """ from datetime import ( datetime, @@ -10,6 +13,7 @@ from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( + Day, LastWeekOfMonth, Week, WeekOfMonth, @@ -121,6 +125,30 @@ def test_is_on_offset(self, weekday): expected = False assert_is_on_offset(offset, date, expected) + @pytest.mark.parametrize( + "n,date", + [ + (2, "1862-01-13 09:03:34.873477378+0210"), + (-2, "1856-10-24 16:18:36.556360110-0717"), + ], + ) + def test_is_on_offset_weekday_none(self, n, date): + # GH 18510 Week with weekday = None, normalize = False + # should always be is_on_offset + offset = Week(n=n, weekday=None) + ts = Timestamp(date, tz="Africa/Lusaka") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + def test_week_add_invalid(self): + # Week with weekday should raise TypeError and _not_ AttributeError + # when adding invalid offset + offset = Week(weekday=1) + other = Day() + with pytest.raises(TypeError, match="Cannot add"): + offset + other + class TestWeekOfMonth(Base): _offset = WeekOfMonth @@ -221,6 +249,22 @@ def test_is_on_offset(self, case): offset = WeekOfMonth(week=week, weekday=weekday) assert offset.is_on_offset(dt) == expected + @pytest.mark.parametrize( + "n,week,date,tz", + [ + (2, 2, "1916-05-15 01:14:49.583410462+0422", "Asia/Qyzylorda"), + (-3, 1, "1980-12-08 03:38:52.878321185+0500", "Asia/Oral"), + ], + ) + def test_is_on_offset_nanoseconds(self, n, week, date, tz): + # GH 18864 + # Make sure that nanoseconds don't trip up is_on_offset (and with it apply) + offset = WeekOfMonth(n=n, week=week, weekday=0) + ts = Timestamp(date, tz=tz) + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + class TestLastWeekOfMonth(Base): _offset = LastWeekOfMonth @@ -298,6 +342,21 @@ def test_is_on_offset(self, case): offset = LastWeekOfMonth(weekday=weekday) assert offset.is_on_offset(dt) == expected + @pytest.mark.parametrize( + "n,weekday,date,tz", + [ + (4, 6, "1917-05-27 20:55:27.084284178+0200", "Europe/Warsaw"), + (-4, 5, "2005-08-27 05:01:42.799392561-0500", "America/Rainy_River"), + ], + ) + def test_last_week_of_month_on_offset(self, n, weekday, date, tz): + # GH 19036, GH 18977 _adjust_dst was incorrect for LastWeekOfMonth + offset = LastWeekOfMonth(n=n, weekday=weekday) + ts = Timestamp(date, tz=tz) + slow = (ts + offset) - offset == ts + fast = offset.is_on_offset(ts) + assert fast == slow + def test_repr(self): assert ( repr(LastWeekOfMonth(n=2, weekday=1)) == "<2 * LastWeekOfMonths: weekday=1>" diff --git a/pandas/tests/window/test_api.py 
b/pandas/tests/window/test_api.py index e70d079739003..7a5fcebfd23d7 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -68,7 +68,10 @@ def tests_skip_nuisance(): def test_skip_sum_object_raises(): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) - result = r.sum() + msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#42738 + result = r.sum() expected = DataFrame( {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, columns=list("AB"), diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 977ce281c4b33..d2a3be88eb27b 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -116,8 +116,10 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): data = np.arange(10.0) data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) - result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() - expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() + with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + # GH#42738 + result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() + expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 03b43026c9a6c..2523ec585a491 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -923,7 +923,12 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - expected = df.groupby("A").apply(lambda x: getattr(x.ewm(com=1.0), method)()) + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 + expected = df.groupby("A").apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) + # There may be a bug in the above statement; not returning the correct index tm.assert_frame_equal(result.reset_index(drop=True), expected) @@ -955,7 +960,9 @@ def test_pairwise_methods(self, method, expected_data): def test_times(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 + result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() expected = DataFrame( { "B": [ @@ -992,22 +999,23 @@ def test_times(self, times_frame): def test_times_vs_apply(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() - expected = ( - times_frame.groupby("A") - .apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) - .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] - .reset_index(drop=True) - ) + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 + result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() + expected = ( + times_frame.groupby("A") + .apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) + .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] + .reset_index(drop=True) + ) tm.assert_frame_equal(result.reset_index(drop=True), expected) def test_times_array(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() - expected = ( - 
times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"].values) - .mean() - ) + gb = times_frame.groupby("A") + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 + result = gb.ewm(halflife=halflife, times="C").mean() + expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index a8ec9086e6b02..f507b6a465f5b 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -170,26 +170,39 @@ def test_invalid_engine_kwargs(self, grouper): engine="cython", engine_kwargs={"nopython": True} ) - @pytest.mark.parametrize( - "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] - ) + @pytest.mark.parametrize("grouper", ["None", "groupby"]) def test_cython_vs_numba( self, grouper, nogil, parallel, nopython, ignore_na, adjust ): + if grouper == "None": + grouper = lambda x: x + warn = FutureWarning + else: + grouper = lambda x: x.groupby("A") + warn = None + df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + with tm.assert_produces_warning(warn, match="nuisance"): + # GH#42738 + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] - ) + @pytest.mark.parametrize("grouper", ["None", "groupby"]) def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 + + if grouper == "None": + grouper = lambda x: x + warn = FutureWarning + else: + grouper = lambda x: x.groupby("A") + warn = None + halflife = "23 days" times = to_datetime( [ @@ -207,8 +220,11 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + + with tm.assert_produces_warning(warn, match="nuisance"): + # GH#42738 + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected)
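A note on the recurring GH#42738 pattern in the window test changes above: reductions that silently drop nuisance (non-numeric) columns now emit a FutureWarning, so each affected call is wrapped in tm.assert_produces_warning. A minimal sketch of the pattern, using an illustrative frame that is not taken from these tests:

```python
import pandas as pd
import pandas._testing as tm

# "B" is a nuisance column for a rolling sum: it is dropped with a FutureWarning
df = pd.DataFrame({"A": range(5), "B": list("abcde")})

# assert_produces_warning fails if the expected warning is *not* raised,
# so the deprecation itself is under test, not just the result
with tm.assert_produces_warning(FutureWarning, match="nuisance"):
    result = df.rolling(window=2).sum()

assert list(result.columns) == ["A"]  # only the numeric column survives
```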