From b28456a7926fa53ca88a39a3a714eeb0c9260d7b Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Wed, 25 Sep 2019 12:51:44 +0900 Subject: [PATCH 01/19] BUG: value_counts can handle the case even with empty groups (#28479) * If applying rep to recons_labels fails, use ids which have no consecutive duplicates instead. --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 9 +++++++- pandas/tests/groupby/test_value_counts.py | 27 ++++++++++++++++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7ca93d7d75854..fbda7011e066d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -299,6 +299,7 @@ Other - Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) +- :meth:`SeriesGroupBy.value_counts` can now handle the case when the :class:`Grouper` makes empty groups (:issue:`28479`) .. _whatsnew_1000.contributors: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f8f1455561c03..fbbcfc06bbdc3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1259,7 +1259,14 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + try: + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + except ValueError: + # If applying rep to recons_labels fails, use ids which have no + # consecutive duplicates instead.
+ _ids_idx = np.ones(len(ids), dtype=bool) + _ids_idx[1:] = ids[1:] != ids[:-1] + labels = list(map(rep, [ids[_ids_idx]])) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index f8bd8843ab7e3..b7236ab491011 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series, date_range +from pandas import DataFrame, MultiIndex, Series, date_range, Grouper from pandas.util import testing as tm @@ -79,3 +79,28 @@ def rebuild_index(df): # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 tm.assert_series_equal(left.sort_index(), right.sort_index()) + + +@pytest.mark.parametrize( + "freq, size, frac", product(["1D", "2D", "1W", "1Y"], [100, 1000], [0.1, 0.5, 1]) +) +def test_series_groupby_value_counts_with_grouper(freq, size, frac): + np.random.seed(42) + + df = DataFrame.from_dict( + { + "date": date_range("2019-09-25", periods=size), + "name": np.random.choice(list("abcd"), size), + } + ).sample(frac=frac) + + gr = df.groupby(Grouper(key="date", freq=freq))["name"] + + # have to sort on index because of unstable sort on values xref GH9212 + result = gr.value_counts().sort_index() + expected = gr.apply(Series.value_counts).sort_index() + expected.index.names = ( + result.index.names + ) # .apply(Series.value_counts) can't create all names + + tm.assert_series_equal(result, expected) From 3ef5e8a445bdd55481f38404e2fe194a94535416 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Thu, 26 Sep 2019 22:57:22 +0900 Subject: [PATCH 02/19] . --- pandas/tests/groupby/test_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index b7236ab491011..d1470c0eb1e70 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series, date_range, Grouper +from pandas import DataFrame, Grouper, MultiIndex, Series, date_range from pandas.util import testing as tm From 40475e8c5ca3381ef1ff35d53db13fab25d611c9 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Fri, 27 Sep 2019 17:58:09 +0900 Subject: [PATCH 03/19] removing consecutive duplicates was the same as just unique --- pandas/core/groupby/generic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fbbcfc06bbdc3..ae851110af800 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1262,11 +1262,8 @@ def value_counts( try: labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] except ValueError: - # If applying rep to recons_labels fails, use ids which have no - # consecutive duplicates instead.
- _ids_idx = np.ones(len(ids), dtype=bool) - _ids_idx[1:] = ids[1:] != ids[:-1] - labels = list(map(rep, [ids[_ids_idx]])) + [llab(lab, inc)] + # If applying rep to recons_labels fails, use unique ids + labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] From f1f104a606b9e2bc3acaa2310ecf316d0ee056ad Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Sat, 28 Sep 2019 18:44:17 +0900 Subject: [PATCH 04/19] get the performance while handling the exception explicitly --- pandas/core/groupby/generic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ae851110af800..32db190f30880 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -54,6 +54,7 @@ _transform_template, groupby, ) +from pandas.core.groupby.ops import BinGrouper from pandas.core.index import Index, MultiIndex, _all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -1262,8 +1263,11 @@ def value_counts( try: labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] except ValueError: - # If applying rep to recons_labels fails, use unique ids - labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] + # If applying rep to recons_labels fails and that's because of empty periods, + is_len_different = len(self.grouper.binlabels) != len(self.grouper.indices) + if isinstance(self.grouper, BinGrouper) and is_len_different: + # then use unique ids instead of self.grouper.recons_labels + labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] From 6fbaaa6b13f257d26f6522ddbc4b2af7f4c7af9b Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Thu, 3 Oct 2019 09:56:41 +0900 Subject: [PATCH 05/19] get rid of try-except --- pandas/core/groupby/generic.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 32db190f30880..baadfe198ffa4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1260,14 +1260,12 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - try: + if isinstance(self.grouper, BinGrouper) and ( + len(self.grouper.binlabels) != len(self.grouper.indices) + ): + labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] + else: + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] - except ValueError: - # If applying rep to recons_labels fails and that's because of empty periods, - is_len_different = len(self.grouper.binlabels) != len(self.grouper.indices) - if isinstance(self.grouper, BinGrouper) and is_len_different: - # then use unique ids instead of self.grouper.recons_labels - labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] From 3a7f71e806ea26c85b36322f5765f96d9c16a877 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Thu, 3 Oct 2019 10:16:25 +0900 Subject: [PATCH 06/19] make test more idiomatic --- pandas/tests/groupby/test_value_counts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git
a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index d1470c0eb1e70..70d6b3db1923e 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -81,9 +81,9 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -@pytest.mark.parametrize( - "freq, size, frac", product(["1D", "2D", "1W", "1Y"], [100, 1000], [0.1, 0.5, 1]) -) +@pytest.mark.parametrize("freq", ["1D", "2D", "1W", "1Y"]) +@pytest.mark.parametrize("size", [100, 1000]) +@pytest.mark.parametrize("frac", [0.1, 0.5, 1]) def test_series_groupby_value_counts_with_grouper(freq, size, frac): np.random.seed(42) From d101a730da855eecf14a9b20aae7cc257f2ff265 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Mon, 14 Oct 2019 17:24:29 +0900 Subject: [PATCH 07/19] Merge origin/master into fix-GH28479-1 --- pandas/core/groupby/generic.py | 8 +++++++- pandas/tests/groupby/test_value_counts.py | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 068d5e5275f0d..598f13de309b4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -53,6 +53,7 @@ _transform_template, groupby, ) +from pandas.core.groupby.ops import BinGrouper from pandas.core.index import Index, MultiIndex, _all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -639,7 +640,12 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + if isinstance(self.grouper, BinGrouper) and ( + len(self.grouper.binlabels) != len(self.grouper.indices) + ): + labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] + else: + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index d1470c0eb1e70..70d6b3db1923e 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -81,9 +81,9 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -@pytest.mark.parametrize( - "freq, size, frac", product(["1D", "2D", "1W", "1Y"], [100, 1000], [0.1, 0.5, 1]) -) +@pytest.mark.parametrize("freq", ["1D", "2D", "1W", "1Y"]) +@pytest.mark.parametrize("size", [100, 1000]) +@pytest.mark.parametrize("frac", [0.1, 0.5, 1]) def test_series_groupby_value_counts_with_grouper(freq, size, frac): np.random.seed(42) From 800560351ca4ab9203f8e8858a83b87f9014e285 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Sat, 19 Oct 2019 19:08:08 +0900 Subject: [PATCH 08/19] move the logic into the BinGrouper.recons_labels --- .travis.yml | 13 +- ci/build38.sh | 19 + ci/setup_env.sh | 5 + doc/source/getting_started/install.rst | 2 +- doc/source/user_guide/advanced.rst | 36 +- doc/source/user_guide/io.rst | 34 +- doc/source/user_guide/reshaping.rst | 16 +- doc/source/whatsnew/v0.25.2.rst | 79 +- doc/source/whatsnew/v1.0.0.rst | 40 +- pandas/_libs/algos_rank_helper.pxi.in | 424 +++++----- pandas/_libs/algos_take_helper.pxi.in | 43 +- pandas/_libs/groupby.pyx | 733 +++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 670 ---------------- pandas/_libs/index.pyx | 50 +- pandas/_libs/intervaltree.pxi.in | 4 +- 
pandas/_libs/lib.pyx | 25 +- pandas/_libs/reduction.pyx | 2 +- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/_libs/tslibs/timezones.pyx | 7 +- pandas/compat/numpy/__init__.py | 1 + pandas/core/accessor.py | 6 +- pandas/core/algorithms.py | 79 +- pandas/core/apply.py | 46 +- pandas/core/arrays/base.py | 13 +- pandas/core/arrays/categorical.py | 6 +- pandas/core/base.py | 22 +- pandas/core/frame.py | 69 +- pandas/core/generic.py | 2 +- pandas/core/groupby/generic.py | 110 ++- pandas/core/groupby/groupby.py | 47 +- pandas/core/groupby/ops.py | 18 +- pandas/core/indexes/base.py | 37 +- pandas/core/indexes/category.py | 8 - pandas/core/indexes/multi.py | 22 +- pandas/core/indexes/period.py | 10 +- pandas/core/indexes/timedeltas.py | 3 +- pandas/core/internals/blocks.py | 19 +- pandas/core/resample.py | 21 +- pandas/core/reshape/reshape.py | 17 +- pandas/core/series.py | 4 +- pandas/core/sorting.py | 5 +- pandas/io/formats/format.py | 5 +- pandas/io/json/_table_schema.py | 6 +- pandas/tests/computation/test_eval.py | 9 +- pandas/tests/extension/list/__init__.py | 3 + pandas/tests/extension/list/array.py | 133 ++++ pandas/tests/extension/list/test_list.py | 30 + pandas/tests/frame/test_apply.py | 11 - pandas/tests/frame/test_convert_to.py | 4 +- pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/test_categorical.py | 12 +- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 10 + pandas/tests/indexes/multi/test_astype.py | 2 +- .../tests/indexes/multi/test_constructor.py | 2 + pandas/tests/indexes/multi/test_names.py | 35 +- pandas/tests/indexes/multi/test_reindex.py | 10 +- pandas/tests/indexes/multi/test_reshape.py | 1 + pandas/tests/indexes/test_category.py | 20 +- pandas/tests/indexing/test_categorical.py | 71 +- pandas/tests/indexing/test_coercion.py | 3 +- pandas/tests/io/formats/test_to_html.py | 8 + pandas/tests/io/json/test_ujson.py | 5 +- pandas/tests/io/parser/conftest.py | 9 +- pandas/tests/io/test_sql.py | 4 +- pandas/tests/plotting/test_backend.py | 2 +- pandas/tests/reshape/test_concat.py | 6 +- pandas/tests/reshape/test_reshape.py | 5 +- pandas/tests/series/test_analytics.py | 13 + pandas/tests/series/test_operators.py | 35 +- pandas/tests/test_base.py | 6 + pandas/tests/test_multilevel.py | 22 +- .../offsets/test_offsets_properties.py | 4 +- pandas/tests/util/test_assert_frame_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 2 +- pandas/util/testing.py | 11 +- scripts/tests/test_validate_docstrings.py | 19 + scripts/validate_docstrings.py | 24 +- setup.cfg | 24 - setup.py | 4 +- 80 files changed, 1877 insertions(+), 1468 deletions(-) create mode 100644 ci/build38.sh delete mode 100644 pandas/_libs/groupby_helper.pxi.in create mode 100644 pandas/tests/extension/list/__init__.py create mode 100644 pandas/tests/extension/list/array.py create mode 100644 pandas/tests/extension/list/test_list.py diff --git a/.travis.yml b/.travis.yml index 79fecc41bec0d..b9fa06304d387 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,12 @@ matrix: - python: 3.5 include: + - dist: bionic + # 18.04 + python: 3.8-dev + env: + - JOB="3.8-dev" PATTERN="(not slow and not network)" + - dist: trusty env: - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)" @@ -71,6 +77,7 @@ before_install: # This overrides travis and tells it to look nowhere. 
- export BOTO_CONFIG=/dev/null + install: - echo "install start" - ci/prep_cython_cache.sh @@ -78,17 +85,19 @@ install: - ci/submit_cython_cache.sh - echo "install done" + before_script: # display server (for clipboard functionality) needs to be started here, # does not work if done in install:setup_env.sh (GH-26103) - export DISPLAY=":99.0" - echo "sh -e /etc/init.d/xvfb start" - - sh -e /etc/init.d/xvfb start + - if [ "$JOB" != "3.8-dev" ]; then sh -e /etc/init.d/xvfb start; fi - sleep 3 script: - echo "script start" - - source activate pandas-dev + - echo "$JOB" + - if [ "$JOB" != "3.8-dev" ]; then source activate pandas-dev; fi - ci/run_tests.sh after_script: diff --git a/ci/build38.sh b/ci/build38.sh new file mode 100644 index 0000000000000..903016536d240 --- /dev/null +++ b/ci/build38.sh @@ -0,0 +1,19 @@ +#!/bin/bash -e +# Special build for python3.8 until numpy puts its own wheels up + +sudo apt-get install build-essential gcc xvfb +pip install --no-deps -U pip wheel setuptools +pip install python-dateutil pytz cython pytest pytest-xdist hypothesis + +# Possible alternative for getting numpy: +pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy + +python setup.py build_ext -inplace +python -m pip install --no-build-isolation -e . + +python -c "import sys; print(sys.version_info)" +python -c "import pandas as pd" +python -c "import hypothesis" + +# TODO: Is there anything else in setup_env that we really want to do? +# ci/setup_env.sh diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 382491a947488..794130355fd74 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,5 +1,9 @@ #!/bin/bash -e +if [ "$JOB" == "3.8-dev" ]; then + /bin/bash ci/build38.sh + exit 0 +fi # edit the locale file if needed if [ -n "$LOCALE_OVERRIDE" ]; then @@ -51,6 +55,7 @@ echo echo "update conda" conda config --set ssl_verify false conda config --set quiet true --set always_yes true --set changeps1 false +conda install pip # create conda to create a historical artifact for pip & setuptools conda update -n base conda echo "conda info -a" diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index fc99b458fa0af..7d1150c2f65fa 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.5.3 and above, 3.6, and 3.7. +Officially Python 3.5.3 and above, 3.6, 3.7, and 3.8. Installing pandas ----------------- diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 62a9b6396404a..4949dd580414f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df2.reindex(['a', 'e']) - df2.reindex(['a', 'e']).index - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index + df3 = pd.DataFrame({'A': np.arange(3), + 'B': pd.Series(list('abc')).astype('category')}) + df3 = df3.set_index('B') + df3 + +.. ipython:: python + + df3.reindex(['a', 'e']) + df3.reindex(['a', 'e']).index + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index .. 
warning:: Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories or a ``TypeError`` will be raised. - .. code-block:: ipython + .. ipython:: python - In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df4 = pd.DataFrame({'A': np.arange(2), + 'B': list('ba')}) + df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) + df4 = df4.set_index('B') + df4.index - In [11]: df3 = df3.set_index('B') + df5 = pd.DataFrame({'A': np.arange(2), + 'B': list('bc')}) + df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) + df5 = df5.set_index('B') + df5.index - In [11]: df3.index - Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category') + .. code-block:: ipython - In [12]: pd.concat([df2, df3]) - TypeError: categories must match existing categories when appending + In [1]: pd.concat([df4, df5]) + TypeError: categories must match existing categories when appending .. _indexing.rangeindex: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ee097c1f4d5e8..6b23c814843e1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3811,6 +3811,8 @@ storing/selecting from homogeneous index ``DataFrames``. # the levels are automatically included as data columns store.select('df_mi', 'foo=bar') +.. note:: + The ``index`` keyword is reserved and cannot be use as a level name. .. _io.hdf5-query: @@ -3829,6 +3831,7 @@ A query is specified using the ``Term`` class under the hood, as a boolean expre * ``index`` and ``columns`` are supported indexers of ``DataFrames``. * if ``data_columns`` are specified, these can be used as additional indexers. +* level name in a MultiIndex, with default name ``level_0``, ``level_1``, … if not provided. Valid comparison operators are: @@ -3947,7 +3950,7 @@ space. These are in terms of the total number of rows in a table. .. _io.hdf5-timedelta: -Using timedelta64[ns] +Query timedelta64[ns] +++++++++++++++++++++ You can store and query using the ``timedelta64[ns]`` type. Terms can be @@ -3966,6 +3969,35 @@ specified in the format: ``()``, where float may be signed (and fra store.append('dftd', dftd, data_columns=True) store.select('dftd', "C<'-3.5D'") +Query MultiIndex +++++++++++++++++ + +Selecting from a ``MultiIndex`` can be achieved by using the name of the level. + +.. ipython:: python + + df_mi.index.names + store.select('df_mi', "foo=baz and bar=two") + +If the ``MultiIndex`` levels names are ``None``, the levels are automatically made available via +the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to select from. + +.. 
ipython:: python + + index = pd.MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + ) + df_mi_2 = pd.DataFrame(np.random.randn(10, 3), + index=index, columns=["A", "B", "C"]) + df_mi_2 + + store.append("df_mi_2", df_mi_2) + + # the levels are automatically included as data columns with keyword level_n + store.select("df_mi_2", "level_0=foo and level_1=two") + + Indexing ++++++++ diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index dd6d3062a8f0a..b2ee252495f23 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -728,14 +728,14 @@ Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, ``row`` values are the index, and the mean of ``val0`` are the values? In particular, the resulting DataFrame should look like: -.. note:: - - col col0 col1 col2 col3 col4 - row - row0 0.77 0.605 NaN 0.860 0.65 - row2 0.13 NaN 0.395 0.500 0.25 - row3 NaN 0.310 NaN 0.545 NaN - row4 NaN 0.100 0.395 0.760 0.24 +.. code-block:: text + + col col0 col1 col2 col3 col4 + row + row0 0.77 0.605 NaN 0.860 0.65 + row2 0.13 NaN 0.395 0.500 0.25 + row3 NaN 0.310 NaN 0.545 NaN + row4 NaN 0.100 0.395 0.760 0.24 This solution uses :func:`~pandas.pivot_table`. Also note that ``aggfunc='mean'`` is the default. It is included here to be explicit. diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 9789c9fce3541..a99751f9bab9f 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -1,101 +1,38 @@ .. _whatsnew_0252: -What's new in 0.25.2 (October XX, 2019) +What's new in 0.25.2 (October 15, 2019) --------------------------------------- These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog including other versions of pandas. +.. note:: + + Pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). + .. _whatsnew_0252.bug_fixes: Bug fixes ~~~~~~~~~ -Categorical -^^^^^^^^^^^ - -- - -Datetimelike -^^^^^^^^^^^^ - -- -- -- - -Timezones -^^^^^^^^^ - -- - -Numeric -^^^^^^^ - -- -- -- -- - -Conversion -^^^^^^^^^^ - -- - -Interval -^^^^^^^^ - -- - Indexing ^^^^^^^^ -- Fix regression in :meth:`DataFrame.reindex` not following ``limit`` argument (:issue:`28631`). +- Fix regression in :meth:`DataFrame.reindex` not following the ``limit`` argument (:issue:`28631`). - Fix regression in :meth:`RangeIndex.get_indexer` for decreasing :class:`RangeIndex` where target values may be improperly identified as missing/present (:issue:`28678`) -- -- - -Missing -^^^^^^^ - -- I/O ^^^ -- Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). +- Fix regression in notebook display where ```` tags were missing for :attr:`DataFrame.index` values (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) -- -- - -Plotting -^^^^^^^^ - -- -- -- +- Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). 
- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) -- -- -- - -Reshaping -^^^^^^^^^ - -- -- -- -- -- - -Sparse -^^^^^^ - -- Other ^^^^^ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0001dbf188620..d13c815f0b829 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -109,6 +109,7 @@ Other enhancements (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) +- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) Build Changes ^^^^^^^^^^^^^ @@ -123,7 +124,37 @@ source, you should no longer need to install Cython into your build environment Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). +.. _whatsnew_1000.api_breaking.MultiIndex._names: + +``MultiIndex.levels`` do not hold level names any longer +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- A :class:`MultiIndex` previously stored the level names as attributes of each of its + :attr:`MultiIndex.levels`. From Pandas 1.0, the names are only accessed through + :attr:`MultiIndex.names` (which was also possible previously). This is done in order to + make :attr:`MultiIndex.levels` more similar to :attr:`CategoricalIndex.categories` (:issue:`27242`:). + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + Out[2]: mi + MultiIndex([(1, 'a'), + (1, 'b'), + (2, 'a'), + (2, 'b')], + names=['x', 'y']) + Out[3]: mi.levels[0].name + 'x' + +*pandas 1.0.0* + +.. ipython:: python + + mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi.levels[0].name + - :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) *pandas 0.25.x* @@ -149,6 +180,7 @@ Backwards incompatible API changes Other API changes ^^^^^^^^^^^^^^^^^ +- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). @@ -162,6 +194,7 @@ Documentation Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). +- Added sub-section Query MultiIndex in IO tools user guide (:issue:`28791`) .. _whatsnew_1000.deprecations: @@ -194,6 +227,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. 
(:issue:`23601`) - Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) +- Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) - .. _whatsnew_1000.performance: @@ -221,6 +255,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) +- :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) - Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) @@ -290,6 +325,9 @@ Indexing - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) +- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) +- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) +- Missing ^^^^^^^ diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 5dac94394c7ed..d5a31b6a13010 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -8,24 +8,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # rank_1d, rank_2d # ---------------------------------------------------------------------- -{{py: - -# dtype ctype pos_nan_value neg_nan_value -dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), - ('float64', 'float64_t', 'np.inf', '-np.inf'), - ('uint64', 'uint64_t', '', ''), - ('int64', 'int64_t', 'np.iinfo(np.int64).max', - 'np.iinfo(np.int64).min')] - -}} - -{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d_{{dtype}}(object in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -33,85 +26,86 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - {{if dtype == 'object'}} - ndarray sorted_data, values - {{else}} - ndarray[{{ctype}}] sorted_data, values - {{endif}} + ndarray[rank_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value 
float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - bint isnan + bint isnan, condition float64_t count = 0.0 + tiebreak = tiebreakers[ties_method] - {{if dtype == 'float64'}} - values = np.asarray(in_arr).copy() - {{elif dtype == 'object'}} - values = np.array(in_arr, copy=True) + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) - if values.dtype != np.object_: - values = values.astype('O') - {{else}} - values = np.asarray(in_arr) - {{endif}} + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) keep_na = na_option == 'keep' - {{if dtype == 'object'}} - mask = missing.isnaobj(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - {{endif}} + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() # double sort first by mask and then by values to ensure nan values are # either at the beginning or the end. mask/(~mask) controls padding at # tail or the head - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - order = (values, mask) + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) else: - nan_value = {{neg_nan_value}} - order = (values, ~mask) - np.putmask(values, mask, nan_value) - {{else}} - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - {{endif}} + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) n = len(values) ranks = np.empty(n, dtype='f8') - {{if dtype == 'object'}} - _as = np.lexsort(keys=order) - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here + if rank_t is object: _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = np.lexsort(keys=order) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) if not ascending: _as = _as[::-1] @@ -122,38 +116,32 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') - {{if dtype == 'object'}} - if True: - {{else}} - with nogil: - {{endif}} - # TODO: why does the 2d version not have a nogil block? 
+ if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil for i in range(n): sum_ranks += i + 1 dups += 1 - {{if dtype == 'object'}} - val = util.get_value_at(sorted_data, i) - {{else}} val = sorted_data[i] - {{endif}} - {{if dtype != 'uint64'}} - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - {{endif}} + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue count += 1.0 - {{if dtype == 'object'}} - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val) or - i == non_na_idx): - {{else}} - if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx): - {{endif}} + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -165,13 +153,12 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported for ' - 'non-numeric data') - {{else}} - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 @@ -180,6 +167,60 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? 
+ for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: if tiebreak == TIEBREAK_DENSE: return ranks / total_tie_count @@ -189,8 +230,14 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', return ranks -def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -198,138 +245,130 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - {{if dtype == 'object'}} Py_ssize_t infs - {{endif}} ndarray[float64_t, ndim=2] ranks - {{if dtype == 'int64' or dtype == 'uint64'}} - ndarray[{{ctype}}, ndim=2, cast=True] values - {{else}} - ndarray[{{ctype}}, ndim=2] values - {{endif}} + ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 float64_t count = 0.0 + bint condition, skip_condition tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' - in_arr = np.asarray(in_arr) - if axis == 0: - values = in_arr.T.copy() + values = np.asarray(in_arr).T.copy() else: - values = in_arr.copy() - - {{if dtype == 'object'}} - if values.dtype != np.object_: - values = values.astype('O') - {{endif}} + values = np.asarray(in_arr).copy() - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - else: - nan_value = {{neg_nan_value}} + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') - {{if dtype == 'object'}} - mask = missing.isnaobj2d(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT - {{endif}} + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max - 
np.putmask(values, mask, nan_value) - {{endif}} + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') - {{if dtype == 'object'}} - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks else: - _as = values.argsort(1) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) if not ascending: _as = _as[:, ::-1] - values = _take_2d_{{dtype}}(values, _as) + values = _take_2d(values, _as) argsorted = _as.astype('i8') for i in range(n): - {{if dtype == 'object'}} - dups = sum_ranks = infs = 0 - {{else}} - dups = sum_ranks = 0 - {{endif}} + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 total_tie_count = 0 count = 0.0 for j in range(k): - {{if dtype != 'object'}} - sum_ranks += j + 1 - dups += 1 - {{endif}} + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 val = values[i, j] - {{if dtype != 'uint64'}} - {{if dtype == 'object'}} - if (val is nan_value) and keep_na: - {{else}} - if (val == nan_value) and keep_na: - {{endif}} - ranks[i, argsorted[i, j]] = NaN + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN - {{if dtype == 'object'}} - infs += 1 - {{endif}} + if rank_t is object: + infs += 1 - continue - {{endif}} + continue count += 1.0 - {{if dtype == 'object'}} - sum_ranks += (j - infs) + 1 - dups += 1 - {{endif}} + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 - {{if dtype == 'object'}} - if j == k - 1 or are_diff(values[i, j + 1], val): - {{else}} - if j == k - 1 or values[i, j + 1] != val: - {{endif}} + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups @@ -340,13 +379,12 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported ' - 'for non-numeric data') - {{else}} - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - {{endif}} + if rank_t is object: + raise 
ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 @@ -365,4 +403,8 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', else: return ranks -{{endfor}} + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 3a3adc71875ed..e7ee212065c5b 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -12,26 +12,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False, False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), - ('object', 'object', 'object', 'object', '', '', False, False)] + 'True if ', ' > 0 else False', False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), + ('object', 'object', 'object', 'object', '', '', False)] def get_dispatch(dtypes): @@ -118,7 +118,9 @@ def get_dispatch(dtypes): """ for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy, nogil) in dtypes: + can_copy) in dtypes: + + nogil = c_type_out != 
"object" if nogil: nogil_str = "with nogil:" tab = ' ' @@ -276,7 +278,6 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): Py_ssize_t i, j, N, K ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx ndarray[take_t, ndim=2] result - object val N, K = (values).shape diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3069bbbf34bb7..8a417d8fe3a92 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -8,9 +8,11 @@ import numpy as np cimport numpy as cnp from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t) + uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) cnp.import_array() +cdef extern from "numpy/npy_math.h": + float64_t NAN "NPY_NAN" from pandas._libs.util cimport numeric, get_nat @@ -21,6 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() +_int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN @@ -372,7 +375,8 @@ def group_any_all(uint8_t[:] out, const uint8_t[:] mask, object val_test, bint skipna): - """Aggregated boolean values to show truthfulness of group elements + """ + Aggregated boolean values to show truthfulness of group elements. Parameters ---------- @@ -420,16 +424,23 @@ def group_any_all(uint8_t[:] out, if values[i] == flag_val: out[lab] = flag_val + # ---------------------------------------------------------------------- # group_add, group_prod, group_var, group_mean, group_ohlc # ---------------------------------------------------------------------- +ctypedef fused complexfloating_t: + float64_t + float32_t + complex64_t + complex128_t + @cython.wraparound(False) @cython.boundscheck(False) -def _group_add(floating[:, :] out, +def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - floating[:, :] values, + complexfloating_t[:, :] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -437,13 +448,14 @@ def _group_add(floating[:, :] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, :] sumx, nobs + complexfloating_t val, count + complexfloating_t[:, :] sumx + int64_t[:, :] nobs if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((len(out), out.shape[1]), dtype=np.int64) sumx = np.zeros_like(out) N, K = (values).shape @@ -461,7 +473,12 @@ def _group_add(floating[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + if (complexfloating_t is complex64_t or + complexfloating_t is complex128_t): + # clang errors if we use += with these dtypes + sumx[lab, j] = sumx[lab, j] + val + else: + sumx[lab, j] += val for i in range(ncounts): for j in range(K): @@ -471,8 +488,10 @@ def _group_add(floating[:, :] out, out[i, j] = sumx[i, j] -group_add_float32 = _group_add['float'] -group_add_float64 = _group_add['double'] +group_add_float32 = _group_add['float32_t'] +group_add_float64 = _group_add['float64_t'] +group_add_complex64 = _group_add['float complex'] +group_add_complex128 = _group_add['double complex'] @cython.wraparound(False) @@ -491,7 +510,7 @@ def _group_prod(floating[:, :] out, floating[:, :] prodx, nobs if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) prodx = np.ones_like(out) @@ -541,7 +560,7 @@ def _group_var(floating[:, 
:] out, assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) mean = np.zeros_like(out) @@ -596,7 +615,7 @@ def _group_mean(floating[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") + raise ValueError("len(index) != len(labels)") nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -788,5 +807,687 @@ def group_quantile(ndarray[float64_t] out, grp_start += grp_sz -# generated from template -include "groupby_helper.pxi" +# ---------------------------------------------------------------------- +# group_nth, group_last, group_rank +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + float64_t + float32_t + int64_t + uint64_t + object + + +cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: + if rank_t is object: + # Should never be used, but we need to avoid the `val != val` below + # or else cython will raise about gil acquisition. + raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. 
+ raise RuntimeError("empty group with uint64_t") + + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """ + Provides the rank of values within each group. 
+ + Parameters + ---------- + out : array of float64_t values which this method will write its results to + values : array of rank_t values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + is_datetimelike : bool, default False + unused in this method but provided for call compatibility with other + Cython transformations + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default + 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' + pct : boolean, default False + Compute percentage rank of data within each group + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + TiebreakEnumType tiebreak + Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + ndarray[int64_t] _as + ndarray[float64_t, ndim=2] grp_sizes + ndarray[rank_t] masked_vals + ndarray[uint8_t] mask + bint keep_na + rank_t nan_fill_val + + if rank_t is object: + raise NotImplementedError("Cant do nogil") + + tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + N, K = (values).shape + grp_sizes = np.ones_like(out) + + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array + masked_vals = np.array(values[:, 0], copy=True) + if rank_t is int64_t: + mask = (masked_vals == NPY_NAT).astype(np.uint8) + else: + mask = np.isnan(masked_vals).astype(np.uint8) + + if ascending ^ (na_option == 'top'): + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max + else: + nan_fill_val = np.inf + order = (masked_vals, mask, labels) + else: + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).min + elif rank_t is uint64_t: + nan_fill_val = 0 + else: + nan_fill_val = -np.inf + + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) + + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + _as = np.lexsort(order).astype(np.int64, copy=False) + + if not ascending: + _as = _as[::-1] + + with nogil: + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays + for i in range(N): + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the the starting index of the current group (grp_start) + # and the current index + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]]) or + (labels[_as[i]] != labels[_as[i+1]])): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 - grp_start + else: + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). also be + # sure to reset any of the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (i - grp_start + 1 - + grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + + if pct: + for i in range(N): + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. 
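# A minimal usage sketch of the tie-breaking and pct behaviour this rank
# kernel sits behind, written against the public groupby API. The Series
# contents and grouping key below are illustrative, not taken from the patch.
import pandas as pd

s = pd.Series([3.0, 3.0, 1.0, 2.0, 2.0])
key = pd.Series(["a", "a", "a", "b", "b"])

s.groupby(key).rank(method="average")  # ties share their mean rank: 2.5, 2.5, 1.0, 1.5, 1.5
s.groupby(key).rank(method="dense")    # rank grows by 1 per distinct value within the group
s.groupby(key).rank(pct=True)          # each rank divided by the group's non-NaN size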
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 + + +# ---------------------------------------------------------------------- +# group_min, group_max +# ---------------------------------------------------------------------- + +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t + uint64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 + else: + maxx[:] = -np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = maxx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max + else: + minx[:] = np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? 
+ nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = minx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative minimum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummin in. + values : array + Values to take cummin of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max + else: + accum[:] = np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummax in. + values : array + Values to take cummax of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. 
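# A short sketch of the user-facing behaviour the cummin/cummax kernels sit
# behind, including the NaN pass-through handled by _treat_as_na. The frame
# below is illustrative, not taken from the patch.
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "a", "b"],
                   "val": [3.0, np.nan, 5.0, 1.0, 2.0]})

df.groupby("key")["val"].cummin()
# expected: 3.0, NaN, 5.0, 1.0, 2.0 -- the NaN is written through unchanged
# and does not update the running minimum for group "a"
df.groupby("key")["val"].cummax()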
+ """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 + else: + accum[:] = -np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in deleted file mode 100644 index 6b434b6470581..0000000000000 --- a/pandas/_libs/groupby_helper.pxi.in +++ /dev/null @@ -1,670 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - float64_t NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -# ---------------------------------------------------------------------- -# group_nth, group_last, group_rank -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - float64_t - float32_t - int64_t - object - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates 
on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if rank_t is int64_t: - # need a special notna check - if val != NPY_NAT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - else: - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_rank(float64_t[:, :] out, - rank_t[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - """ - Provides the rank of values within each group. 
- - Parameters - ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - is_datetimelike : bool, default False - unused in this method but provided for call compatibility with other - Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - ascending : boolean, default True - False for ranks by high (1) to low (N) - na_option : {'keep', 'top', 'bottom'}, default 'keep' - pct : boolean, default False - Compute percentage rank of data within each group - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - - -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = group_rank["int64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - -# ---------------------------------------------------------------------- -# group_min, group_max -# ---------------------------------------------------------------------- - -# TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max - nan_val = NPY_NAT - else: - maxx[:] = -np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if groupby_t is int64_t: - if val != nan_val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - if val == val and val != nan_val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan_val - else: - out[i, j] = maxx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - else: - minx[:] = np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if groupby_t is int64_t: - if val != nan_val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - if val == val and val != nan_val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative minimum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummin in. 
- values : array - Values to take cummin of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = _int64_max - else: - accum[:] = np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - # val = nan - if groupby_t is int64_t: - if is_datetimelike and val == NPY_NAT: - out[i, j] = NPY_NAT - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - else: - if val == val: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative maximum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummax in. - values : array - Values to take cummax of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = -_int64_max - else: - accum[:] = -np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if groupby_t is int64_t: - if is_datetimelike and val == NPY_NAT: - out[i, j] = NPY_NAT - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval - else: - if val == val: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 979dad6db0838..144d555258c50 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -41,11 +41,13 @@ cdef inline bint is_definitely_invalid_key(object val): cpdef get_value_at(ndarray arr, object loc, object tz=None): + obj = util.get_value_at(arr, loc) + if arr.descr.type_num == NPY_DATETIME: - return Timestamp(util.get_value_at(arr, loc), tz=tz) + return Timestamp(obj, tz=tz) elif arr.descr.type_num == NPY_TIMEDELTA: - return Timedelta(util.get_value_at(arr, loc)) - return util.get_value_at(arr, loc) + return Timedelta(obj) + return obj # Don't populate hash tables in monotonic indexes larger than this @@ -102,6 +104,9 @@ cdef class IndexEngine: arr[loc] = value cpdef get_loc(self, object val): + cdef: + Py_ssize_t loc + if is_definitely_invalid_key(val): raise TypeError("'{val}' is an invalid key".format(val=val)) @@ -114,7 +119,7 @@ cdef class IndexEngine: loc = _bin_search(values, val) # .searchsorted(val, side='left') if loc >= len(values): raise KeyError(val) - if util.get_value_at(values, loc) != val: + if 
values[loc] != val: raise KeyError(val) return loc @@ -281,7 +286,7 @@ cdef class IndexEngine: cdef: ndarray values, x ndarray[int64_t] result, missing - set stargets + set stargets, remaining_stargets dict d = {} object val int count = 0, count_missing = 0 @@ -304,12 +309,20 @@ cdef class IndexEngine: if stargets and len(stargets) < 5 and self.is_monotonic_increasing: # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget + remaining_stargets = set() for starget in stargets: - start = values.searchsorted(starget, side='left') - end = values.searchsorted(starget, side='right') - if start != end: - d[starget] = list(range(start, end)) - else: + try: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + except TypeError: # e.g. if we tried to search for string in int array + remaining_stargets.add(starget) + else: + if start != end: + d[starget] = list(range(start, end)) + + stargets = remaining_stargets + + if stargets: # otherwise, map by iterating through all items in the index for i in range(n): val = values[i] @@ -352,22 +365,22 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval - if hi == 0 or (hi > 0 and val > util.get_value_at(values, hi)): + if hi == 0 or (hi > 0 and val > values[hi]): return len(values) while lo < hi: mid = (lo + hi) // 2 - pval = util.get_value_at(values, mid) + pval = values[mid] if val < pval: hi = mid elif val > pval: lo = mid + 1 else: - while mid > 0 and val == util.get_value_at(values, mid - 1): + while mid > 0 and val == values[mid - 1]: mid -= 1 return mid - if val <= util.get_value_at(values, mid): + if val <= values[mid]: return mid else: return mid + 1 @@ -387,13 +400,16 @@ cdef class DatetimeEngine(Int64Engine): return 'M8[ns]' def __contains__(self, object val): + cdef: + int64_t loc + if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(val) values = self._get_index_values() conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') - return util.get_value_at(values, loc) == conv + return values[loc] == conv self._ensure_mapping_populated() return maybe_datetimelike_to_i8(val) in self.mapping @@ -405,6 +421,8 @@ cdef class DatetimeEngine(Int64Engine): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): + cdef: + int64_t loc if is_definitely_invalid_key(val): raise TypeError @@ -422,7 +440,7 @@ cdef class DatetimeEngine(Int64Engine): self._date_check_type(val) raise KeyError(val) - if loc == len(values) or util.get_value_at(values, loc) != conv: + if loc == len(values) or values[loc] != conv: raise KeyError(val) return loc diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index ac713a928973f..08bfaf21db9fb 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -158,7 +158,7 @@ cdef class IntervalTree(IntervalMixin): # TODO: write get_indexer_intervals cdef: - size_t old_len + Py_ssize_t old_len Py_ssize_t i Int64Vector result @@ -179,7 +179,7 @@ cdef class IntervalTree(IntervalMixin): the given array of scalar targets. Non-unique positions are repeated. 
""" cdef: - size_t old_len + Py_ssize_t old_len Py_ssize_t i Int64Vector result, missing diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1c2f80b832201..b13246a4a969c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -137,8 +137,8 @@ def is_scalar(val: object) -> bool: Examples -------- - >>> dt = pd.datetime.datetime(2018, 10, 3) - >>> pd.is_scalar(dt) + >>> dt = datetime.datetime(2018, 10, 3) + >>> pd.api.types.is_scalar(dt) True >>> pd.api.types.is_scalar([2, 3]) @@ -782,8 +782,16 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups): return starts, ends -def indices_fast(object index, const int64_t[:] labels, list keys, +def indices_fast(ndarray index, const int64_t[:] labels, list keys, list sorted_labels): + """ + Parameters + ---------- + index : ndarray + labels : ndarray[int64] + keys : list + sorted_labels : list[ndarray[int64]] + """ cdef: Py_ssize_t i, j, k, lab, cur, start, n = len(labels) dict result = {} @@ -803,8 +811,7 @@ def indices_fast(object index, const int64_t[:] labels, list keys, if lab != -1: tup = PyTuple_New(k) for j in range(k): - val = util.get_value_at(keys[j], - sorted_labels[j][i - 1]) + val = keys[j][sorted_labels[j][i - 1]] PyTuple_SET_ITEM(tup, j, val) Py_INCREF(val) @@ -814,8 +821,7 @@ def indices_fast(object index, const int64_t[:] labels, list keys, tup = PyTuple_New(k) for j in range(k): - val = util.get_value_at(keys[j], - sorted_labels[j][n - 1]) + val = keys[j][sorted_labels[j][n - 1]] PyTuple_SET_ITEM(tup, j, val) Py_INCREF(val) result[tup] = index[start:] @@ -2066,7 +2072,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, floats[i] = float(val) complexes[i] = complex(val) seen.float_ = 1 - except Exception: + except (ValueError, TypeError): seen.object_ = 1 break else: @@ -2346,7 +2352,8 @@ def to_object_array_tuples(rows: object): row = rows[i] for j in range(len(row)): result[i, j] = row[j] - except Exception: + except TypeError: + # e.g. 
"Expected tuple, got list" # upcast any subclasses to tuple for i in range(n): row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 34eb9412451c5..0eac0e94f0beb 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -121,7 +121,7 @@ cdef class Reducer: for i in range(self.nresults): if has_ndarray_labels: - name = util.get_value_at(labels, i) + name = labels[i] elif has_labels: # labels is an ExtensionArray name = labels[i] diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 33665484311ba..bf0a0ae5a3fe9 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -581,7 +581,7 @@ def try_parse_dates(object[:] values, parser=None, else: result[i] = parse_date(values[i]) except Exception: - # failed + # Since parser is user-defined, we can't guess what it migh raise return values else: parse_date = parser diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index cbfbc14c35b35..bc1fdfae99de9 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -226,11 +226,8 @@ cdef object get_dst_info(object tz): if treat_tz_as_pytz(tz): trans = np.array(tz._utc_transition_times, dtype='M8[ns]') trans = trans.view('i8') - try: - if tz._utc_transition_times[0].year == 1: - trans[0] = NPY_NAT + 1 - except Exception: - pass + if tz._utc_transition_times[0].year == 1: + trans[0] = NPY_NAT + 1 deltas = unbox_utcoffsets(tz._transition_info) typ = 'pytz' diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index ce56c08d3ec14..402ed62f2df65 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,6 +12,7 @@ _np_version_under1p15 = _nlv < LooseVersion("1.15") _np_version_under1p16 = _nlv < LooseVersion("1.16") _np_version_under1p17 = _nlv < LooseVersion("1.17") +_np_version_under1p18 = _nlv < LooseVersion("1.18") _is_numpy_dev = ".dev" in str(_nlv) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index bce6c352ce480..fc60c01d7b808 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -4,7 +4,7 @@ that can be mixed into or pinned onto other pandas classes. 
""" -from typing import Set +from typing import FrozenSet, Set import warnings from pandas.util._decorators import Appender @@ -12,9 +12,7 @@ class DirNamesMixin: _accessors = set() # type: Set[str] - _deprecations = frozenset( - ["asobject", "base", "data", "flags", "itemsize", "strides"] - ) + _deprecations = frozenset() # type: FrozenSet[str] def _dir_deletions(self): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e5ab0d182aff..717c2eb26be8b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1304,7 +1304,7 @@ def get_indexer(current_indexer, other_indexer): return frame.sort_values(columns, ascending=ascending, kind="mergesort") -# ------- ## ---- # +# ---- # # take # # ---- # @@ -1712,59 +1712,44 @@ def take_nd( take_1d = take_nd -def take_2d_multi( - arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True -): +def take_2d_multi(arr, indexer, fill_value=np.nan): """ Specialized Cython take which sets NaN values in one pass """ - if indexer is None or (indexer[0] is None and indexer[1] is None): - row_idx = np.arange(arr.shape[0], dtype=np.int64) - col_idx = np.arange(arr.shape[1], dtype=np.int64) - indexer = row_idx, col_idx - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - row_idx, col_idx = indexer - if row_idx is None: - row_idx = np.arange(arr.shape[0], dtype=np.int64) - else: - row_idx = ensure_int64(row_idx) - if col_idx is None: - col_idx = np.arange(arr.shape[1], dtype=np.int64) - else: - col_idx = ensure_int64(col_idx) - indexer = row_idx, col_idx - if not allow_fill: + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. + assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - if mask_info is not None: - (row_mask, col_mask), (row_needs, col_needs) = mask_info - else: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - mask_info = (row_mask, col_mask), (row_needs, col_needs) - if row_needs or col_needs: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() # at this point, 
it's guaranteed that dtype can hold both the arr values # and the fill_value - if out is None: - out_shape = len(row_idx), len(col_idx) - out = np.empty(out_shape, dtype=dtype) + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) if func is None and arr.dtype != out.dtype: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 91f3e878c3807..f402154dc91ca 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,5 +1,4 @@ import inspect -import warnings import numpy as np @@ -21,9 +20,7 @@ def frame_apply( obj, func, axis=0, - broadcast=None, raw=False, - reduce=None, result_type=None, ignore_failures=False, args=None, @@ -40,9 +37,7 @@ def frame_apply( return klass( obj, func, - broadcast=broadcast, raw=raw, - reduce=reduce, result_type=result_type, ignore_failures=ignore_failures, args=args, @@ -51,18 +46,7 @@ def frame_apply( class FrameApply: - def __init__( - self, - obj, - func, - broadcast, - raw, - reduce, - result_type, - ignore_failures, - args, - kwds, - ): + def __init__(self, obj, func, raw, result_type, ignore_failures, args, kwds): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures @@ -75,34 +59,6 @@ def __init__( "of {None, 'reduce', 'broadcast', 'expand'}" ) - if broadcast is not None: - warnings.warn( - "The broadcast argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='broadcast' to broadcast the result " - "to the original dimensions", - FutureWarning, - stacklevel=4, - ) - if broadcast: - result_type = "broadcast" - - if reduce is not None: - warnings.warn( - "The reduce argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='reduce' to try to reduce the result " - "to the original dimensions", - FutureWarning, - stacklevel=4, - ) - if reduce: - - if result_type is not None: - raise ValueError("cannot pass both reduce=True and result_type") - - result_type = "reduce" - self.result_type = result_type # curry if needed diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7a16c3f6a35b6..53755695c97e3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -474,7 +474,7 @@ def fillna(self, value=None, method=None, limit=None): method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + backfill / bfill: use NEXT valid observation to fill gap. limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -485,7 +485,8 @@ def fillna(self, value=None, method=None, limit=None): Returns ------- - filled : ExtensionArray with NA/NaN filled + ExtensionArray + With NA/NaN filled. """ value, method = validate_fillna_kwargs(value, method) @@ -539,13 +540,14 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArra fill_value : object, optional The scalar value to use for newly introduced missing values. - The default is ``self.dtype.na_value`` + The default is ``self.dtype.na_value``. .. versionadded:: 0.24.0 Returns ------- - shifted : ExtensionArray + ExtensionArray + Shifted. 
Notes ----- @@ -869,11 +871,12 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: Parameters ---------- dtype : str, np.dtype, or ExtensionDtype, optional - Default None + Default None. Returns ------- ExtensionArray + A view of the :class:`ExtensionArray`. """ # NB: # - This must return a *new* object referencing the same data, not self. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ea19808b19fc9..795986127cde7 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -295,7 +295,7 @@ class Categorical(ExtensionArray, PandasObject): See Also -------- - api.types.CategoricalDtype : Type for categorical data. + CategoricalDtype : Type for categorical data. CategoricalIndex : An Index with an underlying ``Categorical``. Notes @@ -331,7 +331,9 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset(["tolist", "get_values"]) + _deprecations = PandasObject._deprecations | frozenset( + ["tolist", "itemsize", "get_values"] + ) _typ = "categorical" def __init__( diff --git a/pandas/core/base.py b/pandas/core/base.py index e4e14a950c96b..5ae3926952a67 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins from collections import OrderedDict import textwrap -from typing import Dict, Optional +from typing import Dict, FrozenSet, Optional import warnings import numpy as np @@ -267,7 +267,7 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - def _try_aggregate_string_function(self, arg, *args, **kwargs): + def _try_aggregate_string_function(self, arg: str, *args, **kwargs): """ if arg is a string, then try to operate on it: - try to find a function (or attribute) on ourselves @@ -292,12 +292,10 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): f = getattr(np, arg, None) if f is not None: - try: + if hasattr(self, "__array__"): + # in particular exclude Window return f(self, *args, **kwargs) - except (AttributeError, TypeError): - pass - raise AttributeError( "'{arg}' is not a valid function for " "'{cls}' object".format(arg=arg, cls=type(self).__name__) @@ -653,7 +651,17 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 - _deprecations = frozenset(["item"]) + _deprecations = frozenset( + [ + "tolist", # tolist is not deprecated, just suppressed in the __dir__ + "base", + "data", + "item", + "itemsize", + "flags", + "strides", + ] + ) # type: FrozenSet[str] def transpose(self, *args, **kwargs): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79e941f262931..7880acb1b78da 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,7 +14,18 @@ import itertools import sys from textwrap import dedent -from typing import FrozenSet, List, Optional, Sequence, Set, Tuple, Type, Union +from typing import ( + FrozenSet, + Hashable, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, +) import warnings import numpy as np @@ -861,7 +872,7 @@ def style(self): """ @Appender(_shared_docs["items"]) - def items(self): + def items(self) -> Iterable[Tuple[Hashable, Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) @@ -2207,6 +2218,7 @@ def to_html( border=None, table_id=None, render_links=False, + encoding=None, ): """ Render a 
DataFrame as an HTML table. @@ -2222,6 +2234,10 @@ def to_html( border : int A ``border=border`` attribute is included in the opening `` tag. Default ``pd.options.display.html.border``. + encoding : str, default "utf-8" + Set character encoding + + .. versionadded:: 1.0 table_id : str, optional A css id is included in the opening `
` tag if specified. @@ -2263,7 +2279,11 @@ def to_html( ) # TODO: a generic formatter wld b in DataFrameFormatter return formatter.to_html( - buf=buf, classes=classes, notebook=notebook, border=border + buf=buf, + classes=classes, + notebook=notebook, + border=border, + encoding=encoding, ) # ---------------------------------------------------------------------- @@ -6628,15 +6648,7 @@ def transform(self, func, axis=0, *args, **kwargs): return super().transform(func, *args, **kwargs) def apply( - self, - func, - axis=0, - broadcast=None, - raw=False, - reduce=None, - result_type=None, - args=(), - **kwds + self, func, axis=0, raw=False, reduce=None, result_type=None, args=(), **kwds ): """ Apply a function along an axis of the DataFrame. @@ -6656,21 +6668,9 @@ def apply( * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. - broadcast : bool, optional - Only relevant for aggregation functions: - - * ``False`` or ``None`` : returns a Series whose length is the - length of the index or the number of columns (based on the - `axis` parameter) - * ``True`` : results will be broadcast to the original shape - of the frame, the original index and columns will be retained. - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by result_type='broadcast'. raw : bool, default False - Determines if row or column is passed as a Series or ndarry object: + Determines if row or column is passed as a Series or ndarray object: * ``False`` : passes each row or column as a Series to the function. @@ -6678,20 +6678,6 @@ def apply( instead. If you are just applying a NumPy reduction function this will achieve much better performance. - reduce : bool or None, default None - Try to apply reduction procedures. If the DataFrame is empty, - `apply` will use `reduce` to determine whether the result - should be a Series or a DataFrame. If ``reduce=None`` (the - default), `apply`'s return value will be guessed by calling - `func` on an empty Series - (note: while guessing, exceptions raised by `func` will be - ignored). - If ``reduce=True`` a Series will always be returned, and if - ``reduce=False`` a DataFrame will always be returned. - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by ``result_type='reduce'``. 
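# A brief sketch of the result_type spellings that take over from the removed
# broadcast/reduce keywords, as the old deprecation messages suggested. The
# frame contents below are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# formerly apply(..., broadcast=True): the row-wise reduction is broadcast
# back to the original shape
df.apply(np.sum, axis=1, result_type="broadcast")

# formerly apply(..., reduce=True): force a Series result even for an
# empty frame
pd.DataFrame(columns=["a", "b"]).apply(np.sum, axis=1, result_type="reduce")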
result_type : {'expand', 'reduce', 'broadcast', None}, default None These only act when ``axis=1`` (columns): @@ -6805,9 +6791,7 @@ def apply( self, func=func, axis=axis, - broadcast=broadcast, raw=raw, - reduce=reduce, result_type=result_type, args=args, kwds=kwds, @@ -7772,7 +7756,8 @@ def _count_level(self, level, axis=0, numeric_only=False): if isinstance(level, str): level = count_axis._get_level_number(level) - level_index = count_axis.levels[level] + level_name = count_axis._names[level] + level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e97772a418982..e3e59639de56b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4606,7 +4606,7 @@ def _needs_reindex_multi(self, axes, method, level): ) def _reindex_multi(self, axes, copy, fill_value): - return NotImplemented + raise AbstractMethodError(self) def _reindex_with_indexers( self, reindexers, fill_value=None, copy=False, allow_dups=False diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 598f13de309b4..8191c3519a36a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -11,7 +11,17 @@ from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Sequence, Type, Union +from typing import ( + Any, + Callable, + FrozenSet, + Hashable, + Iterable, + Sequence, + Tuple, + Type, + Union, +) import warnings import numpy as np @@ -53,7 +63,6 @@ _transform_template, groupby, ) -from pandas.core.groupby.ops import BinGrouper from pandas.core.index import Index, MultiIndex, _all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -133,7 +142,7 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self): + def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: yield self._selection_name, self._selected_obj @property @@ -252,6 +261,8 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) + except AssertionError: + raise except Exception: result = self._aggregate_named(func, *args, **kwargs) @@ -640,12 +651,7 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - if isinstance(self.grouper, BinGrouper) and ( - len(self.grouper.binlabels) != len(self.grouper.indices) - ): - labels = list(map(rep, [np.unique(ids)])) + [llab(lab, inc)] - else: - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] @@ -883,7 +889,23 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - except Exception: + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + except NotImplementedError as err: + if "axis other than 0 is not supported" in str(err): + # raised directly by _aggregate_multiple_funcs + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for DecimalArray 
tests + pass + else: + raise + # FIXME: this is raised in a bunch of + # test_whitelist.test_regression_whitelist_methods tests, + # can be avoided result = self._aggregate_frame(func) else: result.columns = Index( @@ -904,22 +926,20 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self): - if self.axis == 0: - # kludge - if self._selection is None: - slice_axis = self.obj.columns - else: - slice_axis = self._selection_list - slicer = lambda x: self.obj[x] + def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: + obj = self._selected_obj + if self.axis == 1: + obj = obj.T + + if isinstance(obj, Series) and obj.name not in self.exclusions: + # Occurs when doing DataFrameGroupBy(...)["X"] + yield obj.name, obj else: - slice_axis = self.obj.index - slicer = self.obj.xs + for label, values in obj.items(): + if label in self.exclusions: + continue - for val in slice_axis: - if val in self.exclusions: - continue - yield val, slicer(val) + yield label, values def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): new_items, new_blocks = self._cython_agg_blocks( @@ -958,11 +978,17 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): if alt is None: # we cannot perform the operation # in an alternate way, exclude the block + assert how == "ohlc" deleted_items.append(locs) continue # call our grouper again with only this block obj = self.obj[data.items[locs]] + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + s = groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) @@ -971,17 +997,29 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # continue and exclude the block deleted_items.append(locs) continue + + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if result.ndim == 1 and isinstance(result, np.ndarray): + result = result.reshape(1, -1) + finally: + assert not isinstance(result, DataFrame) + if result is not no_result: # see if we can cast the block back to the original dtype result = maybe_downcast_numeric(result, block.dtype) - if result.ndim == 1 and isinstance(result, np.ndarray): + if block.is_extension and isinstance(result, np.ndarray): # e.g. 
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible result = type(block.values)._from_sequence( - result, dtype=block.values.dtype + result.ravel(), dtype=block.values.dtype ) except ValueError: # reshape to be valid for non-Extension Block @@ -1031,17 +1069,24 @@ def _aggregate_frame(self, func, *args, **kwargs): if axis != obj._info_axis_number: try: for name, data in self: - result[name] = self._try_cast(func(data, *args, **kwargs), data) + fres = func(data, *args, **kwargs) + result[name] = self._try_cast(fres, data) + except AssertionError: + raise except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: for name in self.indices: + data = self.get_group(name, obj=obj) try: - data = self.get_group(name, obj=obj) - result[name] = self._try_cast(func(data, *args, **kwargs), data) + fres = func(data, *args, **kwargs) + except AssertionError: + raise except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) + else: + result[name] = self._try_cast(fres, data) return self._wrap_frame_output(result, obj) @@ -1392,6 +1437,8 @@ def _choose_path(self, fast_path, slow_path, group): # if we make it here, test if we can use the fast path try: res_fast = fast_path(group) + except AssertionError: + raise except Exception: # Hard to know ex-ante what exceptions `fast_path` might raise return path, res @@ -1416,9 +1463,12 @@ def _transform_item_by_item(self, obj, wrapper): for i, col in enumerate(obj): try: output[col] = self[col].transform(wrapper) - inds.append(i) + except AssertionError: + raise except Exception: pass + else: + inds.append(i) if len(output) == 0: raise TypeError("Transform function invalid for data types") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cc297629a7004..b27d5bb05ee8f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,7 +14,7 @@ class providing the base-class of operations. import inspect import re import types -from typing import FrozenSet, List, Optional, Tuple, Type, Union +from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -44,13 +44,7 @@ class providing the base-class of operations. 
from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical -from pandas.core.base import ( - DataError, - GroupByError, - PandasObject, - SelectionMixin, - SpecificationError, -) +from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import DataFrame @@ -598,14 +592,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) def _make_wrapper(self, name): - if name not in self._apply_whitelist: - is_callable = callable(getattr(self._selected_obj, name, None)) - kind = " callable " if is_callable else " " - msg = ( - "Cannot access{0}attribute {1!r} of {2!r} objects, try " - "using the 'apply' method".format(kind, name, type(self).__name__) - ) - raise AttributeError(msg) + assert name in self._apply_whitelist self._set_group_selection() @@ -758,7 +745,7 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self): + def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): @@ -869,8 +856,6 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue - except AssertionError as e: - raise GroupByError(str(e)) if self._transform_should_cast(how): output[name] = self._try_cast(result, obj) else: @@ -897,12 +882,7 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): if numeric_only and not is_numeric: continue - try: - result, names = self.grouper.aggregate( - obj.values, how, min_count=min_count - ) - except AssertionError as e: - raise GroupByError(str(e)) + result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) output[name] = self._try_cast(result, obj) if len(output) == 0: @@ -919,9 +899,10 @@ def _python_agg_general(self, func, *args, **kwargs): for name, obj in self._iterate_slices(): try: result, counts = self.grouper.agg_series(obj, f) - output[name] = self._try_cast(result, obj, numeric_only=True) except TypeError: continue + else: + output[name] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -1359,10 +1340,18 @@ def f(self, **kwargs): # try a cython aggregation if we can try: return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError as e: - raise SpecificationError(str(e)) - except Exception: + except DataError: pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for test_decimal:test_in_numeric_groupby + pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 40517eefe4d5d..e4335d39929b3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -526,7 +526,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = ensure_float64(values) + try: + values = 
ensure_float64(values) + except TypeError: + if lib.infer_dtype(values, skipna=False) == "complex": + values = values.astype(complex) + else: + raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise @@ -647,6 +653,8 @@ def _transform( def agg_series(self, obj, func): try: return self._aggregate_series_fast(obj, func) + except AssertionError: + raise except Exception: return self._aggregate_series_pure_python(obj, func) @@ -825,6 +833,14 @@ def levels(self): def names(self): return [self.binlabels.name] + @property + def recons_labels(self): + comp_ids, obs_ids, _ = self.group_info + if len(self.binlabels) != len(self.indices): + return [np.unique(comp_ids)] + labels = (ping.labels for ping in self.groupings) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) + @property def groupings(self): from pandas.core.groupby.grouper import Grouping diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7dee3a17f8f9e..1a08609ccd99a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Union +from typing import FrozenSet, Union import warnings import numpy as np @@ -63,7 +63,7 @@ from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core import ops -from pandas.core.accessor import CachedAccessor, DirNamesMixin +from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject @@ -206,10 +206,10 @@ class Index(IndexOpsMixin, PandasObject): # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = ( - IndexOpsMixin._deprecations - | DirNamesMixin._deprecations - | frozenset(["tolist", "contains", "dtype_str", "get_values", "set_value"]) - ) + PandasObject._deprecations + | IndexOpsMixin._deprecations + | frozenset(["asobject", "contains", "dtype_str", "get_values", "set_value"]) + ) # type: FrozenSet[str] # To hand over control to subclasses _join_precedence = 1 @@ -2493,8 +2493,12 @@ def _union(self, other, sort): value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + # find indexes of things in "other" that are not in "self" + if self.is_unique: + indexer = self.get_indexer(other) + indexer = (indexer == -1).nonzero()[0] + else: + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) @@ -3138,16 +3142,7 @@ def is_int(v): elif is_positional: indexer = key else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + indexer = self.slice_indexer(start, stop, step, kind=kind) return indexer @@ -4672,11 +4667,11 @@ def get_value(self, series, key): raise InvalidIndexError(key) else: raise e1 - except Exception: # pragma: no cover + except Exception: raise e1 except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover + # e.g. 
"[False] is an invalid key" + if is_scalar(key): raise IndexError(key) raise InvalidIndexError(key) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b538c4df00e19..e5a8edb56e413 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - def _can_reindex(self, indexer): - """ always allow reindexing """ - pass - @Substitution(klass="CategoricalIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): @@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): Indices of output values in original index """ - if method is not None: raise NotImplementedError( "argument method is not implemented for CategoricalIndex.reindex" @@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): indexer = None missing = [] else: - if not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") - indexer, missing = self.get_indexer_non_unique(np.array(target)) if len(self.codes) and indexer is not None: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 596eaf0c55dbd..fda5c78a61e53 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -274,6 +274,7 @@ def __new__( result._set_levels(levels, copy=copy, validate=False) result._set_codes(codes, copy=copy, validate=False) + result._names = [None] * len(levels) if names is not None: # handles name validation result._set_names(names) @@ -638,7 +639,10 @@ def from_frame(cls, df, sortorder=None, names=None): @property def levels(self): - return self._levels + result = [ + x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) + ] + return FrozenList(result) @property def _values(self): @@ -829,7 +833,7 @@ def _set_codes( if level is None: new_codes = FrozenList( _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() - for lev, level_codes in zip(self.levels, codes) + for lev, level_codes in zip(self._levels, codes) ) else: level = [self._get_level_number(l) for l in level] @@ -1216,7 +1220,7 @@ def __len__(self): return len(self.codes[0]) def _get_names(self): - return FrozenList(level.name for level in self.levels) + return FrozenList(self._names) def _set_names(self, names, level=None, validate=True): """ @@ -1262,7 +1266,7 @@ def _set_names(self, names, level=None, validate=True): level = [self._get_level_number(l) for l in level] # set the name - for l, name in zip(level, names): + for lev, name in zip(level, names): if name is not None: # GH 20527 # All items in 'names' need to be hashable: @@ -1272,7 +1276,7 @@ def _set_names(self, names, level=None, validate=True): self.__class__.__name__ ) ) - self.levels[l].rename(name, inplace=True) + self._names[lev] = name names = property( fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" @@ -1582,13 +1586,13 @@ def _get_level_values(self, level, unique=False): values : ndarray """ - values = self.levels[level] + lev = self.levels[level] level_codes = self.codes[level] + name = self._names[level] if unique: level_codes = algos.unique(level_codes) - filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) - values = values._shallow_copy(filled) - return values + filled = algos.take_1d(lev._values, level_codes, fill_value=lev._na_value) + 
return lev._shallow_copy(filled, name=name) def get_level_values(self, level): """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0fc74f4e78c9f..f085dff84462d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -457,7 +457,11 @@ def __contains__(self, key): try: self.get_loc(key) return True - except Exception: + except (ValueError, TypeError, KeyError): + # TypeError can be reached if we pass a tuple that is not hashable + # ValueError can be reached if pass a 2-tuple and parse_time_string + # raises with the wrong number of return values + # TODO: the latter is a bug in parse_time_string return False @cache_readonly @@ -765,7 +769,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == "left" else 1] - except Exception: + except ValueError: + # string cannot be parsed as datetime-like + # TODO: we need tests for this case raise KeyError(label) elif is_integer(label) or is_float(label): self._invalid_indexer("slice", label) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 755992c881fe5..62a74fefa6577 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -630,7 +630,8 @@ def insert(self, loc, item): if _is_convertible_to_td(item): try: item = Timedelta(item) - except Exception: + except ValueError: + # e.g. str that can't be parsed to timedelta pass elif is_scalar(item) and isna(item): # GH 18295 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b76cb5cbec626..1495be1f26df5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -687,7 +687,6 @@ def _try_coerce_args(self, other): def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ - values = self.get_values() if slicer is not None: @@ -1783,6 +1782,23 @@ def get_values(self, dtype=None): def to_dense(self): return np.asarray(self.values) + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + """override to use ExtensionArray astype for the conversion""" + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + + try: + values = values.astype(str) + values[mask] = na_rep + except Exception: + # eg SparseArray does not support setitem, needs to be converted to ndarray + return super().to_native_types(slicer, na_rep, quoting, **kwargs) + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ Take values according to indexer and return them as a block. 
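The ExtensionBlock.to_native_types override above routes to_csv output through the ExtensionArray's own astype(str), replacing missing values with na_rep, and only falls back to the generic object-array path when the array does not support item assignment (e.g. SparseArray, as the comment notes). A rough usage sketch, assuming a build with this patch and the nullable Int64 extension dtype (rather than the ListArray test array):

import pandas as pd

# Hypothetical illustration: an extension-backed column is written out via
# the array's own string conversion; missing values appear as the default
# empty na_rep used by to_csv.
df = pd.DataFrame({"a": pd.array([1, None, 3], dtype="Int64")})
print(df.to_csv(index=False))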
@@ -2265,6 +2281,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_extension = True _can_hold_element = DatetimeBlock._can_hold_element + to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") @property diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 545bc21dd6d1b..d4ae3767f6157 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos +from pandas.core.base import DataError from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy @@ -360,7 +361,25 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except Exception: + except AssertionError: + raise + except DataError: + # we have a non-reducing function; try to evaluate + result = grouped.apply(how, *args, **kwargs) + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + pass + elif "len(index) != len(labels)" in str(err): + # raised in libgroupby validation + pass + elif "No objects to concatenate" in str(err): + # raised in concat call + # In tests this is reached via either + # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique + pass + else: + raise # we have a non-reducing function # try to evaluate diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e654685d24d9d..340e964d7c14f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -259,10 +259,10 @@ def get_new_values(self): def get_new_columns(self): if self.value_columns is None: if self.lift == 0: - return self.removed_level + return self.removed_level._shallow_copy(name=self.removed_name) - lev = self.removed_level - return lev.insert(0, lev._na_value) + lev = self.removed_level.insert(0, item=self.removed_level._na_value) + return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift width = len(self.value_columns) @@ -298,10 +298,10 @@ def get_new_index(self): # construct the new index if len(self.new_index_levels) == 1: - lev, lab = self.new_index_levels[0], result_codes[0] - if (lab == -1).any(): - lev = lev.insert(len(lev), lev._na_value) - return lev.take(lab) + level, level_codes = self.new_index_levels[0], result_codes[0] + if (level_codes == -1).any(): + level = level.insert(len(level), level._na_value) + return level.take(level_codes).rename(self.new_index_names[0]) return MultiIndex( levels=self.new_index_levels, @@ -661,7 +661,8 @@ def _convert_level_number(level_num, columns): new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: - new_columns = unique_groups = this.columns.levels[0] + new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) + unique_groups = new_columns # time to ravel the values new_data = {} diff --git a/pandas/core/series.py b/pandas/core/series.py index 539a09f7046ac..1039e9af929d4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,7 +54,7 @@ import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops -from pandas.core.accessor import CachedAccessor, DirNamesMixin +from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray from 
pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor @@ -178,10 +178,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _deprecations = ( base.IndexOpsMixin._deprecations | generic.NDFrame._deprecations - | DirNamesMixin._deprecations | frozenset( [ - "tolist", # tolist is not deprecated, just suppressed in the __dir__ "asobject", "compress", "valid", diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e6edad656d430..6d80cf8c697d6 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -162,7 +162,6 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): xnull: boolean, if nulls are excluded; i.e. -1 labels are passed through """ - if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") shape = np.asarray(shape, dtype="i8") + lift @@ -303,8 +302,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): def get_indexer_dict(label_list, keys): - """ return a diction of {labels} -> {indexers} """ - shape = list(map(len, keys)) + """ return a dict of {labels} -> {indexers} """ + shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) ngroups = ( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ad62c56a337b6..b8c40e3f62221 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -942,6 +942,7 @@ def _format_col(self, i: int) -> List[str]: def to_html( self, buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, classes: Optional[Union[str, List, Tuple]] = None, notebook: bool = False, border: Optional[int] = None, @@ -963,7 +964,9 @@ def to_html( from pandas.io.formats.html import HTMLFormatter, NotebookFormatter Klass = NotebookFormatter if notebook else HTMLFormatter - return Klass(self, classes=classes, border=border).get_result(buf=buf) + return Klass(self, classes=classes, border=border).get_result( + buf=buf, encoding=encoding + ) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: from pandas.core.index import _sparsify diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 9016e8a98e5ba..1e27421a55499 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -243,8 +243,10 @@ def build_table_schema(data, index=True, primary_key=None, version=True): if index: if data.index.nlevels > 1: - for level in data.index.levels: - fields.append(convert_pandas_type_to_json_field(level)) + for level, name in zip(data.index.levels, data.index.names): + new_field = convert_pandas_type_to_json_field(level) + new_field["name"] = name + fields.append(new_field) else: fields.append(convert_pandas_type_to_json_field(data.index)) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b6ffd8a83e409..4d40cd3a2d4ca 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2,6 +2,7 @@ from functools import reduce from itertools import product import operator +from typing import Dict, Type import warnings import numpy as np @@ -19,7 +20,11 @@ from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr -from pandas.core.computation.expr import PandasExprVisitor, PythonExprVisitor +from pandas.core.computation.expr import ( + BaseExprVisitor, + PandasExprVisitor, + 
PythonExprVisitor, +) from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR from pandas.core.computation.ops import ( _arith_ops_syms, @@ -1884,7 +1889,7 @@ def test_invalid_parser(): "python": PythonExprVisitor, "pytables": pytables.ExprVisitor, "pandas": PandasExprVisitor, -} +} # type: Dict[str, Type[BaseExprVisitor]] @pytest.mark.parametrize("engine", _engines) diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py new file mode 100644 index 0000000000000..108f1937d07d3 --- /dev/null +++ b/pandas/tests/extension/list/__init__.py @@ -0,0 +1,3 @@ +from .array import ListArray, ListDtype, make_data + +__all__ = ["ListArray", "ListDtype", "make_data"] diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py new file mode 100644 index 0000000000000..0ca9fadb68829 --- /dev/null +++ b/pandas/tests/extension/list/array.py @@ -0,0 +1,133 @@ +""" +Test extension array for storing nested data in a pandas container. + +The ListArray stores an ndarray of lists. +""" +import numbers +import random +import string + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + + +class ListDtype(ExtensionDtype): + type = list + name = "list" + na_value = np.nan + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ListArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + + +class ListArray(ExtensionArray): + dtype = ListDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False): + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array as values") + for val in values: + if not isinstance(val, self.dtype.type) and not pd.isna(val): + raise TypeError("All values must be of type " + str(self.dtype.type)) + self.data = values + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + data = np.empty(len(scalars), dtype=object) + data[:] = scalars + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + return type(self)(self.data[item]) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.array( + [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool + ) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." 
+ ) + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError: + raise IndexError(msg) + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + + return self._from_sequence(output) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype( + dtype + ): + # numpy has problems with astype(str) for nested elements + return np.array([str(x) for x in self.data], dtype=dtype) + return np.array(self.data, dtype=dtype, copy=copy) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + return cls(data) + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + data = np.empty(100, dtype=object) + data[:] = [ + [random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))] + for _ in range(100) + ] + return data diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py new file mode 100644 index 0000000000000..c5c4417155562 --- /dev/null +++ b/pandas/tests/extension/list/test_list.py @@ -0,0 +1,30 @@ +import pytest + +import pandas as pd + +from .array import ListArray, ListDtype, make_data + + +@pytest.fixture +def dtype(): + return ListDtype() + + +@pytest.fixture +def data(): + """Length-100 ListArray for semantics test.""" + data = make_data() + + while len(data[0]) == len(data[1]): + data = make_data() + + return ListArray(data) + + +def test_to_csv(data): + # https://github.com/pandas-dev/pandas/issues/28840 + # array with list-likes fail when doing astype(str) on the numpy array + # which was done in to_native_types + df = pd.DataFrame({"a": data}) + res = df.to_csv() + assert str(data[0]) in res diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 0328232213588..fe034504b8161 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -137,13 +137,6 @@ def test_nunique_empty(self): expected = Series([], index=pd.Index([])) assert_series_equal(result, expected) - def test_apply_deprecate_reduce(self): - empty_frame = DataFrame() - - x = [] - with tm.assert_produces_warning(FutureWarning): - empty_frame.apply(x.append, axis=1, reduce=True) - def test_apply_standard_nonunique(self): df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) @@ -170,10 +163,6 @@ def test_apply_with_string_funcs(self, float_frame, func, args, kwds): expected = getattr(float_frame, func)(*args, **kwds) tm.assert_series_equal(result, expected) - def test_apply_broadcast_deprecated(self, float_frame): - with tm.assert_produces_warning(FutureWarning): - float_frame.apply(np.mean, broadcast=True) - def test_apply_broadcast(self, float_frame, int_frame_const_col): # scalars diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 3f0768ad5bdac..c9a7507969f5b 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -575,9 +575,9 @@ def test_frame_to_dict_tz(self): ), ), ( - defaultdict(list), + defaultdict(dict), defaultdict( - list, + dict, { 0: 
{"int_col": 1, "float_col": 1.0}, 1: {"int_col": 2, "float_col": 2.0}, diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 7e3cbed09c6d7..5dad868c8c3aa 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -19,7 +19,7 @@ date_range, period_range, ) -from pandas.core.groupby.groupby import SpecificationError +from pandas.core.base import SpecificationError import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2831c07cb21d3..5391cb5ce821f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas.compat import PY37, is_platform_windows +from pandas.compat import PY37 import pandas as pd from pandas import ( @@ -209,10 +209,9 @@ def test_level_get_group(observed): assert_frame_equal(result, expected) -# GH#21636 previously flaky on py37 -@pytest.mark.xfail( - is_platform_windows() and PY37, reason="Flaky, GH-27902", strict=False -) +# GH#21636 flaky on py37; may be related to older numpy, see discussion +# https://github.com/MacPython/pandas-wheels/pull/64 +@pytest.mark.xfail(PY37, reason="Flaky, GH-27902", strict=False) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 @@ -229,6 +228,9 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) + # GH#21636 tracking down the xfail, in some builds np.mean(df.loc[[0]]) + # is coming back as Series([0., 1., 0.], index=["missing", "dense", "values"]) + # when we expect Series(0., index=["values"]) result = grouped.apply(lambda x: np.mean(x)) assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index afb22a732691c..571e710ba8928 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -378,7 +378,7 @@ def test_median_empty_bins(observed): @pytest.mark.parametrize( - "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"] + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @pytest.mark.parametrize( "method,data", diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6212a37472000..dff5baa9b5984 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1944,3 +1944,13 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): result = getattr(grouped, op)() expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) assert_frame_equal(result, expected) + + +def test_groupby_only_none_group(): + # see GH21624 + # this was crashing with "ValueError: Length of passed values is 1, index implies 0" + df = pd.DataFrame({"g": [None], "x": 1}) + actual = df.groupby("g")["x"].transform("sum") + expected = pd.Series([np.nan], name="x") + + assert_series_equal(actual, expected) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 4adcdd0112b26..f320a89c471bf 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -11,7 +11,7 @@ def test_astype(idx): actual = idx.astype("O") assert_copy(actual.levels, 
expected.levels) assert_copy(actual.codes, expected.codes) - assert [level.name for level in actual.levels] == list(expected.names) + assert actual.names == list(expected.names) with pytest.raises(TypeError, match="^Setting.*dtype.*object"): idx.astype(np.dtype(int)) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 9472d539537ba..ff98da85cfb2d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -294,6 +294,7 @@ def test_from_arrays_empty(): assert isinstance(result, MultiIndex) expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] # N levels for N in [2, 3]: @@ -441,6 +442,7 @@ def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) expected = pd.Index([], name="A") tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 5856cb56b307b..5c3a48c9dd481 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -27,28 +27,25 @@ def test_index_name_retained(): def test_changing_names(idx): - - # names should be applied to levels - level_names = [level.name for level in idx.levels] - check_level_names(idx, idx.names) + assert [level.name for level in idx.levels] == ["first", "second"] view = idx.view() copy = idx.copy() shallow_copy = idx._shallow_copy() - # changing names should change level names on object + # changing names should not change level names on object new_names = [name + "a" for name in idx.names] idx.names = new_names - check_level_names(idx, new_names) + check_level_names(idx, ["firsta", "seconda"]) - # but not on copies - check_level_names(view, level_names) - check_level_names(copy, level_names) - check_level_names(shallow_copy, level_names) + # and not on copies + check_level_names(view, ["first", "second"]) + check_level_names(copy, ["first", "second"]) + check_level_names(shallow_copy, ["first", "second"]) # and copies shouldn't change original shallow_copy.names = [name + "c" for name in shallow_copy.names] - check_level_names(idx, new_names) + check_level_names(idx, ["firsta", "seconda"]) def test_take_preserve_name(idx): @@ -82,9 +79,9 @@ def test_copy_names(): def test_names(idx, index_names): # names are assigned in setup - names = index_names + assert index_names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert names == level_names + assert level_names == index_names # setting bad names on existing index = idx @@ -109,11 +106,10 @@ def test_names(idx, index_names): names=["first", "second", "third"], ) - # names are assigned + # names are assigned on index, but not transferred to the levels index.names = ["a", "b"] - ind_names = list(index.names) level_names = [level.name for level in index.levels] - assert ind_names == level_names + assert level_names == ["a", "b"] def test_duplicate_level_names_access_raises(idx): @@ -121,3 +117,10 @@ def test_duplicate_level_names_access_raises(idx): idx.names = ["foo", "foo"] with pytest.raises(ValueError, match="name foo occurs multiple times"): idx._get_level_number("foo") + + +def test_get_names_from_levels(): + idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + + assert idx.levels[0].name == "a" + assert idx.levels[1].name == "b" diff --git 
a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 88de4d1e80386..513efa8941de8 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -6,19 +6,17 @@ import pandas.util.testing as tm -def check_level_names(index, names): - assert [level.name for level in index.levels] == list(names) - - def test_reindex(idx): result, indexer = idx.reindex(list(idx[:4])) assert isinstance(result, MultiIndex) - check_level_names(result, idx[:4].names) + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == ["first", "second"] result, indexer = idx.reindex(list(idx)) assert isinstance(result, MultiIndex) assert indexer is None - check_level_names(result, idx.names) + assert result.names == ["first", "second"] + assert [level.name for level in result.levels] == ["first", "second"] def test_reindex_level(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index a30e6f33d1499..37df420e9ea2e 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -17,6 +17,7 @@ def test_insert(idx): exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) + assert new_index.names == ["first", "second"] exp1 = Index(list(idx.levels[1]) + ["three"], name="second") tm.assert_index_equal(new_index.levels[1], exp1) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 4326c3f8188fc..8ed7f1a890c39 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -599,15 +599,19 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): - # See GH23963 - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex(["a", "a", "c"]) + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex( - CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) - ) + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) def test_reindex_empty_index(self): # See GH16770 diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c365c985eb4b6..005a9a24dc597 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -561,26 +561,30 @@ def test_read_only_source(self): assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) def test_reindexing(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") # reindexing # convert to a regular index - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - 
).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["d"]) + result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) @@ -588,65 +592,58 @@ def test_reindexing(self): # then return a Categorical cats = list("cabe") - result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a"], categories=cats)) + result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex( - Categorical(["a", "d"], categories=cats, ordered=True) - ) + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - { - "A": [0, 1, 5, np.nan], - "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), - } + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed - msg = "cannot reindex with a non-unique indexer" + msg = 
"cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "a"]) + self.df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): - self.df2.reindex(["a"], method="ffill") + df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): - self.df2.reindex(["a"], level=1) + df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): - self.df2.reindex(["a"], limit=2) + df.reindex(["a"], limit=2) def test_loc_slice(self): # slicing diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 05b58b0eca9b8..4f38d7beb9c0b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1,4 +1,5 @@ import itertools +from typing import Dict, List import numpy as np import pytest @@ -928,7 +929,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep = {} + rep = {} # type: Dict[str, List] rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index ef19319e208d9..6c4a226b7ebd2 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -99,6 +99,14 @@ def test_to_html_unicode(df, expected, datapath): assert result == expected +def test_to_html_encoding(float_frame, tmp_path): + # GH 28663 + path = tmp_path / "test.html" + float_frame.to_html(path, encoding="gbk") + with open(str(path), "r", encoding="gbk") as f: + assert float_frame.to_html() == f.read() + + def test_to_html_decimal(datapath): # GH 12031 df = DataFrame({"A": [6.0, 3.1, 2.2]}) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index d6572ac7b7bfe..20e2690084e2a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1,10 +1,7 @@ -try: - import json -except ImportError: - import simplejson as json import calendar import datetime import decimal +import json import locale import math import re diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 2c347a096006a..183ad500b15f3 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,4 +1,5 @@ import os +from typing import List, Optional import pytest @@ -6,9 +7,9 @@ class BaseParser: - engine = None + engine = None # type: Optional[str] low_memory = True - float_precision_choices = [] + float_precision_choices = [] # type: List[Optional[str]] def update_kwargs(self, kwargs): kwargs = kwargs.copy() @@ -59,11 +60,11 @@ def csv1(csv_dir_path): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_all_parsers = _c_parsers_only + _py_parsers_only +_all_parsers = [*_c_parsers_only, *_py_parsers_only] _py_parser_ids = ["python"] _c_parser_ids = ["c_high", "c_low"] -_all_parser_ids = _c_parser_ids + _py_parser_ids +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7491cef17ebfc..183a47c6039ec 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -583,7 +583,7 @@ class _TestSQLApi(PandasSQLTest): """ flavor = "sqlite" - mode = None + 
mode = None # type: str def setup_connect(self): self.conn = self.connect() @@ -1234,7 +1234,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ - flavor = None + flavor = None # type: str @pytest.fixture(autouse=True, scope="class") def setup_class(cls): diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 6511d94aa4c09..41b1a88b15acb 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -9,7 +9,7 @@ import pandas dummy_backend = types.ModuleType("pandas_dummy_backend") -dummy_backend.plot = lambda *args, **kwargs: None +setattr(dummy_backend, "plot", lambda *args, **kwargs: None) @pytest.fixture diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 13f0f14014a31..eda7bc0ec4df7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1220,7 +1220,9 @@ def test_concat_keys_specific_levels(self): ) tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) - assert result.columns.names[0] == "group_key" + tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) + + assert result.columns.names == ["group_key", None] def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame( @@ -1409,7 +1411,7 @@ def test_concat_keys_and_levels(self): keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], names=["first", "second"], ) - assert result.index.names == ("first", "second") + (None,) + assert result.index.names == ("first", "second", None) tm.assert_index_equal( result.index.levels[0], Index(["baz", "foo"], name="first") ) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index e2c6f7d1c8feb..0b9392a0eeb5b 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -618,16 +618,15 @@ def test_reshaping_multi_index_categorical(self): df.index.names = ["major", "minor"] df["str"] = "foo" - dti = df.index.levels[0] - df["category"] = df["str"].astype("category") result = df["category"].unstack() + dti = df.index.levels[0] c = Categorical(["foo"] * len(dti)) expected = DataFrame( {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, columns=Index(list("ABCD"), name="minor"), - index=dti, + index=dti.rename("major"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d60cd3029e5a8..c8e1c04f3e3fb 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p18 import pandas.util._test_decorators as td import pandas as pd @@ -160,6 +161,9 @@ def test_cummax(self, datetime_series): tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummin_datetime64(self): s = pd.Series( pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) @@ -179,6 +183,9 @@ def test_cummin_datetime64(self): result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummax_datetime64(self): s = pd.Series( pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) @@ -198,6 +205,9 @@ def test_cummax_datetime64(self): result = 
s.cummax(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummin_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) @@ -213,6 +223,9 @@ def test_cummin_timedelta64(self): result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummax_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 467f2c177850a..6bfcc02ca633a 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops -from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( @@ -282,13 +281,27 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + def test_reversed_xor_with_index_returns_index(self): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Index.symmetric_difference(idx1, ser) + result = idx1 ^ ser + assert_index_equal(result, expected) + + expected = Index.symmetric_difference(idx2, ser) + result = idx2 ^ ser + assert_index_equal(result, expected) + @pytest.mark.parametrize( "op", [ pytest.param( ops.rand_, marks=pytest.mark.xfail( - reason="GH#22092 Index implementation returns Index", + reason="GH#22092 Index __and__ returns Index intersection", raises=AssertionError, strict=True, ), @@ -296,30 +309,26 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.ror_, marks=pytest.mark.xfail( - reason="Index.get_indexer with non unique index", - raises=InvalidIndexError, + reason="GH#22092 Index __or__ returns Index union", + raises=AssertionError, strict=True, ), ), - ops.rxor, ], ) - def test_reversed_logical_ops_with_index(self, op): + def test_reversed_logical_op_with_index_returns_series(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - # symmetric_difference is only for rxor, but other 2 should fail - expected = idx1.symmetric_difference(ser) - + expected = pd.Series(op(idx1.values, ser.values)) result = op(ser, idx1) - assert_index_equal(result, expected) - - expected = idx2.symmetric_difference(ser) + assert_series_equal(result, expected) + expected = pd.Series(op(idx2.values, ser.values)) result = op(ser, idx2) - assert_index_equal(result, expected) + assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected", diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 483122a0eeaba..1f19f58e80f26 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer): s = pd.Series(idx) tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) + def test_get_indexer_non_unique_dtype_mismatch(self): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + 
tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) + class TestTranspose(Ops): errmsg = "the 'axes' parameter is not supported" diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e641d6f842d87..79c9fe2b60bd9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -335,7 +335,7 @@ def test_count_level_corner(self): df = self.frame[:0] result = df.count(level=0) expected = ( - DataFrame(index=s.index.levels[0], columns=df.columns) + DataFrame(index=s.index.levels[0].set_names(["first"]), columns=df.columns) .fillna(0) .astype(np.int64) ) @@ -975,14 +975,12 @@ def test_count(self): series.index.names = ["a", "b"] result = series.count(level="b") - expect = self.series.count(level=1) - tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == "b" + expect = self.series.count(level=1).rename_axis("b") + tm.assert_series_equal(result, expect) result = series.count(level="a") - expect = self.series.count(level=0) - tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == "a" + expect = self.series.count(level=0).rename_axis("a") + tm.assert_series_equal(result, expect) msg = "Level x not found" with pytest.raises(KeyError, match=msg): @@ -1014,6 +1012,8 @@ def test_frame_group_ops(self, op, level, axis, skipna, sort): self.frame.iloc[1, [1, 2]] = np.nan self.frame.iloc[7, [0, 1]] = np.nan + level_name = self.frame.index.names[level] + if axis == 0: frame = self.frame else: @@ -1034,7 +1034,7 @@ def aggf(x): frame = frame.sort_index(level=level, axis=axis) # for good measure, groupby detail - level_index = frame._get_axis(axis).levels[level] + level_index = frame._get_axis(axis).levels[level].rename(level_name) tm.assert_index_equal(leftside._get_axis(axis), level_index) tm.assert_index_equal(rightside._get_axis(axis), level_index) @@ -1639,10 +1639,14 @@ def test_constructor_with_tz(self): ) result = MultiIndex.from_arrays([index, columns]) + + assert result.names == ["dt1", "dt2"] tm.assert_index_equal(result.levels[0], index) tm.assert_index_equal(result.levels[1], columns) result = MultiIndex.from_arrays([Series(index), Series(columns)]) + + assert result.names == ["dt1", "dt2"] tm.assert_index_equal(result.levels[0], index) tm.assert_index_equal(result.levels[1], columns) @@ -1674,10 +1678,12 @@ def test_set_index_datetime(self): df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) + assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) + assert df.index.names == ["label", "datetime"] df = DataFrame(np.random.random(6)) idx1 = pd.DatetimeIndex( diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 880ff1f137520..a05de78e299f7 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -36,8 +36,8 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore") - min_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) - max_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) + min_dt = pd.Timestamp(1900, 1, 1).to_pydatetime() + max_dt = pd.Timestamp(1900, 1, 1).to_pydatetime() 
gen_date_range = st.builds( pd.date_range, diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 9571e8027ccf7..86e5d506e0779 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -141,7 +141,7 @@ def test_empty_dtypes(check_dtype): df1["col1"] = df1["col1"].astype("int64") if check_dtype: - msg = "Attributes are different" + msg = r"Attributes of DataFrame\..* are different" with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, **kwargs) else: diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a12d9386eb159..bad3f2e67f8bb 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -179,7 +179,7 @@ def test_series_equal_values_mismatch(check_less_precise): def test_series_equal_categorical_mismatch(check_categorical): - msg = """Attributes are different + msg = """Attributes of Series are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c8b41a87baa9d..73535e55d4fa5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1156,7 +1156,9 @@ def assert_series_equal( ): pass else: - assert_attr_equal("dtype", left, right) + assert_attr_equal( + "dtype", left, right, obj="Attributes of {obj}".format(obj=obj) + ) if check_exact: assert_numpy_array_equal( @@ -1315,8 +1317,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2) Traceback (most recent call last): - AssertionError: Attributes are different ... + AssertionError: Attributes of DataFrame.iloc[:, 1] are different + Attribute "dtype" are different [left]: int64 [right]: float64 @@ -1600,7 +1603,9 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) def makeIntervalIndex(k=10, name=None, **kwargs): diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index b1b5be6d4faeb..1506acc95edf9 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1,3 +1,4 @@ +import functools import io import random import string @@ -68,6 +69,23 @@ def sample(self): """ return random.random() + @functools.lru_cache(None) + def decorated_sample(self, max): + """ + Generate and return a random integer between 0 and max. + + Parameters + ---------- + max : int + The maximum value of the random number. + + Returns + ------- + int + Random number generated. + """ + return random.randint(0, max) + def random_letters(self): """ Generate and return a sequence of random letters. 
@@ -870,6 +888,7 @@ def test_good_class(self, capsys): "plot", "swap", "sample", + "decorated_sample", "random_letters", "sample_values", "head", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 50b02c0fcbaf5..1d0f4b583bd0c 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -430,6 +430,17 @@ def doc_parameters(self): @property def signature_parameters(self): + def add_stars(param_name: str, info: inspect.Parameter): + """ + Add stars to *args and **kwargs parameters + """ + if info.kind == inspect.Parameter.VAR_POSITIONAL: + return f"*{param_name}" + elif info.kind == inspect.Parameter.VAR_KEYWORD: + return f"**{param_name}" + else: + return param_name + if inspect.isclass(self.obj): if hasattr(self.obj, "_accessors") and ( self.name.split(".")[-1] in self.obj._accessors @@ -437,17 +448,16 @@ def signature_parameters(self): # accessor classes have a signature but don't want to show this return tuple() try: - sig = inspect.getfullargspec(self.obj) + sig = inspect.signature(self.obj) except (TypeError, ValueError): # Some objects, mainly in C extensions do not support introspection # of the signature return tuple() - params = sig.args - if sig.varargs: - params.append("*" + sig.varargs) - if sig.varkw: - params.append("**" + sig.varkw) - params = tuple(params) + + params = tuple( + add_stars(parameter, sig.parameters[parameter]) + for parameter in sig.parameters + ) if params and params[0] in ("self", "cls"): return params[1:] return params diff --git a/setup.cfg b/setup.cfg index 64494bf84363e..ca1ca4a7b5733 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,9 +145,6 @@ ignore_errors=True [mypy-pandas.tests.arrays.test_period] ignore_errors=True -[mypy-pandas.tests.computation.test_eval] -ignore_errors=True - [mypy-pandas.tests.dtypes.test_common] ignore_errors=True @@ -166,9 +163,6 @@ ignore_errors=True [mypy-pandas.tests.frame.test_constructors] ignore_errors=True -[mypy-pandas.tests.frame.test_convert_to] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True @@ -196,24 +190,9 @@ ignore_errors=True [mypy-pandas.tests.indexes.timedeltas.test_timedelta] ignore_errors=True -[mypy-pandas.tests.indexing.test_coercion] -ignore_errors=True - [mypy-pandas.tests.indexing.test_loc] ignore_errors=True -[mypy-pandas.tests.io.json.test_ujson] -ignore_errors=True - -[mypy-pandas.tests.io.parser.conftest] -ignore_errors=True - -[mypy-pandas.tests.io.test_sql] -ignore_errors=True - -[mypy-pandas.tests.plotting.test_backend] -ignore_errors=True - [mypy-pandas.tests.series.test_constructors] ignore_errors=True @@ -226,8 +205,5 @@ ignore_errors=True [mypy-pandas.tests.tseries.offsets.test_offsets] ignore_errors=True -[mypy-pandas.tests.tseries.offsets.test_offsets_properties] -ignore_errors=True - [mypy-pandas.tests.tseries.offsets.test_yqm_offsets] ignore_errors=True diff --git a/setup.py b/setup.py index 04aedcb101e25..2892cd0b2e294 100755 --- a/setup.py +++ b/setup.py @@ -88,7 +88,6 @@ def is_platform_mac(): "_libs/algos_take_helper.pxi.in", "_libs/algos_rank_helper.pxi.in", ], - "groupby": ["_libs/groupby_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -228,6 +227,7 @@ def build_extensions(self): "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Programming Language :: Cython", "Topic :: Scientific/Engineering", ] @@ 
-563,7 +563,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ext_data = { "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, - "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby"}, "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From 67847ff90e42482db02b0375d1b99f5592207d53 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Sat, 19 Oct 2019 21:47:17 +0900 Subject: [PATCH 09/19] checkout unrelated files to master --- pandas/core/sorting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 6d80cf8c697d6..e6edad656d430 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -162,6 +162,7 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): xnull: boolean, if nulls are excluded; i.e. -1 labels are passed through """ + if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") shape = np.asarray(shape, dtype="i8") + lift @@ -302,8 +303,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): def get_indexer_dict(label_list, keys): - """ return a dict of {labels} -> {indexers} """ - shape = [len(x) for x in keys] + """ return a diction of {labels} -> {indexers} """ + shape = list(map(len, keys)) group_index = get_group_index(label_list, shape, sort=True, xnull=True) ngroups = ( From 66a96e37725bded91934d15f69f5334df35adb32 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Sat, 19 Oct 2019 21:55:32 +0900 Subject: [PATCH 10/19] checkout sorting.py to upstream/master not origin/master --- pandas/core/sorting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e6edad656d430..94810369785d3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -303,8 +303,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): def get_indexer_dict(label_list, keys): - """ return a diction of {labels} -> {indexers} """ - shape = list(map(len, keys)) + """ return a dict of {labels} -> {indexers} """ + shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) ngroups = ( From 9390d2172337b16633a8d94082c6fbebac8d6c17 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Thu, 24 Oct 2019 23:01:28 +0900 Subject: [PATCH 11/19] accept TomAugspurger requests 1. remove the seed 2. remove meaningless comment 3. 
refer to GH issue number --- pandas/tests/groupby/test_value_counts.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 70d6b3db1923e..032ae7b6d8fec 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -85,8 +85,7 @@ def rebuild_index(df): @pytest.mark.parametrize("size", [100, 1000]) @pytest.mark.parametrize("frac", [0.1, 0.5, 1]) def test_series_groupby_value_counts_with_grouper(freq, size, frac): - np.random.seed(42) - + # GH28479 df = DataFrame.from_dict( { "date": date_range("2019-09-25", periods=size), @@ -99,8 +98,6 @@ def test_series_groupby_value_counts_with_grouper(freq, size, frac): # have to sort on index because of unstable sort on values xref GH9212 result = gr.value_counts().sort_index() expected = gr.apply(Series.value_counts).sort_index() - expected.index.names = ( - result.index.names - ) # .apply(Series.value_counts) can't create all names + expected.index.names = result.index.names tm.assert_series_equal(result, expected) From d8da75adb589664a424e466d845e1db90c5823f5 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Fri, 25 Oct 2019 02:03:33 +0900 Subject: [PATCH 12/19] use deterministic values instead of random values --- pandas/tests/groupby/test_value_counts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 032ae7b6d8fec..434a4834123c6 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -4,7 +4,7 @@ and proper parameter handling """ -from itertools import product +from itertools import cycle, islice, product import numpy as np import pytest @@ -89,7 +89,7 @@ def test_series_groupby_value_counts_with_grouper(freq, size, frac): df = DataFrame.from_dict( { "date": date_range("2019-09-25", periods=size), - "name": np.random.choice(list("abcd"), size), + "name": islice(cycle("abc"), size), } ).sample(frac=frac) From 6f6371ca2b7ff05d71738b3ac568dccfa93a97fd Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Fri, 25 Oct 2019 23:31:21 +0900 Subject: [PATCH 13/19] undo adding recons_labels method --- pandas/core/groupby/ops.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 808646a259541..fbe1598767736 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -865,14 +865,6 @@ def levels(self): def names(self): return [self.binlabels.name] - @property - def recons_labels(self): - comp_ids, obs_ids, _ = self.group_info - if len(self.binlabels) != len(self.indices): - return [np.unique(comp_ids)] - labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) - @property def groupings(self): from pandas.core.groupby.grouper import Grouping From 82e5153810d986fd4f801d919a537b915117db82 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Sat, 26 Oct 2019 01:08:14 +0900 Subject: [PATCH 14/19] move a change returning unique comp ids --- pandas/core/sorting.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 94810369785d3..4bd9d74592d81 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -162,6 +162,10 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): xnull: boolean, if nulls are excluded; i.e. 
-1 labels are passed through """ + labels = list(labels) + unique_comp_ids = np.unique(comp_ids) + if (shape[0] != len(unique_comp_ids)) and (shape[0] == len(labels[0])): + return [unique_comp_ids] if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") From df6454b2b04d3beedca7bd6a16389868afa6d165 Mon Sep 17 00:00:00 2001 From: donghojung Date: Mon, 4 Nov 2019 10:52:32 +0900 Subject: [PATCH 15/19] add recons_labels under the BinGrouper --- pandas/core/groupby/ops.py | 4 ++++ pandas/core/sorting.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8d13c37270d7a..b953df78c05c0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -747,6 +747,10 @@ def group_info(self): ngroups, ) + @cache_readonly + def recons_labels(self): + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1])]] + @cache_readonly def result_index(self): if len(self.binlabels) != 0 and isna(self.binlabels[0]): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5d9d5c6c751a1..9b8a1a76e419c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -162,10 +162,6 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): xnull: boolean, if nulls are excluded; i.e. -1 labels are passed through """ - labels = list(labels) - unique_comp_ids = np.unique(comp_ids) - if (shape[0] != len(unique_comp_ids)) and (shape[0] == len(labels[0])): - return [unique_comp_ids] if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") From 28ee287ae00ab99bbe80dc8e57a5f69f133d1be9 Mon Sep 17 00:00:00 2001 From: donghojung Date: Mon, 4 Nov 2019 18:43:58 +0900 Subject: [PATCH 16/19] fix indexing in recons_labels --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b953df78c05c0..1bbd3d1b6d777 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -749,7 +749,7 @@ def group_info(self): @cache_readonly def recons_labels(self): - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1])]] + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] @cache_readonly def result_index(self): From a1049fde9bb5cddddb0748a1051a1436414404b2 Mon Sep 17 00:00:00 2001 From: donghojung Date: Mon, 4 Nov 2019 18:44:17 +0900 Subject: [PATCH 17/19] add an exact reproduction of the original issue --- pandas/tests/groupby/test_value_counts.py | 32 ++++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 8d630147a9304..c76ee09f977b5 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -4,12 +4,12 @@ and proper parameter handling """ -from itertools import cycle, islice, product +from itertools import product import numpy as np import pytest -from pandas import DataFrame, Grouper, MultiIndex, Series, date_range +from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime import pandas.util.testing as tm @@ -81,23 +81,29 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -@pytest.mark.parametrize("freq", ["1D", "2D", "1W", "1Y"]) -@pytest.mark.parametrize("size", [100, 1000]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1]) -def test_series_groupby_value_counts_with_grouper(freq, size, frac): +def 
test_series_groupby_value_counts_with_grouper(): # GH28479 - df = DataFrame.from_dict( + df = DataFrame( { - "date": date_range("2019-09-25", periods=size), - "name": islice(cycle("abc"), size), + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], } - ).sample(frac=frac) + ).drop([3]) - gr = df.groupby(Grouper(key="date", freq=freq))["name"] + df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") + dfg = df.groupby(Grouper(freq="1D", key="Datetime")) # have to sort on index because of unstable sort on values xref GH9212 - result = gr.value_counts().sort_index() - expected = gr.apply(Series.value_counts).sort_index() + result = dfg["Food"].value_counts().sort_index() + expected = dfg["Food"].apply(Series.value_counts).sort_index() expected.index.names = result.index.names tm.assert_series_equal(result, expected) From 961a72cc78a2c5c43d68e67a55ecbff2b59b4629 Mon Sep 17 00:00:00 2001 From: donghojung Date: Wed, 6 Nov 2019 10:13:25 +0900 Subject: [PATCH 18/19] add a comment under recons_labels --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ee78af3e29e9c..5b61fc8624b32 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -752,6 +752,7 @@ def group_info(self): @cache_readonly def recons_labels(self): + # get unique result indices, and prepend 0 as groupby starts from the first return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] @cache_readonly From 368311c9adbaf2bef9764fd001a8e3f3340701e1 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Thu, 7 Nov 2019 23:17:53 +0900 Subject: [PATCH 19/19] rename from recons_labels to recons_codes --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 29ba64b4c90bf..9599ce0bf39a9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -768,7 +768,7 @@ def group_info(self): ) @cache_readonly - def recons_labels(self): + def recons_codes(self): # get unique result indices, and prepend 0 as groupby starts from the first return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
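
For readers following the final shape of the fix: the one-liner added in PATCH 15, corrected in PATCH 16, and renamed to recons_codes in PATCH 19 derives the codes of the non-empty bins directly from BinGrouper.bins (which, as used in these patches, holds the cumulative end position of each bin over the sorted values). Below is a rough standalone sketch of that arithmetic, not taken from the patches: the bins array is made up (what a daily Grouper might produce when one day has no rows), and the comp_ids construction is only an approximation of the per-row ids that group_info yields.

import numpy as np

# Hypothetical cumulative bin edges for five rows spread over four days,
# where day 2 is empty (its edge repeats the previous one): sizes are 2, 1, 0, 2.
bins = np.array([2, 3, 3, 5])

# Approximate per-row group ids: each bin index repeated by its size,
# so the empty bin (index 2) never shows up among the rows.
comp_ids = np.repeat(np.arange(len(bins)), np.diff(np.r_[0, bins]))
# comp_ids -> array([0, 0, 1, 3, 3])

# The expression from the patch: positions where the cumulative edges change
# mark the start of a new non-empty bin, and 0 is prepended because the first
# bin is anchored at the first observation.
codes = np.r_[0, np.flatnonzero(bins[1:] != bins[:-1]) + 1]
# codes -> array([0, 1, 3])

# Same result as deduplicating the per-row ids, but computed straight from
# the bin edges without materializing comp_ids first.
assert (codes == np.unique(comp_ids)).all()

These codes correspond one-to-one with the observed (non-empty) groups, which is, as far as the patches show, why the empty daily bins from the original issue no longer break the multi-index construction in SeriesGroupBy.value_counts.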