From 2b9ad6f772d33428d0a6e7e5c26ee542e082000d Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 19 Mar 2021 09:22:48 -0500 Subject: [PATCH 01/14] CI: run database tests only #39550 (#39666) --- .github/workflows/database.yml | 102 ++---------------- ...-37-locale.yaml => actions-37-db-min.yaml} | 1 + ...actions-37-cov.yaml => actions-37-db.yaml} | 0 3 files changed, 10 insertions(+), 93 deletions(-) rename ci/deps/{actions-37-locale.yaml => actions-37-db-min.yaml} (97%) rename ci/deps/{actions-37-cov.yaml => actions-37-db.yaml} (100%) diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index a30dbc048c03d..ba5a0a1fd0909 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -12,17 +12,19 @@ env: PYTEST_WORKERS: "auto" PANDAS_CI: 1 PATTERN: ((not slow and not network and not clipboard) or (single and db)) + COVERAGE: true jobs: - Linux_py37_locale: + Linux_py37_IO: runs-on: ubuntu-latest defaults: run: shell: bash -l {0} - env: - ENV_FILE: ci/deps/actions-37-locale.yaml - LOCALE_OVERRIDE: zh_CN.UTF-8 + strategy: + matrix: + ENV_FILE: [ci/deps/actions-37-db-min.yaml, ci/deps/actions-37-db.yaml] + fail-fast: false services: mysql: @@ -63,106 +65,20 @@ jobs: with: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('${{ env.ENV_FILE }}') }} + hashFiles('${{ matrix.ENV_FILE }}') }} - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: pandas-dev channel-priority: strict - environment-file: ${{ env.ENV_FILE }} + environment-file: ${{ matrix.ENV_FILE }} use-only-tar-bz2: true - name: Build Pandas uses: ./.github/actions/build_pandas - name: Test - run: ci/run_tests.sh - if: always() - - - name: Build Version - run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - - name: Publish test results - uses: actions/upload-artifact@master - with: - name: Test results - path: test-data.xml - if: failure() - - - name: Print skipped tests - run: python ci/print_skipped.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - flags: unittests - name: codecov-pandas - fail_ci_if_error: false - - Linux_py37_cov: - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - - env: - ENV_FILE: ci/deps/actions-37-cov.yaml - PANDAS_TESTING_MODE: deprecate - COVERAGE: true - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Cache conda - uses: actions/cache@v1 - env: - CACHE_NUMBER: 0 - with: - path: ~/conda_pkgs_dir - key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('${{ env.ENV_FILE }}') }} - - - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: pandas-dev - channel-priority: strict - environment-file: ${{ env.ENV_FILE }} - use-only-tar-bz2: true - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - run: ci/run_tests.sh + run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml -s --cov=pandas --cov-report=xml 
pandas/tests/io if: always() - name: Build Version diff --git a/ci/deps/actions-37-locale.yaml b/ci/deps/actions-37-db-min.yaml similarity index 97% rename from ci/deps/actions-37-locale.yaml rename to ci/deps/actions-37-db-min.yaml index 551308f1d5fac..1d3794576220a 100644 --- a/ci/deps/actions-37-locale.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -7,6 +7,7 @@ dependencies: # tools - cython>=0.29.21 - pytest>=5.0.1 + - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37-cov.yaml b/ci/deps/actions-37-db.yaml similarity index 100% rename from ci/deps/actions-37-cov.yaml rename to ci/deps/actions-37-db.yaml From 69a4d60d20c4c05a4dbe489533b5c19cc95ca914 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 19 Mar 2021 14:47:39 +0000 Subject: [PATCH 02/14] no no-string-hints (#40516) --- .pre-commit-config.yaml | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa8c2b74d7a7e..e3dd6b018b8aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,10 +50,6 @@ repos: rev: 5.7.0 hooks: - id: isort -- repo: https://github.com/MarcoGorelli/no-string-hints - rev: v0.1.7 - hooks: - - id: no-string-hints - repo: https://github.com/asottile/pyupgrade rev: v2.10.0 hooks: @@ -111,12 +107,6 @@ repos: pandas/tests/io/excel/test_writers\.py |pandas/tests/io/pytables/common\.py |pandas/tests/io/pytables/test_store\.py$ - - id: no-pandas-api-types - name: Check code for instances of pd.api.types - entry: (pd|pandas)\.api\.types\. - language: pygrep - types: [python] - files: ^pandas/tests/ - id: non-standard-imports name: Check for non-standard imports language: pygrep @@ -128,6 +118,11 @@ repos: # Check for imports from collections.abc instead of `from collections import abc` |from\ collections\.abc\ import + + # Numpy + |from\ numpy\ import\ random + |from\ numpy\.random\ import + types: [python] - id: non-standard-imports-in-tests name: Check for non-standard imports in test suite language: pygrep @@ -143,26 +138,17 @@ repos: # Check for use of pandas.testing instead of tm |pd\.testing\. + + # Check for pd.api.types instead of from pandas.api.types import ... + |(pd|pandas)\.api\.types\. types: [python] files: ^pandas/tests/ - - id: non-standard-numpy-random-related-imports - name: Check for non-standard numpy.random-related imports excluding pandas/_testing.py - language: pygrep - exclude: pandas/_testing.py + - id: np-bool-and-np-object + name: Check for use of np.bool/np.object instead of np.bool_/np.object_ entry: | (?x) - # Check for imports from np.random. 
instead of `from numpy import random` or `from numpy.random import ` - from\ numpy\ import\ random - |from\ numpy.random\ import - types: [python] - - id: np-bool - name: Check for use of np.bool instead of np.bool_ - entry: np\.bool[^_8] - language: pygrep - types_or: [python, cython, rst] - - id: np-object - name: Check for use of np.object instead of np.object_ - entry: np\.object[^_8] + np\.bool[^_8] + |np\.object[^_8] language: pygrep types_or: [python, cython, rst] - id: pip-to-conda From bbe34fc21131796918bc12aa9e12d74bc608c888 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 19 Mar 2021 09:46:36 -0700 Subject: [PATCH 03/14] REF: share to_native_types with ArrayManager (#40490) --- pandas/core/internals/array_manager.py | 3 +- pandas/core/internals/blocks.py | 158 ++++++++++++------------- 2 files changed, 81 insertions(+), 80 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a417cd0e06872..34b3d83c066c2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -88,6 +88,7 @@ from pandas.core.internals.blocks import ( ensure_block_shape, new_block, + to_native_types, ) if TYPE_CHECKING: @@ -634,7 +635,7 @@ def replace_list( ) def to_native_types(self, **kwargs): - return self.apply_with_block("to_native_types", **kwargs) + return self.apply(to_native_types, **kwargs) @property def is_mixed_type(self) -> bool: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3fd1ebaca19f0..99e54bace8915 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -97,6 +97,7 @@ FloatingArray, IntegerArray, PandasArray, + TimedeltaArray, ) from pandas.core.base import PandasObject import pandas.core.common as com @@ -260,9 +261,11 @@ def get_block_values_for_json(self) -> np.ndarray: # TODO(EA2D): reshape will be unnecessary with 2D EAs return np.asarray(self.values).reshape(self.shape) + @final @property def fill_value(self): - return np.nan + # Used in reindex_indexer + return na_value_for_dtype(self.dtype, compat=False) @property def mgr_locs(self) -> BlockPlacement: @@ -652,24 +655,11 @@ def should_store(self, value: ArrayLike) -> bool: """ return is_dtype_equal(value.dtype, self.dtype) + @final def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format """ - values = self.values - - mask = isna(values) - itemsize = writers.word_len(na_rep) - - if not self.is_object and not quoting and itemsize: - values = values.astype(str) - if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: - # enlarge for the na_rep - values = values.astype(f" np.ndarray: def array_values(self) -> ExtensionArray: return self.values - def to_native_types(self, na_rep="nan", quoting=None, **kwargs): - """override to use ExtensionArray astype for the conversion""" - values = self.values - mask = isna(values) - - new_values = np.asarray(values.astype(object)) - new_values[mask] = na_rep - return self.make_block(new_values) - def take_nd( self, indexer, @@ -1808,41 +1784,6 @@ def is_bool(self): class FloatBlock(NumericBlock): __slots__ = () - def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs - ): - """ convert to our native types format """ - values = self.values - - # see gh-13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and 
decimal == ".": - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype="object") - - values[mask] = na_rep - values = values.astype(object, copy=False) - return self.make_block(values) - - from pandas.io.formats.format import FloatArrayFormatter - - formatter = FloatArrayFormatter( - values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - res = formatter.get_result_as_array() - res = res.astype(object, copy=False) - return self.make_block(res) - class NDArrayBackedExtensionBlock(HybridMixin, Block): """ @@ -1962,18 +1903,6 @@ def array_values(self): def _holder(self): return type(self.array_values()) - @property - def fill_value(self): - return na_value_for_dtype(self.dtype) - - def to_native_types(self, na_rep="NaT", **kwargs): - """ convert to our native types format """ - arr = self.array_values() - - result = arr._format_native_types(na_rep=na_rep, **kwargs) - result = result.astype(object, copy=False) - return self.make_block(result) - class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -1999,7 +1928,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element - to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff where = DatetimeBlock.where putmask = DatetimeLikeBlockMixin.putmask @@ -2316,3 +2244,75 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # We can't, and don't need to, reshape. values = np.asarray(values).reshape(1, -1) return values + + +def to_native_types( + values: ArrayLike, + *, + na_rep="nan", + quoting=None, + float_format=None, + decimal=".", + **kwargs, +) -> np.ndarray: + """ convert to our native types format """ + values = ensure_wrapped_if_datetimelike(values) + + if isinstance(values, (DatetimeArray, TimedeltaArray)): + result = values._format_native_types(na_rep=na_rep, **kwargs) + result = result.astype(object, copy=False) + return result + + elif isinstance(values, ExtensionArray): + mask = isna(values) + + new_values = np.asarray(values.astype(object)) + new_values[mask] = na_rep + return new_values + + elif values.dtype.kind == "f": + # see GH#13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == ".": + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype="object") + + values[mask] = na_rep + values = values.astype(object, copy=False) + return values + + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + res = formatter.get_result_as_array() + res = res.astype(object, copy=False) + return res + + else: + + mask = isna(values) + itemsize = writers.word_len(na_rep) + + if values.dtype != _dtype_obj and not quoting and itemsize: + values = values.astype(str) + if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: + # enlarge for the na_rep + values = values.astype(f" Date: Fri, 19 Mar 2021 17:51:28 +0100 Subject: [PATCH 04/14] PERF: optimize is_numeric_v_string_like (#40501) --- pandas/core/dtypes/common.py | 53 ++++-------------------------- pandas/core/dtypes/missing.py | 3 +- pandas/tests/dtypes/test_common.py | 13 
+------- 3 files changed, 8 insertions(+), 61 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 7a2d6468f1b63..32ea82d9c0402 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1100,7 +1100,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: # This exists to silence numpy deprecation warnings, see GH#29553 -def is_numeric_v_string_like(a, b): +def is_numeric_v_string_like(a: ArrayLike, b): """ Check if we are comparing a string-like object to a numeric ndarray. NumPy doesn't like to compare such objects, especially numeric arrays @@ -1108,7 +1108,7 @@ def is_numeric_v_string_like(a, b): Parameters ---------- - a : array-like, scalar + a : array-like The first object to check. b : array-like, scalar The second object to check. @@ -1120,16 +1120,8 @@ def is_numeric_v_string_like(a, b): Examples -------- - >>> is_numeric_v_string_like(1, 1) - False - >>> is_numeric_v_string_like("foo", "foo") - False - >>> is_numeric_v_string_like(1, "foo") # non-array numeric - False >>> is_numeric_v_string_like(np.array([1]), "foo") True - >>> is_numeric_v_string_like("foo", np.array([1])) # symmetric check - True >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) True >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) @@ -1142,17 +1134,15 @@ def is_numeric_v_string_like(a, b): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) - is_a_numeric_array = is_a_array and is_numeric_dtype(a) - is_b_numeric_array = is_b_array and is_numeric_dtype(b) - is_a_string_array = is_a_array and is_string_like_dtype(a) - is_b_string_array = is_b_array and is_string_like_dtype(b) + is_a_numeric_array = is_a_array and a.dtype.kind in ("u", "i", "f", "c", "b") + is_b_numeric_array = is_b_array and b.dtype.kind in ("u", "i", "f", "c", "b") + is_a_string_array = is_a_array and a.dtype.kind in ("S", "U") + is_b_string_array = is_b_array and b.dtype.kind in ("S", "U") - is_a_scalar_string_like = not is_a_array and isinstance(a, str) is_b_scalar_string_like = not is_b_array and isinstance(b, str) return ( (is_a_numeric_array and is_b_scalar_string_like) - or (is_b_numeric_array and is_a_scalar_string_like) or (is_a_numeric_array and is_b_string_array) or (is_b_numeric_array and is_a_string_array) ) @@ -1305,37 +1295,6 @@ def is_numeric_dtype(arr_or_dtype) -> bool: ) -def is_string_like_dtype(arr_or_dtype) -> bool: - """ - Check whether the provided array or dtype is of a string-like dtype. - - Unlike `is_string_dtype`, the object dtype is excluded because it - is a mixed dtype. - - Parameters - ---------- - arr_or_dtype : array-like - The array or dtype to check. - - Returns - ------- - boolean - Whether or not the array or dtype is of the string dtype. - - Examples - -------- - >>> is_string_like_dtype(str) - True - >>> is_string_like_dtype(object) - False - >>> is_string_like_dtype(np.array(['a', 'b'])) - True - >>> is_string_like_dtype(pd.Series([1, 2])) - False - """ - return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) - - def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. 
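The speed-up in PATCH 04 comes from swapping the generic is_numeric_dtype/is_string_like_dtype helpers, which dispatch on arbitrary array-likes and dtypes, for direct checks of the ndarray's one-character dtype.kind code. A minimal standalone sketch of that check (the helper name here is illustrative, not part of the patch):

    import numpy as np

    def numeric_vs_string(a: np.ndarray, b: np.ndarray) -> bool:
        # dtype.kind is a one-character code: u/i/f/c/b are numeric,
        # S/U are bytes/str; comparing one of each is the case that
        # used to trigger numpy's elementwise DeprecationWarning.
        numeric = ("u", "i", "f", "c", "b")
        string = ("S", "U")
        return (a.dtype.kind in numeric and b.dtype.kind in string) or (
            a.dtype.kind in string and b.dtype.kind in numeric
        )

    assert numeric_vs_string(np.array([1, 2]), np.array(["foo"]))
    assert not numeric_vs_string(np.array([1]), np.array([2]))
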
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 59d6f9a51ed43..8c2cff21c114e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -35,7 +35,6 @@ is_object_dtype, is_scalar, is_string_dtype, - is_string_like_dtype, needs_i8_conversion, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -258,7 +257,7 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> np.ndarray: dtype = values.dtype shape = values.shape - if is_string_like_dtype(dtype): + if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) else: result = np.empty(shape, dtype=bool) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 248798408381e..406aec9d4c16e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -469,14 +469,11 @@ def test_is_datetime_or_timedelta_dtype(): def test_is_numeric_v_string_like(): - assert not com.is_numeric_v_string_like(1, 1) - assert not com.is_numeric_v_string_like(1, "foo") - assert not com.is_numeric_v_string_like("foo", "foo") + assert not com.is_numeric_v_string_like(np.array([1]), 1) assert not com.is_numeric_v_string_like(np.array([1]), np.array([2])) assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) assert com.is_numeric_v_string_like(np.array([1]), "foo") - assert com.is_numeric_v_string_like("foo", np.array([1])) assert com.is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) assert com.is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) @@ -521,14 +518,6 @@ def test_is_numeric_dtype(): assert com.is_numeric_dtype(pd.Index([1, 2.0])) -def test_is_string_like_dtype(): - assert not com.is_string_like_dtype(object) - assert not com.is_string_like_dtype(pd.Series([1, 2])) - - assert com.is_string_like_dtype(str) - assert com.is_string_like_dtype(np.array(["a", "b"])) - - def test_is_float_dtype(): assert not com.is_float_dtype(str) assert not com.is_float_dtype(int) From b524462e1f88319912ee5ad91a45e6d1986c9dba Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Mar 2021 18:15:26 +0100 Subject: [PATCH 05/14] CLN/PERF: remove catching of numpy deprecation warning in comparison_op (#40515) --- pandas/core/ops/array_ops.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 04737d91c0d4e..333bdbf57bab3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,7 +6,6 @@ from functools import partial import operator from typing import Any -import warnings import numpy as np @@ -232,7 +231,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: """ # NB: We assume extract_array has already been called on left and right lvalues = ensure_wrapped_if_datetimelike(left) - rvalues = right + rvalues = ensure_wrapped_if_datetimelike(right) rvalues = lib.item_from_zerodim(rvalues) if isinstance(rvalues, list): @@ -267,10 +266,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - with warnings.catch_warnings(): - # suppress warnings from numpy about element-wise comparison - warnings.simplefilter("ignore", DeprecationWarning) - res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) + res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) return res_values From b519386e17ab4dbcc53d1a67d636dbb31c93085e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: 
Fri, 19 Mar 2021 18:09:25 -0700 Subject: [PATCH 06/14] TYP: get_reverse_indexer, get_group_index_sorter (#40476) --- pandas/_libs/internals.pyx | 20 ++++++----- pandas/_libs/lib.pyx | 44 +++++++++++++++++------- pandas/core/groupby/ops.py | 6 ++-- pandas/core/indexes/base.py | 29 +++++++++++----- pandas/core/indexes/multi.py | 2 +- pandas/core/reshape/reshape.py | 2 +- pandas/core/sorting.py | 14 ++++++-- pandas/tests/internals/test_internals.py | 12 +++---- pandas/tests/libs/test_lib.py | 4 +-- 9 files changed, 88 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b643c03b6a19..5352ca53e1b54 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -13,14 +13,16 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - NPY_INT64, + NPY_INTP, int64_t, + intp_t, ndarray, ) cnp.import_array() from pandas._libs.algos import ensure_int64 + from pandas._libs.util cimport is_integer_object @@ -30,7 +32,7 @@ cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: slice _as_slice - ndarray _as_array # Note: this still allows `None` + ndarray _as_array # Note: this still allows `None`; will be intp_t bint _has_slice, _has_array, _is_known_slice_like def __cinit__(self, val): @@ -53,12 +55,12 @@ cdef class BlockPlacement: self._as_slice = slc self._has_slice = True else: - arr = np.empty(0, dtype=np.int64) + arr = np.empty(0, dtype=np.intp) self._as_array = arr self._has_array = True else: # Cython memoryview interface requires ndarray to be writeable. - arr = np.require(val, dtype=np.int64, requirements='W') + arr = np.require(val, dtype=np.intp, requirements='W') assert arr.ndim == 1, arr.shape self._as_array = arr self._has_array = True @@ -125,8 +127,8 @@ cdef class BlockPlacement: if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) # NOTE: this is the C-optimized equivalent of - # `np.arange(start, stop, step, dtype=np.int64)` - self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) + # `np.arange(start, stop, step, dtype=np.intp)` + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP) self._has_array = True return self._as_array @@ -325,13 +327,13 @@ cdef slice_getitem(slice slc, ind): else: # NOTE: # this is the C-optimized equivalent of - # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` - return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] + # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind] @cython.boundscheck(False) @cython.wraparound(False) -cdef slice indexer_as_slice(int64_t[:] vals): +cdef slice indexer_as_slice(intp_t[:] vals): cdef: Py_ssize_t i, n, start, stop int64_t d diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1ff481553e413..fc3e1ecfb55c1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -451,7 +451,7 @@ def fast_zip(list ndarrays) -> ndarray[object]: return result -def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): +def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: """ Reverse indexing operation. @@ -459,14 +459,25 @@ def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): indexer_inv[indexer[x]] = x - .. note:: If indexer is not unique, only first occurrence is accounted. 
+ Parameters + ---------- + indexer : np.ndarray[np.intp] + length : int + + Returns + ------- + np.ndarray[np.intp] + + Notes + ----- + If indexer is not unique, only first occurrence is accounted. """ cdef: Py_ssize_t i, n = len(indexer) - ndarray[int64_t] rev_indexer - int64_t idx + ndarray[intp_t] rev_indexer + intp_t idx - rev_indexer = np.empty(length, dtype=np.int64) + rev_indexer = np.empty(length, dtype=np.intp) rev_indexer[:] = -1 for i in range(n): idx = indexer[i] @@ -808,23 +819,32 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, @cython.boundscheck(False) @cython.wraparound(False) -def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): +def get_level_sorter( + ndarray[int64_t, ndim=1] codes, const intp_t[:] starts +) -> ndarray: """ Argsort for a single level of a multi-index, keeping the order of higher levels unchanged. `starts` points to starts of same-key indices w.r.t to leading levels; equivalent to: - np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort') + np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort') + starts[i] for i in range(len(starts) - 1)]) + + Parameters + ---------- + codes : np.ndarray[int64_t, ndim=1] + starts : np.ndarray[intp, ndim=1] + + Returns + ------- + np.ndarray[np.int, ndim=1] """ cdef: - int64_t l, r - Py_ssize_t i - ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64) - ndarray[int64_t, ndim=1] label_arr = np.asarray(label) + Py_ssize_t i, l, r + ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp) for i in range(len(starts) - 1): l, r = starts[i], starts[i + 1] - out[l:r] = l + label_arr[l:r].argsort(kind='mergesort') + out[l:r] = l + codes[l:r].argsort(kind='mergesort') return out diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74e96015b4544..a222a8cc464fb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -991,10 +991,10 @@ def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): @cache_readonly def slabels(self): # Sorted labels - return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False) + return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False) @cache_readonly - def sort_idx(self): + def _sort_idx(self) -> np.ndarray: # np.ndarray[np.intp] # Counting sort indexer return get_group_index_sorter(self.labels, self.ngroups) @@ -1013,7 +1013,7 @@ def __iter__(self): @cache_readonly def sorted_data(self) -> FrameOrSeries: - return self.data.take(self.sort_idx, axis=self.axis) + return self.data.take(self._sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3a468758ab3fd..a5c0a5c6694e5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4135,13 +4135,22 @@ def _join_level( """ from pandas.core.indexes.multi import MultiIndex - def _get_leaf_sorter(labels): + def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: """ Returns sorter for the inner most level while preserving the order of higher levels. + + Parameters + ---------- + labels : list[np.ndarray] + Each ndarray has signed integer dtype, not necessarily identical. 
+ + Returns + ------- + np.ndarray[np.intp] """ if labels[0].size == 0: - return np.empty(0, dtype="int64") + return np.empty(0, dtype=np.intp) if len(labels) == 1: return get_group_index_sorter(labels[0]) @@ -4154,7 +4163,7 @@ def _get_leaf_sorter(labels): starts = np.hstack(([True], tic, [True])).nonzero()[0] lab = ensure_int64(labels[-1]) - return lib.get_level_sorter(lab, ensure_int64(starts)) + return lib.get_level_sorter(lab, ensure_platform_int(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): raise TypeError("Join on level between two MultiIndex objects is ambiguous") @@ -4189,12 +4198,12 @@ def _get_leaf_sorter(labels): join_index = left[left_indexer] else: - left_lev_indexer = ensure_int64(left_lev_indexer) + left_lev_indexer = ensure_platform_int(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) old_codes = left.codes[level] - new_lev_codes = algos.take_nd( - rev_indexer, old_codes[old_codes != -1], allow_fill=False - ) + + taker = old_codes[old_codes != -1] + new_lev_codes = rev_indexer.take(taker) new_codes = list(left.codes) new_codes[level] = new_lev_codes @@ -4204,6 +4213,7 @@ def _get_leaf_sorter(labels): if keep_order: # just drop missing values. o.w. keep order left_indexer = np.arange(len(left), dtype=np.intp) + left_indexer = cast(np.ndarray, left_indexer) mask = new_lev_codes != -1 if not mask.all(): new_codes = [lab[mask] for lab in new_codes] @@ -4213,11 +4223,12 @@ def _get_leaf_sorter(labels): if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - new_lev_codes, ngroups + ensure_int64(new_lev_codes), ngroups ) # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0] :] + # error: Value of type "Optional[ndarray]" is not indexable + left_indexer = left_indexer[counts[0] :] # type: ignore[index] new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 97492f35232e3..31aa5e301d17c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1952,7 +1952,7 @@ def _sort_levels_monotonic(self) -> MultiIndex: lev = lev.take(indexer) # indexer to reorder the level codes - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) level_codes = algos.take_nd(ri, level_codes) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6a0286b1c40ef..613669b8cc1d8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -145,7 +145,7 @@ def _indexer_and_to_sort(self): ngroups = len(obs_ids) indexer = get_group_index_sorter(comp_index, ngroups) - + indexer = ensure_platform_int(indexer) return indexer, to_sort @cache_readonly diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 720643d3d98aa..10c13327c79d3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -582,6 +582,16 @@ def get_group_index_sorter( Both algorithms are `stable` sort and that is necessary for correctness of groupby operations. e.g. 
consider: df.groupby(key)[col].transform('first') + + Parameters + ---------- + group_index : np.ndarray + signed integer dtype + ngroups : int or None, default None + + Returns + ------- + np.ndarray[np.intp] """ if ngroups is None: # error: Incompatible types in assignment (expression has type "number[Any]", @@ -596,9 +606,9 @@ def get_group_index_sorter( ) if do_groupsort: sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) - return ensure_platform_int(sorter) else: - return group_index.argsort(kind="mergesort") + sorter = group_index.argsort(kind="mergesort") + return ensure_platform_int(sorter) def compress_group_index(group_index, sort: bool = True): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c63d5271f1fae..ef1c3ec0c2860 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -259,7 +259,7 @@ def _check(blk): def test_mgr_locs(self): assert isinstance(self.fblock.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64) + self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.intp) ) def test_attrs(self): @@ -277,7 +277,7 @@ def test_delete(self): newb.delete(0) assert isinstance(newb.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64) + newb.mgr_locs.as_array, np.array([2, 4], dtype=np.intp) ) assert (newb.values[0] == 1).all() @@ -285,14 +285,14 @@ def test_delete(self): newb.delete(1) assert isinstance(newb.mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64) + newb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp) ) assert (newb.values[1] == 2).all() newb = self.fblock.copy() newb.delete(2) tm.assert_numpy_array_equal( - newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64) + newb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp) ) assert (newb.values[1] == 1).all() @@ -665,7 +665,7 @@ def test_consolidate_ordering_issues(self, mgr): assert cons.nblocks == 1 assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) tm.assert_numpy_array_equal( - cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64) + cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp) ) def test_reindex_items(self): @@ -1095,7 +1095,7 @@ def test_slice_iter(self, slc, expected): ) def test_slice_to_array_conversion(self, slc, arr): tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64) + BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp) ) def test_blockplacement_add(self): diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 60c42878497c2..0532de9998c5f 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -197,9 +197,9 @@ def test_maybe_booleans_to_slice(self): assert result == slice(0, 0) def test_get_reverse_indexer(self): - indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp) result = lib.get_reverse_indexer(indexer, 5) - expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + expected = np.array([4, 2, 3, 6, 7], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) From 7e71c3bac3b6dc694642ffdf650c1d4ae605c080 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 19 Mar 2021 18:10:02 -0700 Subject: [PATCH 07/14] CLN: factorize returns ndarray[intp], not 
int64 (#40474) --- pandas/_libs/hashtable.pyx | 26 +++++++++++----- pandas/_libs/hashtable_class_helper.pxi.in | 36 +++++++++++----------- pandas/_libs/join.pyx | 20 ++++++------ pandas/core/reshape/merge.py | 15 ++++++--- pandas/tests/libs/test_join.py | 16 +++++----- 5 files changed, 65 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1bbffaa7bb5d2..e402a4b7c0ccc 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -66,13 +66,18 @@ cdef class Factorizer: self.uniques = ObjectVector() self.count = 0 - def get_count(self): + def get_count(self) -> int: return self.count def factorize( self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None - ): + ) -> np.ndarray: """ + + Returns + ------- + np.ndarray[np.intp] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -80,6 +85,9 @@ cdef class Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = ObjectVector() uniques.extend(self.uniques.to_array()) @@ -89,8 +97,6 @@ cdef class Factorizer: mask = (labels == na_sentinel) # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -119,8 +125,12 @@ cdef class Int64Factorizer: return self.count def factorize(self, const int64_t[:] values, sort=False, - na_sentinel=-1, na_value=None): + na_sentinel=-1, na_value=None) -> np.ndarray: """ + Returns + ------- + ndarray[intp_t] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -128,6 +138,9 @@ cdef class Int64Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = Int64Vector() uniques.extend(self.uniques.to_array()) @@ -138,9 +151,6 @@ cdef class Int64Factorizer: # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) - sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0b6bb170cc531..6ace327ca3599 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -539,12 +539,12 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 {{c_type}} val, na_value2 khiter_t k @@ -553,7 +553,7 @@ cdef class {{name}}HashTable(HashTable): uint8_t[:] mask_values if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None @@ -614,7 +614,7 @@ cdef class {{name}}HashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint 
return_inverse=False): @@ -633,7 +633,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = {{name}}Vector() @@ -668,7 +668,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = {{name}}Vector() @@ -918,12 +918,12 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int64_t[:] uindexer int ret = 0 object val @@ -933,7 +933,7 @@ cdef class StringHashTable(HashTable): bint use_na_value if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -972,13 +972,13 @@ cdef class StringHashTable(HashTable): uindexer[count] = i if return_inverse: self.table.vals[k] = count - labels[i] = count + labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] - labels[i] = idx + labels[i] = idx free(vecs) @@ -987,7 +987,7 @@ cdef class StringHashTable(HashTable): uniques.append(values[uindexer[i]]) if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): @@ -1193,19 +1193,19 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 object val khiter_t k bint use_na_value if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None for i in range(n): @@ -1240,7 +1240,7 @@ cdef class PyObjectHashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): @@ -1259,7 +1259,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() @@ -1292,7 +1292,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = ObjectVector() diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 1b79d68c13570..511b373bc7e1f 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -10,6 +10,7 @@ from numpy cimport ( 
int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -20,6 +21,7 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( + ensure_int64, ensure_platform_int, groupsort_indexer, take_1d_int64_int64, @@ -27,7 +29,7 @@ from pandas._libs.algos import ( @cython.boundscheck(False) -def inner_join(const int64_t[:] left, const int64_t[:] right, +def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 @@ -39,8 +41,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -78,7 +80,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def left_outer_join(const int64_t[:] left, const int64_t[:] right, +def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 @@ -91,8 +93,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -151,7 +153,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def full_outer_join(const int64_t[:] left, const int64_t[:] right, +def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 @@ -163,8 +165,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c01bf3931b27a..3c1279d62b126 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1973,7 +1973,7 @@ def _get_single_indexer(join_key, index, sort: bool = False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( - ensure_int64(left_key), ensure_int64(right_key), count, sort=sort + left_key, right_key, count, sort=sort ) return left_indexer, right_indexer @@ -2029,9 +2029,9 @@ def _factorize_keys( Returns ------- - array + np.ndarray[np.intp] Left (resp. right if called with `key='right'`) labels, as enumerated type. - array + np.ndarray[np.intp] Right (resp. left if called with `key='right'`) labels, as enumerated type. int Number of unique elements in union of left and right labels. 
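For context on PATCH 07: the labels produced by _factorize_keys are consumed directly as positional indexers by the cython join routines, and numpy defines positional indexing in terms of np.intp, which is why the factorizers now return that dtype and the assertions below guard the invariant. A rough illustration of the invariant itself, using plain numpy rather than the internal hashtable factorizers:

    import numpy as np

    values = np.array(["a", "b", "a", "c"], dtype=object)
    uniques, labels = np.unique(values, return_inverse=True)

    # np.unique already returns intp inverse labels; indexing `uniques`
    # with them round-trips the original values, which is how the join
    # code consumes factorized keys.
    assert labels.dtype == np.intp
    assert (uniques[labels] == values).all()
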
@@ -2117,6 +2117,8 @@ def _factorize_keys( llab = rizer.factorize(lk) rlab = rizer.factorize(rk) + assert llab.dtype == np.intp, llab.dtype + assert rlab.dtype == np.intp, rlab.dtype count = rizer.get_count() @@ -2142,13 +2144,16 @@ def _factorize_keys( return llab, rlab, count -def _sort_labels(uniques: np.ndarray, left, right): +def _sort_labels( + uniques: np.ndarray, left: np.ndarray, right: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + # Both returned ndarrays are np.intp llength = len(left) labels = np.concatenate([left, right]) _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) - new_labels = ensure_int64(new_labels) + assert new_labels.dtype == np.intp new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index 0bdb7b0e71e2d..f5426c71511bb 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -46,8 +46,8 @@ def test_outer_join_indexer(self, dtype): tm.assert_numpy_array_equal(rindexer, exp) def test_cython_left_outer_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp) max_group = 5 ls, rs = left_outer_join(left, right, max_group) @@ -70,8 +70,8 @@ def test_cython_left_outer_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp) max_group = 5 rs, ls = left_outer_join(right, left, max_group) @@ -116,8 +116,8 @@ def test_cython_right_outer_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): - left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp) + right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp) max_group = 5 ls, rs = inner_join(left, right, max_group) @@ -256,10 +256,10 @@ def test_left_outer_join_bug(): 0, 2, ], - dtype=np.int64, + dtype=np.intp, ) - right = np.array([3, 1], dtype=np.int64) + right = np.array([3, 1], dtype=np.intp) max_groups = 4 lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False) From bd8c79fa04ea76cb28b11052fb97ff00775f8f6d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 20 Mar 2021 02:14:13 +0100 Subject: [PATCH 08/14] PERF: increase the minimum number of elements to use numexpr for ops from 1e4 to 1e6 (#40502) --- pandas/core/computation/expressions.py | 2 +- pandas/tests/test_expressions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0dbe5e8d83741..4f14ea73d5a88 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -40,7 +40,7 @@ } # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 1_000_000 def set_use_numexpr(v=True): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 30f88ba5e76f6..96347ba5a733f 100644 --- 
a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -12,7 +12,7 @@ ) from pandas.core.computation import expressions as expr -_frame = DataFrame(np.random.randn(10000, 4), columns=list("ABCD"), dtype="float64") +_frame = DataFrame(np.random.randn(1000000, 4), columns=list("ABCD"), dtype="float64") _frame2 = DataFrame(np.random.randn(100, 4), columns=list("ABCD"), dtype="float64") _mixed = DataFrame( { From 3e6bc0c8ed88a5c5d238453e4548dc4f377b3fb7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 20 Mar 2021 02:15:23 +0100 Subject: [PATCH 09/14] [ArrayManager] TST: run all tests with separate not slow / slow build (#40495) --- .github/workflows/ci.yml | 45 ++++--------------- pandas/tests/reshape/concat/__init__.py | 4 ++ pandas/tests/reshape/test_crosstab.py | 3 ++ pandas/tests/reshape/test_pivot.py | 3 ++ pandas/tests/reshape/test_pivot_multilevel.py | 3 ++ 5 files changed, 21 insertions(+), 37 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 014c666a17084..d6744f578560c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,6 +141,9 @@ jobs: data_manager: name: Test experimental data manager runs-on: ubuntu-latest + strategy: + matrix: + pattern: ["not slow and not network and not clipboard", "slow"] steps: - name: Checkout @@ -152,43 +155,11 @@ jobs: - name: Run tests env: PANDAS_DATA_MANAGER: array + PATTERN: ${{ matrix.pattern }} + PYTEST_WORKERS: "auto" run: | source activate pandas-dev + ci/run_tests.sh - pytest pandas/tests/frame/ - pytest pandas/tests/reductions/ - pytest pandas/tests/generic/test_generic.py - pytest pandas/tests/arithmetic/ - pytest pandas/tests/groupby/ - pytest pandas/tests/resample/ - pytest pandas/tests/reshape/merge - pytest pandas/tests/series/ - pytest pandas/tests/indexing/ - - pytest pandas/tests/test_* - pytest pandas/tests/api/ - pytest pandas/tests/apply/ - pytest pandas/tests/arrays/ - pytest pandas/tests/base/ - pytest pandas/tests/computation/ - pytest pandas/tests/config/ - pytest pandas/tests/dtypes/ - pytest pandas/tests/extension/ - pytest pandas/tests/generic/ - pytest pandas/tests/indexes/ - pytest pandas/tests/internals/ - pytest pandas/tests/io/test_* -m "not slow and not clipboard" - pytest pandas/tests/io/excel/ -m "not slow and not clipboard" - pytest pandas/tests/io/formats/ -m "not slow and not clipboard" - pytest pandas/tests/io/parser/ -m "not slow and not clipboard" - pytest pandas/tests/io/sas/ -m "not slow and not clipboard" - pytest pandas/tests/io/xml/ -m "not slow and not clipboard" - pytest pandas/tests/libs/ - pytest pandas/tests/plotting/ - pytest pandas/tests/scalar/ - pytest pandas/tests/strings/ - pytest pandas/tests/tools/ - pytest pandas/tests/tseries/ - pytest pandas/tests/tslibs/ - pytest pandas/tests/util/ - pytest pandas/tests/window/ + - name: Print skipped tests + run: python ci/print_skipped.py diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py index e69de29bb2d1d..777923be02398 100644 --- a/pandas/tests/reshape/concat/__init__.py +++ b/pandas/tests/reshape/concat/__init__.py @@ -0,0 +1,4 @@ +import pandas.util._test_decorators as td + +# TODO(ArrayManager) concat axis=0 +pytestmark = td.skip_array_manager_not_yet_implemented diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index e467dbb7d49b6..5cc65feee869b 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -1,6 +1,8 @@ import numpy as 
np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_categorical_dtype from pandas import ( @@ -438,6 +440,7 @@ def test_crosstab_normalize_arrays(self): ) tm.assert_frame_equal(test_case, norm_sum) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 def test_crosstab_with_empties(self): # Check handling of empties df = DataFrame( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c50886ba43019..8d8a83c233444 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -1197,6 +1199,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): margins_name=margin_name, ) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 def test_pivot_timegrouper(self): df = DataFrame( { diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index df2ae0d52c660..ab41a94d1ff25 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Index, @@ -196,6 +198,7 @@ def test_pivot_list_like_columns( tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 def test_pivot_multiindexed_rows_and_cols(): # GH 36360 From fd9354a715c6bb4f5bcd689c6312188eeea5561e Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Sat, 20 Mar 2021 02:17:05 +0100 Subject: [PATCH 10/14] DOC: Styler docs - split PR from #39720 (#40493) --- pandas/io/formats/style.py | 148 +++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 49 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3abb39d2194c0..5ec2141028fa4 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -217,8 +217,10 @@ def set_tooltips( css_class: Optional[str] = None, ) -> Styler: """ - Add string based tooltips that will appear in the `Styler` HTML result. These - tooltips are applicable only to`` elements. + Set the DataFrame of strings on ``Styler`` generating ``:hover`` tooltips. + + These string based tooltips are only applicable to ```` HTML elements, + and cannot be used for column or index headers. .. versionadded:: 1.3.0 @@ -227,7 +229,7 @@ def set_tooltips( ttips : DataFrame DataFrame containing strings that will be translated to tooltips, mapped by identical column and index values that must exist on the underlying - `Styler` data. None, NaN values, and empty strings will be ignored and + Styler data. None, NaN values, and empty strings will be ignored and not affect the rendered HTML. props : list-like or str, optional List of (attr, value) tuples or a valid CSS string. If ``None`` adopts @@ -671,21 +673,33 @@ def format( def set_td_classes(self, classes: DataFrame) -> Styler: """ - Add string based CSS class names to data cells that will appear within the - `Styler` HTML result. These classes are added within specified `` elements. + Set the DataFrame of strings added to the ``class`` attribute of ```` + HTML elements. 
Parameters
----------
classes : DataFrame
    DataFrame containing strings that will be translated to CSS classes,
    mapped by identical column and index key values that must exist on the
    underlying Styler data. None, NaN values, and empty strings will
    be ignored and not affect the rendered HTML.

Returns
-------
self : Styler

See Also
--------
Styler.set_table_styles: Set the table styles included within the ``<style>``
    HTML element.

Examples
--------
[The doctest example for ``set_td_classes`` and its rendered-HTML output
were garbled in extraction; only stray table-cell fragments ("0", "1", "1")
survive, so the example is omitted here.]
' """ @@ -736,7 +750,7 @@ def set_td_classes(self, classes: DataFrame) -> Styler: def render(self, **kwargs) -> str: """ - Render the built up styles to HTML. + Render the ``Styler`` including all applied styles to HTML. Parameters ---------- @@ -753,7 +767,7 @@ def render(self, **kwargs) -> str: Notes ----- - ``Styler`` objects have defined the ``_repr_html_`` method + Styler objects have defined the ``_repr_html_`` method which automatically calls ``self.render()`` when it's the last item in a Notebook cell. When calling ``Styler.render()`` directly, wrap the result in ``IPython.display.HTML`` to view @@ -779,7 +793,7 @@ def render(self, **kwargs) -> str: def _update_ctx(self, attrs: DataFrame) -> None: """ - Update the state of the Styler for data cells. + Update the state of the ``Styler`` for data cells. Collects a mapping of {index_label: [('', ''), ..]}. @@ -839,7 +853,7 @@ def __deepcopy__(self, memo) -> Styler: def clear(self) -> None: """ - Reset the styler, removing any previously applied styles. + Reset the ``Styler``, removing any previously applied styles. Returns None. """ @@ -923,10 +937,11 @@ def apply( Parameters ---------- func : function - ``func`` should take a Series or DataFrame (depending - on ``axis``), and return an object with the same shape. - Must return a DataFrame with identical index and - column labels or an ndarray with same shape as input when ``axis=None``. + ``func`` should take a Series if ``axis`` in [0,1] and return an object + of same length, also with identical index if the object is a Series. + ``func`` should take a DataFrame if ``axis`` is ``None`` and return either + an ndarray with the same shape or a DataFrame with identical columns and + index. .. versionchanged:: 1.3.0 @@ -944,13 +959,16 @@ def apply( ------- self : Styler + See Also + -------- + Styler.where: Apply CSS-styles based on a conditional function elementwise. + Styler.applymap: Apply a CSS-styling function elementwise. + Notes ----- - The output of ``func`` should be elements having CSS style as string or, + The elements of the output of ``func`` should be CSS styles as strings, in the + format 'attribute: value; attribute2: value2; ...' or, if nothing is to be applied to that element, an empty string or ``None``. - The output shape must match the input, i.e. if - ``x`` is the input row, column, or table (depending on ``axis``), - then ``func(x).shape == x.shape`` should be ``True``. This is similar to ``DataFrame.apply``, except that ``axis=None`` applies the function to the entire DataFrame at once, @@ -1001,13 +1019,14 @@ def applymap(self, func: Callable, subset=None, **kwargs) -> Styler: See Also -------- - Styler.where: Updates the HTML representation with a style which is - selected in accordance with the return value of a function. + Styler.where: Apply CSS-styles based on a conditional function elementwise. + Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. Notes ----- - The output of ``func`` should be a CSS style as string or, if nothing is to be - applied, an empty string or ``None``. + The elements of the output of ``func`` should be CSS styles as strings, in the + format 'attribute: value; attribute2: value2; ...' or, + if nothing is to be applied to that element, an empty string or ``None``. Examples -------- @@ -1030,7 +1049,7 @@ def where( **kwargs, ) -> Styler: """ - Apply a function elementwise. + Apply CSS-styles based on a conditional function elementwise. 
Updates the HTML representation with a style which is selected in accordance with the return value of a function. @@ -1055,7 +1074,15 @@ def where( See Also -------- - Styler.applymap: Updates the HTML representation with the result. + Styler.applymap: Apply a CSS-styling function elementwise. + Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. + + Examples + -------- + >>> def cond(v): + ... return v > 1 and v != 4 + >>> df = pd.DataFrame([[1, 2], [3, 4]]) + >>> df.style.where(cond, value='color:red;', other='font-size:2em;') """ if other is None: other = "" @@ -1092,10 +1119,9 @@ def set_precision(self, precision: int) -> Styler: def set_table_attributes(self, attributes: str) -> Styler: """ - Set the table attributes. + Set the table attributes added to the ```` HTML element. - These are the items that show up in the opening ``
`` tag - in addition to automatic (by default) id. + These are items in addition to automatic (by default) ``id`` attribute. Parameters ---------- @@ -1105,6 +1131,13 @@ def set_table_attributes(self, attributes: str) -> Styler: ------- self : Styler + See Also + -------- + Styler.set_table_styles: Set the table styles included within the ``