diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 014c666a17084..d6744f578560c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,6 +141,9 @@ jobs: data_manager: name: Test experimental data manager runs-on: ubuntu-latest + strategy: + matrix: + pattern: ["not slow and not network and not clipboard", "slow"] steps: - name: Checkout @@ -152,43 +155,11 @@ jobs: - name: Run tests env: PANDAS_DATA_MANAGER: array + PATTERN: ${{ matrix.pattern }} + PYTEST_WORKERS: "auto" run: | source activate pandas-dev + ci/run_tests.sh - pytest pandas/tests/frame/ - pytest pandas/tests/reductions/ - pytest pandas/tests/generic/test_generic.py - pytest pandas/tests/arithmetic/ - pytest pandas/tests/groupby/ - pytest pandas/tests/resample/ - pytest pandas/tests/reshape/merge - pytest pandas/tests/series/ - pytest pandas/tests/indexing/ - - pytest pandas/tests/test_* - pytest pandas/tests/api/ - pytest pandas/tests/apply/ - pytest pandas/tests/arrays/ - pytest pandas/tests/base/ - pytest pandas/tests/computation/ - pytest pandas/tests/config/ - pytest pandas/tests/dtypes/ - pytest pandas/tests/extension/ - pytest pandas/tests/generic/ - pytest pandas/tests/indexes/ - pytest pandas/tests/internals/ - pytest pandas/tests/io/test_* -m "not slow and not clipboard" - pytest pandas/tests/io/excel/ -m "not slow and not clipboard" - pytest pandas/tests/io/formats/ -m "not slow and not clipboard" - pytest pandas/tests/io/parser/ -m "not slow and not clipboard" - pytest pandas/tests/io/sas/ -m "not slow and not clipboard" - pytest pandas/tests/io/xml/ -m "not slow and not clipboard" - pytest pandas/tests/libs/ - pytest pandas/tests/plotting/ - pytest pandas/tests/scalar/ - pytest pandas/tests/strings/ - pytest pandas/tests/tools/ - pytest pandas/tests/tseries/ - pytest pandas/tests/tslibs/ - pytest pandas/tests/util/ - pytest pandas/tests/window/ + - name: Print skipped tests + run: python ci/print_skipped.py diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index a30dbc048c03d..ba5a0a1fd0909 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -12,17 +12,19 @@ env: PYTEST_WORKERS: "auto" PANDAS_CI: 1 PATTERN: ((not slow and not network and not clipboard) or (single and db)) + COVERAGE: true jobs: - Linux_py37_locale: + Linux_py37_IO: runs-on: ubuntu-latest defaults: run: shell: bash -l {0} - env: - ENV_FILE: ci/deps/actions-37-locale.yaml - LOCALE_OVERRIDE: zh_CN.UTF-8 + strategy: + matrix: + ENV_FILE: [ci/deps/actions-37-db-min.yaml, ci/deps/actions-37-db.yaml] + fail-fast: false services: mysql: @@ -63,106 +65,20 @@ jobs: with: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ - hashFiles('${{ env.ENV_FILE }}') }} + hashFiles('${{ matrix.ENV_FILE }}') }} - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: pandas-dev channel-priority: strict - environment-file: ${{ env.ENV_FILE }} + environment-file: ${{ matrix.ENV_FILE }} use-only-tar-bz2: true - name: Build Pandas uses: ./.github/actions/build_pandas - name: Test - run: ci/run_tests.sh - if: always() - - - name: Build Version - run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - - name: Publish test results - uses: actions/upload-artifact@master - with: - name: Test results - path: test-data.xml - if: failure() - - - name: Print skipped tests - run: python ci/print_skipped.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - flags: unittests - name: 
codecov-pandas
-          fail_ci_if_error: false
-
-  Linux_py37_cov:
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash -l {0}
-
-    env:
-      ENV_FILE: ci/deps/actions-37-cov.yaml
-      PANDAS_TESTING_MODE: deprecate
-      COVERAGE: true
-
-    services:
-      mysql:
-        image: mysql
-        env:
-          MYSQL_ALLOW_EMPTY_PASSWORD: yes
-          MYSQL_DATABASE: pandas
-        options: >-
-          --health-cmd "mysqladmin ping"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-        ports:
-          - 3306:3306
-
-      postgres:
-        image: postgres
-        env:
-          POSTGRES_USER: postgres
-          POSTGRES_PASSWORD: postgres
-          POSTGRES_DB: pandas
-        options: >-
-          --health-cmd pg_isready
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-        ports:
-          - 5432:5432
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Cache conda
-        uses: actions/cache@v1
-        env:
-          CACHE_NUMBER: 0
-        with:
-          path: ~/conda_pkgs_dir
-          key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-            hashFiles('${{ env.ENV_FILE }}') }}
-
-      - uses: conda-incubator/setup-miniconda@v2
-        with:
-          activate-environment: pandas-dev
-          channel-priority: strict
-          environment-file: ${{ env.ENV_FILE }}
-          use-only-tar-bz2: true
-
-      - name: Build Pandas
-        uses: ./.github/actions/build_pandas
-
      - name: Test
-        run: ci/run_tests.sh
+        run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml --cov=pandas --cov-report=xml pandas/tests/io
        if: always()

      - name: Build Version
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index aa8c2b74d7a7e..e3dd6b018b8aa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -50,10 +50,6 @@ repos:
     rev: 5.7.0
     hooks:
     - id: isort
-- repo: https://github.com/MarcoGorelli/no-string-hints
-  rev: v0.1.7
-  hooks:
-  - id: no-string-hints
 - repo: https://github.com/asottile/pyupgrade
   rev: v2.10.0
   hooks:
@@ -111,12 +107,6 @@ repos:
            pandas/tests/io/excel/test_writers\.py
            |pandas/tests/io/pytables/common\.py
            |pandas/tests/io/pytables/test_store\.py$
-    - id: no-pandas-api-types
-      name: Check code for instances of pd.api.types
-      entry: (pd|pandas)\.api\.types\.
-      language: pygrep
-      types: [python]
-      files: ^pandas/tests/
     - id: non-standard-imports
       name: Check for non-standard imports
       language: pygrep
@@ -128,6 +118,11 @@ repos:

           # Check for imports from collections.abc instead of `from collections import abc`
           |from\ collections\.abc\ import
+
+          # Numpy
+          |from\ numpy\ import\ random
+          |from\ numpy\.random\ import
+
       types: [python]
     - id: non-standard-imports-in-tests
       name: Check for non-standard imports in test suite
       language: pygrep
@@ -143,26 +138,17 @@ repos:

           # Check for use of pandas.testing instead of tm
           |pd\.testing\.
+
+          # Check for pd.api.types instead of from pandas.api.types import ...
+          |(pd|pandas)\.api\.types\.
       types: [python]
       files: ^pandas/tests/
-    - id: non-standard-numpy-random-related-imports
-      name: Check for non-standard numpy.random-related imports excluding pandas/_testing.py
-      language: pygrep
-      exclude: pandas/_testing.py
+    - id: np-bool-and-np-object
+      name: Check for use of np.bool/np.object instead of np.bool_/np.object_
       entry: |
           (?x)
-          # Check for imports from np.random.
instead of `from numpy import random` or `from numpy.random import ` - from\ numpy\ import\ random - |from\ numpy.random\ import - types: [python] - - id: np-bool - name: Check for use of np.bool instead of np.bool_ - entry: np\.bool[^_8] - language: pygrep - types_or: [python, cython, rst] - - id: np-object - name: Check for use of np.object instead of np.object_ - entry: np\.object[^_8] + np\.bool[^_8] + |np\.object[^_8] language: pygrep types_or: [python, cython, rst] - id: pip-to-conda diff --git a/ci/deps/actions-37-locale.yaml b/ci/deps/actions-37-db-min.yaml similarity index 97% rename from ci/deps/actions-37-locale.yaml rename to ci/deps/actions-37-db-min.yaml index 551308f1d5fac..1d3794576220a 100644 --- a/ci/deps/actions-37-locale.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -7,6 +7,7 @@ dependencies: # tools - cython>=0.29.21 - pytest>=5.0.1 + - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37-cov.yaml b/ci/deps/actions-37-db.yaml similarity index 100% rename from ci/deps/actions-37-cov.yaml rename to ci/deps/actions-37-db.yaml diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5783d3c2353aa..047eb848b7540 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -199,8 +199,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Returns ------- - tuple - 1-d indexer ordered by groups, group counts. + ndarray[intp_t, ndim=1] + Indexer + ndarray[int64_t, ndim=1] + Group Counts Notes ----- @@ -208,11 +210,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where, result + ndarray[int64_t] counts, where + ndarray[intp_t] indexer counts = np.zeros(ngroups + 1, dtype=np.int64) n = len(index) - result = np.zeros(n, dtype=np.int64) + indexer = np.zeros(n, dtype=np.intp) where = np.zeros(ngroups + 1, dtype=np.int64) with nogil: @@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): # this is our indexer for i in range(n): label = index[i] + 1 - result[where[label]] = i + indexer[where[label]] = i where[label] += 1 - return result, counts + return indexer, counts @cython.boundscheck(False) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 4eefd9d1f7267..cdf4ef3b119d2 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, {{else}} def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, {{endif}} - const int64_t[:] indexer, + const intp_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): @@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): cdef: @@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1bfb66cbf21ac..89020f2078584 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -19,6 +19,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, 
uint16_t, @@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K, ngroups, size ndarray[int64_t] _counts ndarray[float64_t, ndim=2] data + ndarray[intp_t] indexer float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1bbffaa7bb5d2..e402a4b7c0ccc 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -66,13 +66,18 @@ cdef class Factorizer: self.uniques = ObjectVector() self.count = 0 - def get_count(self): + def get_count(self) -> int: return self.count def factorize( self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None - ): + ) -> np.ndarray: """ + + Returns + ------- + np.ndarray[np.intp] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -80,6 +85,9 @@ cdef class Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = ObjectVector() uniques.extend(self.uniques.to_array()) @@ -89,8 +97,6 @@ cdef class Factorizer: mask = (labels == na_sentinel) # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -119,8 +125,12 @@ cdef class Int64Factorizer: return self.count def factorize(self, const int64_t[:] values, sort=False, - na_sentinel=-1, na_value=None): + na_sentinel=-1, na_value=None) -> np.ndarray: """ + Returns + ------- + ndarray[intp_t] + Examples -------- Factorize values with nans replaced by na_sentinel @@ -128,6 +138,9 @@ cdef class Int64Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + cdef: + ndarray[intp_t] labels + if self.uniques.external_view_exists: uniques = Int64Vector() uniques.extend(self.uniques.to_array()) @@ -138,9 +151,6 @@ cdef class Int64Factorizer: # sort on if sort: - if labels.dtype != np.intp: - labels = labels.astype(np.intp) - sorter = self.uniques.to_array().argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0b6bb170cc531..6ace327ca3599 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -539,12 +539,12 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 {{c_type}} val, na_value2 khiter_t k @@ -553,7 +553,7 @@ cdef class {{name}}HashTable(HashTable): uint8_t[:] mask_values if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None @@ -614,7 +614,7 @@ cdef class {{name}}HashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): @@ -633,7 
+633,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = {{name}}Vector() @@ -668,7 +668,7 @@ cdef class {{name}}HashTable(HashTable): ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = {{name}}Vector() @@ -918,12 +918,12 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int64_t[:] uindexer int ret = 0 object val @@ -933,7 +933,7 @@ cdef class StringHashTable(HashTable): bint use_na_value if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -972,13 +972,13 @@ cdef class StringHashTable(HashTable): uindexer[count] = i if return_inverse: self.table.vals[k] = count - labels[i] = count + labels[i] = count count += 1 elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] - labels[i] = idx + labels[i] = idx free(vecs) @@ -987,7 +987,7 @@ cdef class StringHashTable(HashTable): uniques.append(values[uindexer[i]]) if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): @@ -1193,19 +1193,19 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) - int64_t[:] labels + intp_t[:] labels int ret = 0 object val khiter_t k bint use_na_value if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None for i in range(n): @@ -1240,7 +1240,7 @@ cdef class PyObjectHashTable(HashTable): labels[i] = idx if return_inverse: - return uniques.to_array(), np.asarray(labels) + return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): @@ -1259,7 +1259,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() @@ -1292,7 +1292,7 @@ cdef class PyObjectHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp_t] The labels from values to uniques """ uniques_vector = ObjectVector() diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b643c03b6a19..5352ca53e1b54 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -13,14 +13,16 @@ import numpy as np cimport numpy 
as cnp from numpy cimport ( - NPY_INT64, + NPY_INTP, int64_t, + intp_t, ndarray, ) cnp.import_array() from pandas._libs.algos import ensure_int64 + from pandas._libs.util cimport is_integer_object @@ -30,7 +32,7 @@ cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: slice _as_slice - ndarray _as_array # Note: this still allows `None` + ndarray _as_array # Note: this still allows `None`; will be intp_t bint _has_slice, _has_array, _is_known_slice_like def __cinit__(self, val): @@ -53,12 +55,12 @@ cdef class BlockPlacement: self._as_slice = slc self._has_slice = True else: - arr = np.empty(0, dtype=np.int64) + arr = np.empty(0, dtype=np.intp) self._as_array = arr self._has_array = True else: # Cython memoryview interface requires ndarray to be writeable. - arr = np.require(val, dtype=np.int64, requirements='W') + arr = np.require(val, dtype=np.intp, requirements='W') assert arr.ndim == 1, arr.shape self._as_array = arr self._has_array = True @@ -125,8 +127,8 @@ cdef class BlockPlacement: if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) # NOTE: this is the C-optimized equivalent of - # `np.arange(start, stop, step, dtype=np.int64)` - self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) + # `np.arange(start, stop, step, dtype=np.intp)` + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP) self._has_array = True return self._as_array @@ -325,13 +327,13 @@ cdef slice_getitem(slice slc, ind): else: # NOTE: # this is the C-optimized equivalent of - # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` - return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] + # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind] @cython.boundscheck(False) @cython.wraparound(False) -cdef slice indexer_as_slice(int64_t[:] vals): +cdef slice indexer_as_slice(intp_t[:] vals): cdef: Py_ssize_t i, n, start, stop int64_t d diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index a8728050f8071..1af5b23e3393f 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -238,6 +238,8 @@ NODE_CLASSES = {} {{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} + +@cython.internal cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 1b79d68c13570..c2947de943e1a 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -10,6 +10,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -20,6 +21,7 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( + ensure_int64, ensure_platform_int, groupsort_indexer, take_1d_int64_int64, @@ -27,11 +29,12 @@ from pandas._libs.algos import ( @cython.boundscheck(False) -def inner_join(const int64_t[:] left, const int64_t[:] right, +def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -39,8 +42,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, # NA 
group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -78,12 +81,12 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def left_outer_join(const int64_t[:] left, const int64_t[:] right, +def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter - ndarray rev + ndarray[int64_t] left_count, right_count + ndarray[intp_t] rev, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -91,8 +94,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -151,11 +154,12 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) -def full_outer_join(const int64_t[:] left, const int64_t[:] right, +def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc int64_t left_pos = 0, right_pos = 0 @@ -163,8 +167,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) - right_sorter, right_count = groupsort_indexer(right, max_groups) + left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) + right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -213,12 +217,16 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer): +cdef ndarray[int64_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[int64_t] indexer +): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(sorter, indexer, res, -1) + take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) + # FIXME: sorter is intp_t, not int64_t, opposite for indexer; + # will this break on 32bit builds? 
     else:
         # length-0 case
         res = np.empty(len(indexer), dtype=np.int64)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 1ff481553e413..9ef3c859633c2 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -451,7 +451,7 @@ def fast_zip(list ndarrays) -> ndarray[object]:
     return result


-def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length):
+def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
     """
     Reverse indexing operation.
@@ -459,14 +459,25 @@
     indexer_inv[indexer[x]] = x

-    .. note:: If indexer is not unique, only first occurrence is accounted.
+    Parameters
+    ----------
+    indexer : np.ndarray[np.intp]
+    length : int
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+
+    Notes
+    -----
+    If indexer is not unique, only the first occurrence is accounted for.
     """
     cdef:
         Py_ssize_t i, n = len(indexer)
-        ndarray[int64_t] rev_indexer
-        int64_t idx
+        ndarray[intp_t] rev_indexer
+        intp_t idx

-    rev_indexer = np.empty(length, dtype=np.int64)
+    rev_indexer = np.empty(length, dtype=np.intp)
     rev_indexer[:] = -1
     for i in range(n):
         idx = indexer[i]
@@ -808,23 +819,32 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner,

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def get_level_sorter(const int64_t[:] label, const int64_t[:] starts):
+def get_level_sorter(
+    ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
+) -> ndarray:
     """
     Argsort for a single level of a multi-index, keeping the order of higher
     levels unchanged. `starts` points to starts of same-key indices w.r.t
     to leading levels; equivalent to:
-        np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
+        np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
             + starts[i] for i in range(len(starts) - 1)])
+
+    Parameters
+    ----------
+    codes : np.ndarray[int64_t, ndim=1]
+    starts : np.ndarray[intp, ndim=1]
+
+    Returns
+    -------
+    np.ndarray[np.intp, ndim=1]
     """
     cdef:
-        int64_t l, r
-        Py_ssize_t i
-        ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)
-        ndarray[int64_t, ndim=1] label_arr = np.asarray(label)
+        Py_ssize_t i, l, r
+        ndarray[intp_t, ndim=1] out = np.empty(len(codes), dtype=np.intp)

     for i in range(len(starts) - 1):
         l, r = starts[i], starts[i + 1]
-        out[l:r] = l + label_arr[l:r].argsort(kind='mergesort')
+        out[l:r] = l + codes[l:r].argsort(kind='mergesort')

     return out

@@ -1120,6 +1140,7 @@
 except AttributeError:
     pass

+@cython.internal
 cdef class Seen:
     """
     Class for keeping track of the types of elements
@@ -2580,7 +2601,7 @@ def tuples_to_object_array(ndarray[object] tuples):
     return result


-def to_object_array_tuples(rows: object):
+def to_object_array_tuples(rows: object) -> np.ndarray:
     """
     Convert a list of tuples into an object array. Any subclass of
     tuple in `rows` will be cast to tuple.
@@ -2592,7 +2613,7 @@
     Returns
     -------
-    numpy array of the object dtype.
+ np.ndarray[object, ndim=2] """ cdef: Py_ssize_t i, j, n, k, tmp diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 1e51a578c44ea..ecb7041fb2c5a 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -32,7 +32,7 @@ from pandas._libs.util cimport ( @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op): +def scalar_compare(object[:] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. @@ -114,7 +114,7 @@ def scalar_compare(object[:] values, object val, object op): @cython.wraparound(False) @cython.boundscheck(False) -def vec_compare(ndarray[object] left, ndarray[object] right, object op): +def vec_compare(ndarray[object] left, ndarray[object] right, object op) -> ndarray: """ Compare the elements of `left` with the elements of `right` pointwise, with the comparison operation described by `op`. @@ -180,7 +180,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): @cython.wraparound(False) @cython.boundscheck(False) -def scalar_binop(object[:] values, object val, object op): +def scalar_binop(object[:] values, object val, object op) -> ndarray: """ Apply the given binary operator `op` between each element of the array `values` and the scalar `val`. @@ -217,7 +217,7 @@ def scalar_binop(object[:] values, object val, object op): @cython.wraparound(False) @cython.boundscheck(False) -def vec_binop(object[:] left, object[:] right, object op): +def vec_binop(object[:] left, object[:] right, object op) -> ndarray: """ Apply the given binary operator `op` pointwise to the elements of arrays `left` and `right`. diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 75dbb4b74aabd..05b255c40f4b2 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -41,7 +41,7 @@ ctypedef fused reshape_t: @cython.boundscheck(False) def unstack(reshape_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, - reshape_t[:, :] new_values, uint8_t[:, :] new_mask): + reshape_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ Transform long values to wide new_values. @@ -111,7 +111,10 @@ def explode(ndarray[object] values): Returns ------- - tuple(values, counts) + ndarray[object] + result + ndarray[int64_t] + counts """ cdef: Py_ssize_t i, j, count, n diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 2879528b2c501..d86d3261d404e 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -286,7 +286,7 @@ cdef class _NaT(datetime): # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. return "NaT" - def __hash__(self): + def __hash__(self) -> int: return NPY_NAT @property diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 30d9f5e64b282..02bdae3a8dbac 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -90,7 +90,7 @@ def ints_to_pydatetime( object freq=None, bint fold=False, str box="datetime" -): +) -> np.ndarray: """ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. 
@@ -116,7 +116,7 @@ def ints_to_pydatetime( Returns ------- - ndarray of dtype specified by box + ndarray[object] of type specified by box """ cdef: Py_ssize_t i, n = len(arr) @@ -223,7 +223,7 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): return RESO_DAY -def get_resolution(const int64_t[:] stamps, tzinfo tz=None): +def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -332,7 +332,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t @cython.wraparound(False) @cython.boundscheck(False) -def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): +def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: """ Check if all of the given (nanosecond) timestamps are normalized to midnight, i.e. hour == minute == second == 0. If the optional timezone diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 6adda1fe92044..9fbeb67aa35e9 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -77,7 +77,7 @@ def write_csv_rows( @cython.boundscheck(False) @cython.wraparound(False) -def convert_json_to_lines(arr: object) -> str: +def convert_json_to_lines(arr: str) -> str: """ replace comma separated json with line feeds, paying special attention to quotes & brackets diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index c1abd8bbf39d0..ba1b2a0f0e76e 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -117,10 +117,10 @@ def _take_nd_ndarray( ) -> np.ndarray: if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) + indexer = np.arange(arr.shape[axis], dtype=np.intp) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_int64(indexer, copy=False) + indexer = ensure_platform_int(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, out, fill_value, allow_fill ) @@ -317,7 +317,7 @@ def _get_take_nd_function( if func is None: def func(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) _take_nd_object( arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info ) @@ -468,7 +468,7 @@ def wrapper( def _take_nd_object( arr: np.ndarray, - indexer: np.ndarray, + indexer: np.ndarray, # np.ndarray[np.intp] out: np.ndarray, axis: int, fill_value, @@ -544,4 +544,5 @@ def _take_preprocess_indexer_and_fill_value( # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() + indexer = ensure_platform_int(indexer) return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 848e467afb7b6..678e532f05772 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -8,12 +8,16 @@ Type, TypeVar, Union, + cast, ) import numpy as np from pandas._libs import lib -from pandas._typing import Shape +from pandas._typing import ( + F, + Shape, +) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import ( @@ -41,7 +45,7 @@ ) -def ravel_compat(meth): +def ravel_compat(meth: F) -> F: """ Decorator to ravel a 2D array before passing it to a cython operation, then reshape the result to our own shape. 
@@ -58,7 +62,7 @@ def method(self, *args, **kwargs):
         order = "F" if flags.f_contiguous else "C"
         return result.reshape(self.shape, order=order)

-    return method
+    return cast(F, method)


 class NDArrayBackedExtensionArray(ExtensionArray):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 864bd0684d445..769ae52744c74 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -305,7 +305,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     Notes
     -----
     See the `user guide
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
     for more.

     Examples
@@ -1961,7 +1961,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:

         Returns
         -------
-        dict of categories -> indexers
+        Dict[Hashable, np.ndarray[np.intp]]
+            dict of categories -> indexers

         Examples
         --------
@@ -1979,7 +1980,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
         """
         categories = self.categories
         r, counts = libalgos.groupsort_indexer(
-            self.codes.astype("int64"), categories.size
+            self.codes.astype("int64", copy=False), categories.size
         )
         counts = counts.cumsum()
         _result = (r[start:end] for start, end in zip(counts, counts[1:]))
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index ea561dca9a090..1cc0465121335 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -154,7 +154,7 @@
 Notes
 -----
 See the `user guide
-<https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#intervalindex>`_
+<https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#intervalindex>`__
 for more.

 %(examples)s\
diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
index 0dbe5e8d83741..4f14ea73d5a88 100644
--- a/pandas/core/computation/expressions.py
+++ b/pandas/core/computation/expressions.py
@@ -40,7 +40,7 @@
 }

 # the minimum prod shape that we will use numexpr
-_MIN_ELEMENTS = 10000
+_MIN_ELEMENTS = 1_000_000


 def set_use_numexpr(v=True):
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 7a2d6468f1b63..32ea82d9c0402 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1100,7 +1100,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool:

 # This exists to silence numpy deprecation warnings, see GH#29553
-def is_numeric_v_string_like(a, b):
+def is_numeric_v_string_like(a: ArrayLike, b):
     """
     Check if we are comparing a string-like object to a numeric ndarray.
     NumPy doesn't like to compare such objects, especially numeric arrays
@@ -1108,7 +1108,7 @@

     Parameters
     ----------
-    a : array-like, scalar
+    a : array-like
         The first object to check.
     b : array-like, scalar
         The second object to check.
@@ -1120,16 +1120,8 @@

     Examples
     --------
-    >>> is_numeric_v_string_like(1, 1)
-    False
-    >>> is_numeric_v_string_like("foo", "foo")
-    False
-    >>> is_numeric_v_string_like(1, "foo")  # non-array numeric
-    False
     >>> is_numeric_v_string_like(np.array([1]), "foo")
     True
-    >>> is_numeric_v_string_like("foo", np.array([1]))  # symmetric check
-    True
     >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"]))
     True
     >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2]))
@@ -1142,17 +1134,15 @@
     is_a_array = isinstance(a, np.ndarray)
     is_b_array = isinstance(b, np.ndarray)

-    is_a_numeric_array = is_a_array and is_numeric_dtype(a)
-    is_b_numeric_array = is_b_array and is_numeric_dtype(b)
-    is_a_string_array = is_a_array and is_string_like_dtype(a)
-    is_b_string_array = is_b_array and is_string_like_dtype(b)
+    is_a_numeric_array = is_a_array and a.dtype.kind in ("u", "i", "f", "c", "b")
+    is_b_numeric_array = is_b_array and b.dtype.kind in ("u", "i", "f", "c", "b")
+    is_a_string_array = is_a_array and a.dtype.kind in ("S", "U")
+    is_b_string_array = is_b_array and b.dtype.kind in ("S", "U")

-    is_a_scalar_string_like = not is_a_array and isinstance(a, str)
     is_b_scalar_string_like = not is_b_array and isinstance(b, str)

     return (
         (is_a_numeric_array and is_b_scalar_string_like)
-        or (is_b_numeric_array and is_a_scalar_string_like)
         or (is_a_numeric_array and is_b_string_array)
         or (is_b_numeric_array and is_a_string_array)
     )
@@ -1305,37 +1295,6 @@ def is_numeric_dtype(arr_or_dtype) -> bool:
     )


-def is_string_like_dtype(arr_or_dtype) -> bool:
-    """
-    Check whether the provided array or dtype is of a string-like dtype.
-
-    Unlike `is_string_dtype`, the object dtype is excluded because it
-    is a mixed dtype.
-
-    Parameters
-    ----------
-    arr_or_dtype : array-like
-        The array or dtype to check.
-
-    Returns
-    -------
-    boolean
-        Whether or not the array or dtype is of the string dtype.
-
-    Examples
-    --------
-    >>> is_string_like_dtype(str)
-    True
-    >>> is_string_like_dtype(object)
-    False
-    >>> is_string_like_dtype(np.array(['a', 'b']))
-    True
-    >>> is_string_like_dtype(pd.Series([1, 2]))
-    False
-    """
-    return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U"))
-
-
 def is_float_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of a float dtype.
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 59d6f9a51ed43..8c2cff21c114e 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -35,7 +35,6 @@
     is_object_dtype,
     is_scalar,
     is_string_dtype,
-    is_string_like_dtype,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
@@ -258,7 +257,7 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> np.ndarray:
     dtype = values.dtype
     shape = values.shape

-    if is_string_like_dtype(dtype):
+    if dtype.kind in ("S", "U"):
         result = np.zeros(values.shape, dtype=bool)
     else:
         result = np.empty(shape, dtype=bool)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 7138995d1b018..c20b2840a40ab 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7958,7 +7958,7 @@ def resample(
         Notes
         -----
         See the `user guide
-        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`_
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
         for more.
        To learn more about the offset strings, please see `this link
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 74e96015b4544..a222a8cc464fb 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -991,10 +991,10 @@ def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0):
     @cache_readonly
     def slabels(self):
         # Sorted labels
-        return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False)
+        return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False)

     @cache_readonly
-    def sort_idx(self):
+    def _sort_idx(self) -> np.ndarray:  # np.ndarray[np.intp]
         # Counting sort indexer
         return get_group_index_sorter(self.labels, self.ngroups)

@@ -1013,7 +1013,7 @@ def __iter__(self):

     @cache_readonly
     def sorted_data(self) -> FrameOrSeries:
-        return self.data.take(self.sort_idx, axis=self.axis)
+        return self.data.take(self._sort_idx, axis=self.axis)

     def _chop(self, sdata, slice_obj: slice) -> NDFrame:
         raise AbstractMethodError(self)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 3a468758ab3fd..e8b83af16254a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4135,13 +4135,22 @@ def _join_level(
         """
         from pandas.core.indexes.multi import MultiIndex

-        def _get_leaf_sorter(labels):
+        def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
             """
             Returns sorter for the innermost level while preserving the
             order of higher levels.
+
+            Parameters
+            ----------
+            labels : list[np.ndarray]
+                Each ndarray has signed integer dtype, not necessarily identical.
+
+            Returns
+            -------
+            np.ndarray[np.intp]
             """
             if labels[0].size == 0:
-                return np.empty(0, dtype="int64")
+                return np.empty(0, dtype=np.intp)

             if len(labels) == 1:
                 return get_group_index_sorter(labels[0])
@@ -4154,7 +4163,7 @@ def _get_leaf_sorter(labels):
             starts = np.hstack(([True], tic, [True])).nonzero()[0]
             lab = ensure_int64(labels[-1])
-            return lib.get_level_sorter(lab, ensure_int64(starts))
+            return lib.get_level_sorter(lab, ensure_platform_int(starts))

         if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
             raise TypeError("Join on level between two MultiIndex objects is ambiguous")
@@ -4189,12 +4198,12 @@ def _get_leaf_sorter(labels):
                 join_index = left[left_indexer]

         else:
-            left_lev_indexer = ensure_int64(left_lev_indexer)
+            left_lev_indexer = ensure_platform_int(left_lev_indexer)
             rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
             old_codes = left.codes[level]
-            new_lev_codes = algos.take_nd(
-                rev_indexer, old_codes[old_codes != -1], allow_fill=False
-            )
+
+            taker = old_codes[old_codes != -1]
+            new_lev_codes = rev_indexer.take(taker)

             new_codes = list(left.codes)
             new_codes[level] = new_lev_codes
@@ -4204,6 +4213,7 @@
             if keep_order:  # just drop missing values. o.w. keep order
                 left_indexer = np.arange(len(left), dtype=np.intp)
+                left_indexer = cast(np.ndarray, left_indexer)
                 mask = new_lev_codes != -1
                 if not mask.all():
                     new_codes = [lab[mask] for lab in new_codes]
@@ -4213,11 +4223,12 @@
                 if level == 0:  # outer most level, take the fast route
                     ngroups = 1 + new_lev_codes.max()
                     left_indexer, counts = libalgos.groupsort_indexer(
-                        new_lev_codes, ngroups
+                        ensure_int64(new_lev_codes), ngroups
                     )

                     # missing values are placed first; drop them!
-                    left_indexer = left_indexer[counts[0] :]
+                    # error: Value of type "Optional[ndarray]" is not indexable
+                    left_indexer = left_indexer[counts[0] :]  # type: ignore[index]
                     new_codes = [lab[left_indexer] for lab in new_codes]

                 else:  # sort the leaves
@@ -6130,15 +6141,14 @@ def _maybe_disable_logical_methods(self, opname: str_t):
             # This call will raise
             make_invalid_op(opname)(self)

+    @final
     @property
     def shape(self) -> Shape:
         """
         Return a tuple of the shape of the underlying data.
         """
-        # not using "(len(self), )" to return "correct" shape if the values
-        # consists of a >1 D array (see GH-27775)
-        # overridden in MultiIndex.shape to avoid materializing the values
-        return self._values.shape
+        # See GH#27775, GH#27384 for history/reasoning in how this is defined.
+        return (len(self),)


 def ensure_index_from_sequences(sequences, names=None):
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index f372db5287604..c9c39fde1da46 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -150,7 +150,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate):
     Notes
     -----
     See the `user guide
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`_
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__
     for more.

     Examples
@@ -542,13 +542,6 @@ def _convert_list_indexer(self, keyarr):

         return self.get_indexer_for(keyarr)

-    @doc(Index._maybe_cast_slice_bound)
-    def _maybe_cast_slice_bound(self, label, side: str, kind):
-        if kind == "loc":
-            return label
-
-        return super()._maybe_cast_slice_bound(label, side, kind)
-
     # --------------------------------------------------------------------

     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 97492f35232e3..244fcb9f49ec6 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -267,7 +267,7 @@ class MultiIndex(Index):
     Notes
     -----
     See the `user guide
-    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#hierarchical-indexing-multiindex>`_
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#hierarchical-indexing-multiindex>`__
     for more.

     Examples
@@ -752,15 +752,6 @@ def dtypes(self) -> Series:
             }
         )

-    @property
-    def shape(self) -> Shape:
-        """
-        Return a tuple of the shape of the underlying data.
- """ - # overriding the base Index.shape definition to avoid materializing - # the values (GH-27384, GH-27775) - return (len(self),) - def __len__(self) -> int: return len(self.codes[0]) @@ -1952,7 +1943,7 @@ def _sort_levels_monotonic(self) -> MultiIndex: lev = lev.take(indexer) # indexer to reorder the level codes - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) level_codes = algos.take_nd(ri, level_codes) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 456d87766bdb7..f37faa4ab844b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -450,7 +450,7 @@ def take( **kwargs, ) - def tolist(self): + def tolist(self) -> list[int]: return list(self._range) @doc(Int64Index.__iter__) @@ -494,13 +494,13 @@ def _minmax(self, meth: str): return self.start + self.step * no_steps - def min(self, axis=None, skipna=True, *args, **kwargs) -> int: + def min(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: """The minimum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_min(args, kwargs) return self._minmax("min") - def max(self, axis=None, skipna=True, *args, **kwargs) -> int: + def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: """The maximum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_max(args, kwargs) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a417cd0e06872..34b3d83c066c2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -88,6 +88,7 @@ from pandas.core.internals.blocks import ( ensure_block_shape, new_block, + to_native_types, ) if TYPE_CHECKING: @@ -634,7 +635,7 @@ def replace_list( ) def to_native_types(self, **kwargs): - return self.apply_with_block("to_native_types", **kwargs) + return self.apply(to_native_types, **kwargs) @property def is_mixed_type(self) -> bool: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3fd1ebaca19f0..99e54bace8915 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -97,6 +97,7 @@ FloatingArray, IntegerArray, PandasArray, + TimedeltaArray, ) from pandas.core.base import PandasObject import pandas.core.common as com @@ -260,9 +261,11 @@ def get_block_values_for_json(self) -> np.ndarray: # TODO(EA2D): reshape will be unnecessary with 2D EAs return np.asarray(self.values).reshape(self.shape) + @final @property def fill_value(self): - return np.nan + # Used in reindex_indexer + return na_value_for_dtype(self.dtype, compat=False) @property def mgr_locs(self) -> BlockPlacement: @@ -652,24 +655,11 @@ def should_store(self, value: ArrayLike) -> bool: """ return is_dtype_equal(value.dtype, self.dtype) + @final def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format """ - values = self.values - - mask = isna(values) - itemsize = writers.word_len(na_rep) - - if not self.is_object and not quoting and itemsize: - values = values.astype(str) - if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: - # enlarge for the na_rep - values = values.astype(f" np.ndarray: def array_values(self) -> ExtensionArray: return self.values - def to_native_types(self, na_rep="nan", quoting=None, **kwargs): - """override to use ExtensionArray astype for the conversion""" - values = self.values - mask = isna(values) - - new_values = np.asarray(values.astype(object)) - 
new_values[mask] = na_rep - return self.make_block(new_values) - def take_nd( self, indexer, @@ -1808,41 +1784,6 @@ def is_bool(self): class FloatBlock(NumericBlock): __slots__ = () - def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs - ): - """ convert to our native types format """ - values = self.values - - # see gh-13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == ".": - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype="object") - - values[mask] = na_rep - values = values.astype(object, copy=False) - return self.make_block(values) - - from pandas.io.formats.format import FloatArrayFormatter - - formatter = FloatArrayFormatter( - values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - res = formatter.get_result_as_array() - res = res.astype(object, copy=False) - return self.make_block(res) - class NDArrayBackedExtensionBlock(HybridMixin, Block): """ @@ -1962,18 +1903,6 @@ def array_values(self): def _holder(self): return type(self.array_values()) - @property - def fill_value(self): - return na_value_for_dtype(self.dtype) - - def to_native_types(self, na_rep="NaT", **kwargs): - """ convert to our native types format """ - arr = self.array_values() - - result = arr._format_native_types(na_rep=na_rep, **kwargs) - result = result.astype(object, copy=False) - return self.make_block(result) - class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -1999,7 +1928,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element - to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff where = DatetimeBlock.where putmask = DatetimeLikeBlockMixin.putmask @@ -2316,3 +2244,75 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # We can't, and don't need to, reshape. 
         values = np.asarray(values).reshape(1, -1)

     return values
+
+
+def to_native_types(
+    values: ArrayLike,
+    *,
+    na_rep="nan",
+    quoting=None,
+    float_format=None,
+    decimal=".",
+    **kwargs,
+) -> np.ndarray:
+    """ convert to our native types format """
+    values = ensure_wrapped_if_datetimelike(values)
+
+    if isinstance(values, (DatetimeArray, TimedeltaArray)):
+        result = values._format_native_types(na_rep=na_rep, **kwargs)
+        result = result.astype(object, copy=False)
+        return result
+
+    elif isinstance(values, ExtensionArray):
+        mask = isna(values)
+
+        new_values = np.asarray(values.astype(object))
+        new_values[mask] = na_rep
+        return new_values
+
+    elif values.dtype.kind == "f":
+        # see GH#13418: no special formatting is desired at the
+        # output (important for appropriate 'quoting' behaviour),
+        # so do not pass it through the FloatArrayFormatter
+        if float_format is None and decimal == ".":
+            mask = isna(values)
+
+            if not quoting:
+                values = values.astype(str)
+            else:
+                values = np.array(values, dtype="object")
+
+            values[mask] = na_rep
+            values = values.astype(object, copy=False)
+            return values
+
+        from pandas.io.formats.format import FloatArrayFormatter
+
+        formatter = FloatArrayFormatter(
+            values,
+            na_rep=na_rep,
+            float_format=float_format,
+            decimal=decimal,
+            quoting=quoting,
+            fixed_width=False,
+        )
+        res = formatter.get_result_as_array()
+        res = res.astype(object, copy=False)
+        return res
+
+    else:
+
+        mask = isna(values)
+        itemsize = writers.word_len(na_rep)
+
+        if values.dtype != _dtype_obj and not quoting and itemsize:
+            values = values.astype(str)
+            if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
+                # enlarge for the na_rep
+                values = values.astype(f"<U{itemsize}")
+        else:
+            values = np.array(values, dtype="object")
+
+        values[mask] = na_rep
+        return values
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ ... @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
     """
     # NB: We assume extract_array has already been called on left and right
     lvalues = ensure_wrapped_if_datetimelike(left)
-    rvalues = right
+    rvalues = ensure_wrapped_if_datetimelike(right)

     rvalues = lib.item_from_zerodim(rvalues)
     if isinstance(rvalues, list):
@@ -267,10 +266,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

     else:
-        with warnings.catch_warnings():
-            # suppress warnings from numpy about element-wise comparison
-            warnings.simplefilter("ignore", DeprecationWarning)
-            res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
+        res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)

     return res_values
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index c01bf3931b27a..3c1279d62b126 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1973,7 +1973,7 @@ def _get_single_indexer(join_key, index, sort: bool = False):
     left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

     left_indexer, right_indexer = libjoin.left_outer_join(
-        ensure_int64(left_key), ensure_int64(right_key), count, sort=sort
+        left_key, right_key, count, sort=sort
     )

     return left_indexer, right_indexer
@@ -2029,9 +2029,9 @@ def _factorize_keys(

     Returns
     -------
-    array
+    np.ndarray[np.intp]
         Left (resp. right if called with `key='right'`) labels, as enumerated type.
-    array
+    np.ndarray[np.intp]
         Right (resp. left if called with `key='right'`) labels, as enumerated type.
     int
         Number of unique elements in union of left and right labels.
@@ -2117,6 +2117,8 @@ def _factorize_keys(
 
     llab = rizer.factorize(lk)
     rlab = rizer.factorize(rk)
+    assert llab.dtype == np.intp, llab.dtype
+    assert rlab.dtype == np.intp, rlab.dtype
 
     count = rizer.get_count()
 
@@ -2142,13 +2144,16 @@ def _factorize_keys(
     return llab, rlab, count
 
 
-def _sort_labels(uniques: np.ndarray, left, right):
+def _sort_labels(
+    uniques: np.ndarray, left: np.ndarray, right: np.ndarray
+) -> tuple[np.ndarray, np.ndarray]:
+    # Both returned ndarrays are np.intp
 
     llength = len(left)
     labels = np.concatenate([left, right])
 
     _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
-    new_labels = ensure_int64(new_labels)
+    assert new_labels.dtype == np.intp
     new_left, new_right = new_labels[:llength], new_labels[llength:]
 
     return new_left, new_right
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 6a0286b1c40ef..613669b8cc1d8 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -145,7 +145,7 @@ def _indexer_and_to_sort(self):
         ngroups = len(obs_ids)
 
         indexer = get_group_index_sorter(comp_index, ngroups)
-
+        indexer = ensure_platform_int(indexer)
         return indexer, to_sort
 
     @cache_readonly
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 49eb87a3bc8ba..66d84ef85880c 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -143,7 +143,7 @@
 Notes
 -----
 See the `user guide
-`_ for more.
+`__ for more.
 """
 
 _shared_docs[
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 720643d3d98aa..3aa4d26f7dc8f 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -582,6 +582,16 @@ def get_group_index_sorter(
     Both algorithms are `stable` sort and that is necessary for correctness of
     groupby operations. e.g. consider:
         df.groupby(key)[col].transform('first')
+
+    Parameters
+    ----------
+    group_index : np.ndarray
+        signed integer dtype
+    ngroups : int or None, default None
+
+    Returns
+    -------
+    np.ndarray[np.intp]
     """
     if ngroups is None:
         # error: Incompatible types in assignment (expression has type "number[Any]",
@@ -596,9 +606,10 @@ def get_group_index_sorter(
     )
     if do_groupsort:
         sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
-        return ensure_platform_int(sorter)
+        # sorter _should_ already be intp, but mypy is not yet able to verify
     else:
-        return group_index.argsort(kind="mergesort")
+        sorter = group_index.argsort(kind="mergesort")
+    return ensure_platform_int(sorter)
 
 
 def compress_group_index(group_index, sort: bool = True):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 87be5c0997072..375901bc3fb58 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -56,7 +56,7 @@ def combine_hash_arrays(arrays: Iterator[np.ndarray], num_items: int) -> np.ndar
 
     Returns
     -------
-    np.ndarray[int64]
+    np.ndarray[uint64]
         Should be the same as CPython's tupleobject.c
     """
@@ -184,7 +184,7 @@ def hash_tuples(
 
     Returns
    -------
-    ndarray of hashed values array
+    ndarray[np.uint64] of hashed values
     """
     if not is_list_like(vals):
         raise TypeError("must be convertible to a list-of-tuples")
@@ -227,7 +227,7 @@ def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndar
 
     Returns
     -------
-    ndarray of hashed values array, same size as len(c)
+    ndarray[np.uint64] of hashed values, same size as len(c)
     """
     # Convert ExtensionArrays to ndarrays
     values = np.asarray(cat.categories._values)
@@ -274,7 +274,8 @@ def hash_array(
 
     Returns
     -------
-    1d uint64 numpy array of hash values, same length as the vals
+    ndarray[np.uint64, ndim=1]
+        Hashed values, same length as the vals.
     """
     if not hasattr(vals, "dtype"):
         raise TypeError("must pass a ndarray-like")
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 3abb39d2194c0..5ec2141028fa4 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -217,8 +217,10 @@ def set_tooltips(
         css_class: Optional[str] = None,
     ) -> Styler:
         """
-        Add string based tooltips that will appear in the `Styler` HTML result. These
-        tooltips are applicable only to `<td>` elements.
+        Set the DataFrame of strings on ``Styler`` generating ``:hover`` tooltips.
+
+        These string based tooltips are only applicable to ``<td>`` HTML elements,
+        and cannot be used for column or index headers.
 
         .. versionadded:: 1.3.0
 
@@ -227,7 +229,7 @@ def set_tooltips(
         ttips : DataFrame
             DataFrame containing strings that will be translated to tooltips,
             mapped by identical column and index values that must exist on the
-            `Styler` data. None, NaN values, and empty strings will be ignored and
+            Styler data. None, NaN values, and empty strings will be ignored and
             not affect the rendered HTML.
         props : list-like or str, optional
             List of (attr, value) tuples or a valid CSS string. If ``None`` adopts
@@ -671,21 +673,33 @@ def format(
 
     def set_td_classes(self, classes: DataFrame) -> Styler:
         """
-        Add string based CSS class names to data cells that will appear within the
-        `Styler` HTML result. These classes are added within specified `<td>` elements.
+        Set the DataFrame of strings added to the ``class`` attribute of ``<td>``
+        HTML elements.
 
         Parameters
         ----------
         classes : DataFrame
             DataFrame containing strings that will be translated to CSS classes,
-            mapped by identical column and index values that must exist on the
-            underlying `Styler` data. None, NaN values, and empty strings will
+            mapped by identical column and index key values that must exist on the
+            underlying Styler data. None, NaN values, and empty strings will
             be ignored and not affect the rendered HTML.
 
         Returns
         -------
         self : Styler
 
+        See Also
+        --------
+        Styler.set_table_styles: Set the table styles included within the ``<style>``
+            HTML element.
+        Styler.set_table_attributes: Set the table attributes added to the ``<table>``
+            HTML element.
+
+        Notes
+        -----
+        Can be used in combination with ``Styler.set_table_styles`` to define an
+        internal CSS solution without reference to external CSS files.
+
         Examples
         --------
         >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=[["A", "B", "C"]])
@@ ... @@ def set_td_classes(self, classes: DataFrame) -> Styler:
         >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css)
         >>> s.hide_index().render()
-        '<style  type="text/css" ></style>'
-        '<table id="T__" >'
+        '<style type="text/css"></style>'
+        '<table id="T__">'
         '  <thead>'
         '    <tr><th class="col_heading level0 col0" >0</th></tr>'
         '  </thead>'
         '  <tbody>'
-        '    <tr><td  class="data row0 col0 other-class" >1</td></tr>'
+        '    <tr><td class="data row0 col0 other-class" >1</td></tr>'
         '  </tbody>'
         '</table>'
         """
@@ -736,7 +750,7 @@ def set_td_classes(self, classes: DataFrame) -> Styler:
 
     def render(self, **kwargs) -> str:
         """
-        Render the built up styles to HTML.
+        Render the ``Styler`` including all applied styles to HTML.
 
         Parameters
         ----------
@@ -753,7 +767,7 @@ def render(self, **kwargs) -> str:
 
         Notes
         -----
-        ``Styler`` objects have defined the ``_repr_html_`` method
+        Styler objects have defined the ``_repr_html_`` method
         which automatically calls ``self.render()`` when it's the
         last item in a Notebook cell. When calling ``Styler.render()``
         directly, wrap the result in ``IPython.display.HTML`` to view
@@ -779,7 +793,7 @@ def render(self, **kwargs) -> str:
 
     def _update_ctx(self, attrs: DataFrame) -> None:
         """
-        Update the state of the Styler for data cells.
+        Update the state of the ``Styler`` for data cells.
 
         Collects a mapping of {index_label: [('<property>', '<value>'), ..]}.
 
@@ -839,7 +853,7 @@ def __deepcopy__(self, memo) -> Styler:
 
     def clear(self) -> None:
        """
-        Reset the styler, removing any previously applied styles.
+        Reset the ``Styler``, removing any previously applied styles.
 
         Returns None.
         """
@@ -923,10 +937,11 @@ def apply(
         Parameters
         ----------
         func : function
-            ``func`` should take a Series or DataFrame (depending
-            on ``axis``), and return an object with the same shape.
-            Must return a DataFrame with identical index and
-            column labels or an ndarray with same shape as input when ``axis=None``.
+            ``func`` should take a Series if ``axis`` in [0,1] and return an object
+            of same length, also with identical index if the object is a Series.
+            ``func`` should take a DataFrame if ``axis`` is ``None`` and return either
+            an ndarray with the same shape or a DataFrame with identical columns and
+            index.
 
             .. versionchanged:: 1.3.0
 
@@ -944,13 +959,16 @@ def apply(
         -------
         self : Styler
 
+        See Also
+        --------
+        Styler.where: Apply CSS-styles based on a conditional function elementwise.
+        Styler.applymap: Apply a CSS-styling function elementwise.
+
         Notes
         -----
-        The output of ``func`` should be elements having CSS style as string or,
+        The elements of the output of ``func`` should be CSS styles as strings, in the
+        format 'attribute: value; attribute2: value2; ...' or,
         if nothing is to be applied to that element, an empty string or ``None``.
-        The output shape must match the input, i.e. if
-        ``x`` is the input row, column, or table (depending on ``axis``),
-        then ``func(x).shape == x.shape`` should be ``True``.
 
         This is similar to ``DataFrame.apply``, except that ``axis=None``
         applies the function to the entire DataFrame at once,
@@ -1001,13 +1019,14 @@ def applymap(self, func: Callable, subset=None, **kwargs) -> Styler:
 
         See Also
         --------
-        Styler.where: Updates the HTML representation with a style which is
-            selected in accordance with the return value of a function.
+        Styler.where: Apply CSS-styles based on a conditional function elementwise.
+        Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise.
 
         Notes
         -----
-        The output of ``func`` should be a CSS style as string or, if nothing is to be
-        applied, an empty string or ``None``.
+        The elements of the output of ``func`` should be CSS styles as strings, in the
+        format 'attribute: value; attribute2: value2; ...' or,
+        if nothing is to be applied to that element, an empty string or ``None``.
 
         Examples
         --------
@@ -1030,7 +1049,7 @@ def where(
         **kwargs,
     ) -> Styler:
         """
-        Apply a function elementwise.
+        Apply CSS-styles based on a conditional function elementwise.
 
         Updates the HTML representation with a style which is selected in
         accordance with the return value of a function.
@@ -1055,7 +1074,15 @@ def where(
 
         See Also
         --------
-        Styler.applymap: Updates the HTML representation with the result.
+        Styler.applymap: Apply a CSS-styling function elementwise.
+        Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise.
+
+        Examples
+        --------
+        >>> def cond(v):
+        ...     return v > 1 and v != 4
+        >>> df = pd.DataFrame([[1, 2], [3, 4]])
+        >>> df.style.where(cond, value='color:red;', other='font-size:2em;')
         """
         if other is None:
             other = ""
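The new ``where`` example documents what the method has always been: sugar for
an ``applymap`` over a boolean predicate. Assuming the same ``cond`` as in the
docstring above, these two produce identical cell styles:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]])

    def cond(v):
        return v > 1 and v != 4

    s1 = df.style.where(cond, value="color:red;", other="font-size:2em;")
    s2 = df.style.applymap(lambda v: "color:red;" if cond(v) else "font-size:2em;")
    # s1 and s2 render the same styles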
@@ -1092,10 +1119,9 @@ def set_precision(self, precision: int) -> Styler:
 
     def set_table_attributes(self, attributes: str) -> Styler:
         """
-        Set the table attributes.
+        Set the table attributes added to the ``<table>`` HTML element.
 
-        These are the items that show up in the opening ``<table>`` tag
-        in addition to automatic (by default) id.
+        These are items in addition to automatic (by default) ``id`` attribute.
 
         Parameters
         ----------
@@ -1105,6 +1131,13
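For context on the reworded ``set_table_attributes`` docstring, a typical call
(public API, not patch code) places the given string inside the opening
``<table>`` tag, alongside the generated ``id``:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]])
    styler = df.style.set_table_attributes('class="pure-table"')
    # rendered opening tag becomes: <table id="T_..." class="pure-table">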