From 687a0ce3b2594622f9cfd877c709e797639f3ddc Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 9 Oct 2020 10:47:24 -0500 Subject: [PATCH 1/4] BLD: remove blockslider #34014 --- pandas/_libs/reduction.pyx | 110 ++++--------------------------------- 1 file changed, 12 insertions(+), 98 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 3a0fda5aed620..3702449e803a4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,9 +1,5 @@ from copy import copy -from cython import Py_ssize_t - -from libc.stdlib cimport free, malloc - import numpy as np cimport numpy as cnp @@ -11,14 +7,14 @@ from numpy cimport int64_t, ndarray cnp.import_array() -from pandas._libs cimport util +from pandas._libs.util cimport is_array from pandas._libs.lib import is_scalar, maybe_convert_objects cpdef check_result_array(object obj, Py_ssize_t cnt): - if (util.is_array(obj) or + if (is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Must produce aggregated value') @@ -33,7 +29,7 @@ cdef class _BaseGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: + if is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values @@ -106,7 +102,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -204,7 +200,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -288,9 +284,9 @@ cpdef inline extract_result(object res, bint squeeze=True): res = res._values if squeeze and res.ndim == 1 and len(res) == 1: res = res[0] - if hasattr(res, 'values') and util.is_array(res.values): + if hasattr(res, 'values') and is_array(res.values): res = res.values - if util.is_array(res): + if is_array(res): if res.ndim == 0: res = res.item() elif squeeze and res.ndim == 1 and len(res) == 1: @@ -304,7 +300,7 @@ cdef class Slider: """ cdef: ndarray values, buf - Py_ssize_t stride, orig_len, orig_stride + Py_ssize_t stride, orig_len char *orig_data def __init__(self, ndarray values, ndarray buf): @@ -320,7 +316,6 @@ cdef class Slider: self.orig_data = self.buf.data self.orig_len = self.buf.shape[0] - self.orig_stride = self.buf.strides[0] self.buf.data = self.values.data self.buf.strides[0] = self.stride @@ -334,9 +329,8 @@ cdef class Slider: cdef reset(self): - self.buf.shape[0] = self.orig_len self.buf.data = self.orig_data - self.buf.strides[0] = self.orig_stride + self.buf.shape[0] = self.orig_len class InvalidApply(Exception): @@ -346,7 +340,6 @@ class InvalidApply(Exception): def apply_frame_axis0(object frame, object f, object names, const int64_t[:] starts, const int64_t[:] ends): cdef: - BlockSlider slider Py_ssize_t i, n = len(starts) list results object piece @@ -357,16 +350,13 @@ def apply_frame_axis0(object frame, object f, object names, results = [] - slider = BlockSlider(frame) - mutated = False - item_cache = slider.dummy._item_cache + item_cache = frame._item_cache try: for i in range(n): - slider.move(starts[i], ends[i]) item_cache.clear() # ugh - chunk = slider.dummy + chunk = frame[starts[i]:ends[i]] object.__setattr__(chunk, 'name', names[i]) try: @@ -399,82 +389,6 @@ def apply_frame_axis0(object frame, object f, object names, if require_slow_apply: break finally: - slider.reset() + pass return results, mutated - - -cdef class BlockSlider: - """ - Only capable of sliding on axis=0 - """ - - cdef public: - object frame, dummy, index - int nblocks - Slider idx_slider - list blocks - - cdef: - char **base_ptrs - - def __init__(self, object frame): - cdef: - Py_ssize_t i - object b - - self.frame = frame - self.dummy = frame[:0] - self.index = self.dummy.index - - self.blocks = [b.values for b in self.dummy._mgr.blocks] - - for x in self.blocks: - util.set_array_not_contiguous(x) - - self.nblocks = len(self.blocks) - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference to a 1-d - # ndarray like datetime / timedelta / period. - self.idx_slider = Slider( - self.frame.index._index_data, self.dummy.index._index_data) - - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) - for i, block in enumerate(self.blocks): - self.base_ptrs[i] = (block).data - - def __dealloc__(self): - free(self.base_ptrs) - - cdef move(self, int start, int end): - cdef: - ndarray arr - Py_ssize_t i - - # move blocks - for i in range(self.nblocks): - arr = self.blocks[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] + arr.strides[1] * start - arr.shape[1] = end - start - - # move and set the index - self.idx_slider.move(start, end) - - object.__setattr__(self.index, '_index_data', self.idx_slider.buf) - self.index._engine.clear_mapping() - self.index._cache.clear() # e.g. inferred_freq must go - - cdef reset(self): - cdef: - ndarray arr - Py_ssize_t i - - # reset blocks - for i in range(self.nblocks): - arr = self.blocks[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] - arr.shape[1] = 0 From fc1fe49699415f8aace64beca72e5e689b12a31b Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 9 Oct 2020 10:48:55 -0500 Subject: [PATCH 2/4] Revert "BLD: remove blockslider #34014" This reverts commit 687a0ce3b2594622f9cfd877c709e797639f3ddc. --- pandas/_libs/reduction.pyx | 110 +++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 3702449e803a4..3a0fda5aed620 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,5 +1,9 @@ from copy import copy +from cython import Py_ssize_t + +from libc.stdlib cimport free, malloc + import numpy as np cimport numpy as cnp @@ -7,14 +11,14 @@ from numpy cimport int64_t, ndarray cnp.import_array() -from pandas._libs.util cimport is_array +from pandas._libs cimport util from pandas._libs.lib import is_scalar, maybe_convert_objects cpdef check_result_array(object obj, Py_ssize_t cnt): - if (is_array(obj) or + if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Must produce aggregated value') @@ -29,7 +33,7 @@ cdef class _BaseGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if is_array(values) and not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values @@ -102,7 +106,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.f = f values = series.values - if is_array(values) and not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -200,7 +204,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.f = f values = series.values - if is_array(values) and not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -284,9 +288,9 @@ cpdef inline extract_result(object res, bint squeeze=True): res = res._values if squeeze and res.ndim == 1 and len(res) == 1: res = res[0] - if hasattr(res, 'values') and is_array(res.values): + if hasattr(res, 'values') and util.is_array(res.values): res = res.values - if is_array(res): + if util.is_array(res): if res.ndim == 0: res = res.item() elif squeeze and res.ndim == 1 and len(res) == 1: @@ -300,7 +304,7 @@ cdef class Slider: """ cdef: ndarray values, buf - Py_ssize_t stride, orig_len + Py_ssize_t stride, orig_len, orig_stride char *orig_data def __init__(self, ndarray values, ndarray buf): @@ -316,6 +320,7 @@ cdef class Slider: self.orig_data = self.buf.data self.orig_len = self.buf.shape[0] + self.orig_stride = self.buf.strides[0] self.buf.data = self.values.data self.buf.strides[0] = self.stride @@ -329,8 +334,9 @@ cdef class Slider: cdef reset(self): - self.buf.data = self.orig_data self.buf.shape[0] = self.orig_len + self.buf.data = self.orig_data + self.buf.strides[0] = self.orig_stride class InvalidApply(Exception): @@ -340,6 +346,7 @@ class InvalidApply(Exception): def apply_frame_axis0(object frame, object f, object names, const int64_t[:] starts, const int64_t[:] ends): cdef: + BlockSlider slider Py_ssize_t i, n = len(starts) list results object piece @@ -350,13 +357,16 @@ def apply_frame_axis0(object frame, object f, object names, results = [] + slider = BlockSlider(frame) + mutated = False - item_cache = frame._item_cache + item_cache = slider.dummy._item_cache try: for i in range(n): + slider.move(starts[i], ends[i]) item_cache.clear() # ugh - chunk = frame[starts[i]:ends[i]] + chunk = slider.dummy object.__setattr__(chunk, 'name', names[i]) try: @@ -389,6 +399,82 @@ def apply_frame_axis0(object frame, object f, object names, if require_slow_apply: break finally: - pass + slider.reset() return results, mutated + + +cdef class BlockSlider: + """ + Only capable of sliding on axis=0 + """ + + cdef public: + object frame, dummy, index + int nblocks + Slider idx_slider + list blocks + + cdef: + char **base_ptrs + + def __init__(self, object frame): + cdef: + Py_ssize_t i + object b + + self.frame = frame + self.dummy = frame[:0] + self.index = self.dummy.index + + self.blocks = [b.values for b in self.dummy._mgr.blocks] + + for x in self.blocks: + util.set_array_not_contiguous(x) + + self.nblocks = len(self.blocks) + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference to a 1-d + # ndarray like datetime / timedelta / period. + self.idx_slider = Slider( + self.frame.index._index_data, self.dummy.index._index_data) + + self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) + for i, block in enumerate(self.blocks): + self.base_ptrs[i] = (block).data + + def __dealloc__(self): + free(self.base_ptrs) + + cdef move(self, int start, int end): + cdef: + ndarray arr + Py_ssize_t i + + # move blocks + for i in range(self.nblocks): + arr = self.blocks[i] + + # axis=1 is the frame's axis=0 + arr.data = self.base_ptrs[i] + arr.strides[1] * start + arr.shape[1] = end - start + + # move and set the index + self.idx_slider.move(start, end) + + object.__setattr__(self.index, '_index_data', self.idx_slider.buf) + self.index._engine.clear_mapping() + self.index._cache.clear() # e.g. inferred_freq must go + + cdef reset(self): + cdef: + ndarray arr + Py_ssize_t i + + # reset blocks + for i in range(self.nblocks): + arr = self.blocks[i] + + # axis=1 is the frame's axis=0 + arr.data = self.base_ptrs[i] + arr.shape[1] = 0 From 709ee170b5be26e42914d4958bf54d42c353dd85 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sat, 10 Oct 2020 16:18:48 -0500 Subject: [PATCH 3/4] CI: move py39 build to conda #33948 --- .travis.yml | 7 +------ ci/azure/posix.yml | 5 +++++ ci/deps/azure-39.yaml | 17 +++++++++++++++++ ci/setup_env.sh | 5 ----- 4 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 ci/deps/azure-39.yaml diff --git a/.travis.yml b/.travis.yml index 2bf72bd159fc2..1ddd886699d38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,11 +35,6 @@ matrix: fast_finish: true include: - - dist: bionic - python: 3.9-dev - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" - - env: - JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1" services: @@ -94,7 +89,7 @@ install: script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 3a9bb14470692..8e44db0b4bcd4 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -61,6 +61,11 @@ jobs: PANDAS_TESTING_MODE: "deprecate" EXTRA_APT: "xsel" + py39: + ENV_FILE: ci/deps/azure-39.yaml + CONDA_PY: "39" + PATTERN: "not slow and not network and not clipboard" + steps: - script: | if [ "$(uname)" == "Linux" ]; then diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml new file mode 100644 index 0000000000000..67edc83a9d738 --- /dev/null +++ b/ci/deps/azure-39.yaml @@ -0,0 +1,17 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.9.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - numpy + - python-dateutil + - pytz diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 247f809c5fe63..8984fa2d9a9be 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,10 +1,5 @@ #!/bin/bash -e -if [ "$JOB" == "3.9-dev" ]; then - /bin/bash ci/build39.sh - exit 0 -fi - # edit the locale file if needed if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" From fe9709fbd493d75b1ba785c7918a6e0827a833d9 Mon Sep 17 00:00:00 2001 From: VirosaLi <2EkF8qUgpNkj> Date: Fri, 13 Nov 2020 13:27:26 -0600 Subject: [PATCH 4/4] CI: remove py39 build script --- ci/build39.sh | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100755 ci/build39.sh diff --git a/ci/build39.sh b/ci/build39.sh deleted file mode 100755 index faef2be03c2bb..0000000000000 --- a/ci/build39.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -e -# Special build for python3.9 until numpy puts its own wheels up - -pip install --no-deps -U pip wheel setuptools -pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis - -python setup.py build_ext -inplace -python -m pip install --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis"