Skip to content

separate _libs/src/reduce.pyx to _libs.reduction #19306

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from cython cimport Py_ssize_t

import numpy as np
cimport numpy as np
from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, PyArray_SETITEM,
from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM,
PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
flatiter, NPY_OBJECT,
int64_t,
Expand Down Expand Up @@ -57,8 +57,6 @@ cimport util
cdef int64_t NPY_NAT = util.get_nat()
from util cimport is_array, _checknull

from libc.math cimport fabs, sqrt


def values_from_object(object o):
""" return my values or the object if we are say an ndarray """
Expand Down Expand Up @@ -1119,5 +1117,4 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys,
return result


include "reduce.pyx"
include "inference.pyx"
19 changes: 17 additions & 2 deletions pandas/_libs/src/reduce.pyx → pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
# -*- coding: utf-8 -*-
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you don't like reduce?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I try to avoid overlap with built-in names.

# cython: profile=False
import numpy as np

from distutils.version import LooseVersion

from cython cimport Py_ssize_t
from cpython cimport Py_INCREF

from libc.stdlib cimport malloc, free

import numpy as np
cimport numpy as np
from numpy cimport (ndarray,
int64_t,
PyArray_SETITEM,
PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew,
flatiter)
np.import_array()

cimport util
from lib import maybe_convert_objects
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this; instead, do the conversion on the Python side wherever .reduce() is called.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'd also need to do that everywhere SeriesGrouper.get_result or SeriesBinGrouper.get_result is called. What's the upside that makes the duplicated code worthwhile?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This completely removes this module from the import-time dependency chain.

You could, I suppose, do this inside the top-level function reduce().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or could expose maybe_convert_objects in a pxd and then call it directly in cython (but that actually puts this back in the dep chain).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could do this in the top-level function reduce() I suppose (inside the function)

That would be better than trying to do it after every place in the code-base where the function is called. But note that reduce is not the only part of this module that is used externally, so we'd need to do this run-time import in three places in this module.

Since the dependency is one-way and not a C-level dependency, I don't see a huge downside to the import-time import of maybe_convert_objects.


is_numpy_prior_1_6_2 = LooseVersion(np.__version__) < '1.6.2'


Expand Down
12 changes: 6 additions & 6 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
from pandas import compat
from pandas._libs import lib
from pandas._libs import reduction
from pandas.core.dtypes.common import (
is_extension_type,
is_sequence)
Expand Down Expand Up @@ -114,7 +114,7 @@ def apply_empty_result(self):

def apply_raw(self):
try:
result = lib.reduce(self.values, self.f, axis=self.axis)
result = reduction.reduce(self.values, self.f, axis=self.axis)
except Exception:
result = np.apply_along_axis(self.f, self.axis, self.values)

Expand Down Expand Up @@ -150,10 +150,10 @@ def apply_standard(self):

try:
labels = self.agg_axis
result = lib.reduce(values, self.f,
axis=self.axis,
dummy=dummy,
labels=labels)
result = reduction.reduce(values, self.f,
axis=self.axis,
dummy=dummy,
labels=labels)
return Series(result, index=labels)
except Exception:
pass
Expand Down
15 changes: 9 additions & 6 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@

from pandas.plotting._core import boxplot_frame_groupby

from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT
from pandas._libs import (lib, reduction,
groupby as libgroupby,
Timestamp, NaT, iNaT)
from pandas._libs.lib import count_level_2d

_doc_template = """
Expand Down Expand Up @@ -1981,7 +1983,7 @@ def apply(self, f, data, axis=0):
try:
values, mutated = splitter.fast_apply(f, group_keys)
return group_keys, values, mutated
except (lib.InvalidApply):
except reduction.InvalidApply:
# we detect a mutation of some kind
# so take slow path
pass
Expand Down Expand Up @@ -2404,8 +2406,8 @@ def _aggregate_series_fast(self, obj, func):
obj = obj._take(indexer, convert=False).to_dense()
group_index = algorithms.take_nd(
group_index, indexer, allow_fill=False)
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
dummy)
grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
dummy)
result, counts = grouper.get_result()
return result, counts

Expand Down Expand Up @@ -2618,7 +2620,7 @@ def groupings(self):

def agg_series(self, obj, func):
dummy = obj[:0]
grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy)
grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy)
return grouper.get_result()

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -4758,7 +4760,8 @@ def fast_apply(self, f, names):
return [], True

sdata = self._get_sorted_data()
results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
results, mutated = reduction.apply_frame_axis0(sdata, f, names,
starts, ends)

return results, mutated

Expand Down
23 changes: 12 additions & 11 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas import Index, isna
from pandas.util.testing import assert_almost_equal
import pandas.util.testing as tm
from pandas._libs import lib, groupby
from pandas._libs import lib, groupby, reduction


def test_series_grouper():
Expand All @@ -19,7 +19,7 @@ def test_series_grouper():

labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy)
grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
result, counts = grouper.get_result()

expected = np.array([obj[3:6].mean(), obj[6:].mean()])
Expand All @@ -36,7 +36,7 @@ def test_series_bin_grouper():

bins = np.array([3, 6])

grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy)
grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
result, counts = grouper.get_result()

expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
Expand Down Expand Up @@ -127,26 +127,27 @@ def test_int_index(self):
from pandas.core.series import Series

arr = np.random.randn(100, 4)
result = lib.reduce(arr, np.sum, labels=Index(np.arange(4)))
result = reduction.reduce(arr, np.sum, labels=Index(np.arange(4)))
expected = arr.sum(0)
assert_almost_equal(result, expected)

result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100)))
result = reduction.reduce(arr, np.sum, axis=1,
labels=Index(np.arange(100)))
expected = arr.sum(1)
assert_almost_equal(result, expected)

dummy = Series(0., index=np.arange(100))
result = lib.reduce(arr, np.sum, dummy=dummy,
labels=Index(np.arange(4)))
result = reduction.reduce(arr, np.sum, dummy=dummy,
labels=Index(np.arange(4)))
expected = arr.sum(0)
assert_almost_equal(result, expected)

dummy = Series(0., index=np.arange(4))
result = lib.reduce(arr, np.sum, axis=1, dummy=dummy,
labels=Index(np.arange(100)))
result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
labels=Index(np.arange(100)))
expected = arr.sum(1)
assert_almost_equal(result, expected)

result = lib.reduce(arr, np.sum, axis=1, dummy=dummy,
labels=Index(np.arange(100)))
result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy,
labels=Index(np.arange(100)))
assert_almost_equal(result, expected)
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ class CheckSDist(sdist_class):
'pandas/_libs/interval.pyx',
'pandas/_libs/hashing.pyx',
'pandas/_libs/missing.pyx',
'pandas/_libs/reduction.pyx',
'pandas/_libs/testing.pyx',
'pandas/_libs/window.pyx',
'pandas/_libs/skiplist.pyx',
Expand Down Expand Up @@ -506,6 +507,8 @@ def pxd(name):
'pandas/_libs/src/numpy_helper.h'],
'sources': ['pandas/_libs/src/parser/tokenizer.c',
'pandas/_libs/src/parser/io.c']},
'_libs.reduction': {
'pyxfile': '_libs/reduction'},
'_libs.tslibs.period': {
'pyxfile': '_libs/tslibs/period',
'pxdfiles': ['_libs/src/util',
Expand Down