From 23303e011b7e0d9e83eef27f477d105b276b11b2 Mon Sep 17 00:00:00 2001 From: rohanp Date: Fri, 18 Aug 2017 18:09:52 -0400 Subject: [PATCH 01/13] optimized median func when bottleneck not present --- doc/source/whatsnew/v0.21.0.txt | 302 +++++++++++++++++++++++++++++--- pandas/core/nanops.py | 74 +++++--- 2 files changed, 330 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b4ca3f011a81d..62b5113ab64fd 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -9,6 +9,8 @@ users upgrade to this version. Highlights include: +- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. + Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. .. contents:: What's new in v0.21.0 @@ -22,8 +24,87 @@ New features - Support for `PEP 519 -- Adding a file system path protocol `_ on most readers and writers (:issue:`13823`) -- Added `__fspath__` method to :class`:pandas.HDFStore`, :class:`pandas.ExcelFile`, - and :class:`pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, + and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to + support type inference in the presence of missing values (:issue:`17059`). + +.. _whatsnew_0210.enhancements.infer_objects: + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.infer_objects` and :meth:`Series.infer_objects` +methods have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method. See the documentation :ref:`here ` +for more details. 
(:issue:`11221`) + +This method only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). + +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + +.. _whatsnew_0210.enhancements.attribute_access: + +Improved warnings when attempting to create columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +New users are often flummoxed by the relationship between column operations and attribute +access on ``DataFrame`` instances (:issue:`5904` & :issue:`7175`). Two specific instances +of this confusion include attempting to create a new column by setting into an attribute: + +.. code-block:: ipython + + In[1]: df = pd.DataFrame({'one': [1., 2., 3.]}) + In[2]: df.two = [4, 5, 6] + +This does not raise any obvious exceptions, but also does not create a new column: + +.. code-block:: ipython + + In[3]: df + Out[3]: + one + 0 1.0 + 1 2.0 + 2 3.0 + +The second source of confusion is creating a column whose name collides with a method or +attribute already in the instance namespace: + +.. code-block:: ipython + + In[4]: df['sum'] = [5., 7., 9.] + +This does not permit that column to be accessed as an attribute: + +.. code-block:: ipython + + In[5]: df.sum + Out[5]: + + +Both of these now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. .. 
_whatsnew_0210.enhancements.other: @@ -31,44 +112,183 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. For more, see :ref:`here ` (:issue:`16270`) -- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) -- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) -- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ -- :func:`api.types.infer_dtype` now infers decimals. (:issue: `15690`) +- :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) +- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) +- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) +- :func:`Series.set_axis` and :func:`DataFrame.set_axis` now support the ``inplace`` parameter. (:issue:`14636`) +- :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`api.types.infer_dtype` now infers decimals. 
(:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) -- :func:`DataFrame.clip()` and :func: `Series.cip()` have gained an inplace argument. (:issue: `15388`) +- :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) +- :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) +- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) +- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) +- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) +- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) +- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- Support has been dropped for Python 3.4 (:issue:`15251`) -- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) +.. 
_whatsnew_0210.api_breaking.pandas_eval: -- Accessing a non-existent attribute on a closed :class:`HDFStore` will now - raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +Improved error handling during item assignment in pd.eval +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`eval` will now raise a ``ValueError`` when item assignment malfunctions, or +inplace operations are specified, but there is no item assignment in the expression (:issue:`16732`) + +.. ipython:: python + + arr = np.array([1, 2, 3]) + +Previously, if you attempted the following expression, you would get a not very helpful error message: + +.. code-block:: ipython + + In [3]: pd.eval("a = 1 + 2", target=arr, inplace=True) + ... + IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) + and integer or boolean arrays are valid indices + +This is a very long way of saying numpy arrays don't support string-item indexing. With this +change, the error message is now this: + +.. code-block:: python + + In [3]: pd.eval("a = 1 + 2", target=arr, inplace=True) + ... + ValueError: Cannot assign expression output to target + +It also used to be possible to evaluate expressions inplace, even if there was no item assignment: + +.. code-block:: ipython + + In [4]: pd.eval("1 + 2", target=arr, inplace=True) + Out[4]: 3 + +However, this input does not make much sense because the output is not being assigned to +the target. Now, a ``ValueError`` will be raised when such an input is passed in: + +.. code-block:: ipython + + In [4]: pd.eval("1 + 2", target=arr, inplace=True) + ... + ValueError: Cannot operate inplace if there is no assignment + +Dtype Conversions +^^^^^^^^^^^^^^^^^ + +- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to + the same type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`). + + ..
ipython:: python + + s = Series([1, 2, 3]) + + .. code-block:: python + + In [5]: s[1] = True + + In [6]: s + Out[6]: + 0 1 + 1 1 + 2 3 + dtype: int64 + + New Behavior + + .. ipython:: python + + s[1] = True + s + +- Previously, an assignment to a datetimelike with a non-datetimelike would coerce the + non-datetime-like item being assigned (:issue:`14145`). + + .. ipython:: python + + s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')]) + + .. code-block:: python + + In [1]: s[1] = 1 + + In [2]: s + Out[2]: + 0 2011-01-01 00:00:00.000000000 + 1 1970-01-01 00:00:00.000000001 + dtype: datetime64[ns] + + These now coerce to ``object`` dtype. + + .. ipython:: python + + s[1] = 1 + s + +- Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`) +- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) + +.. _whatsnew_0210.api.na_changes: + +NA naming Changes +^^^^^^^^^^^^^^^^^ + +In order to promote more consistency among the pandas API, we have added additional top-level +functions :func:`isna` and :func:`notna` that are aliases for :func:`isnull` and :func:`notnull`. +The naming scheme is now more consistent with methods like ``.dropna()`` and ``.fillna()``. Furthermore +in all cases where ``.isnull()`` and ``.notnull()`` methods are defined, these have additional methods +named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical``, +``Index``, ``Series``, and ``DataFrame``. (:issue:`15001`). + +The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. ..
_whatsnew_0210.api: Other API Changes ^^^^^^^^^^^^^^^^^ +- Support has been dropped for Python 3.4 (:issue:`15251`) +- Support has been dropped for bottleneck < 1.0.0 (:issue:`15214`) +- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) +- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now + raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) +- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) +- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). +- Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) +- ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) +- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - Moved definition of ``MergeError`` to the ``pandas.errors`` module. - +- The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) +- :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) .. 
_whatsnew_0210.deprecations: Deprecations ~~~~~~~~~~~~ -- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with to_excel() (:issue:`10559`). +- :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). .. _whatsnew_0210.prior_deprecations: Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) +- The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) +- The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) +- ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) +- ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) +- :func:`eval` and :func:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) +- The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset (:issue:`11834`) .. _whatsnew_0210.performance: @@ -76,56 +296,98 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - +- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) +- Improved performance of :func:`median()` when bottleneck is not installed but :func:`np.namnmedian` is present (:issue:`16468`) +- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) .. 
_whatsnew_0210.bug_fixes: Bug Fixes ~~~~~~~~~ + Conversion ^^^^^^^^^^ +- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) +- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) +- Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`) +- Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) Indexing ^^^^^^^^ - +- When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). +- When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). +- Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). +- Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). +- Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). 
+- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) +- Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) +- Fixes ``DataFrame.loc`` for setting with alignment and tz-aware ``DatetimeIndex`` (:issue:`16889`) +- Avoids ``IndexError`` when passing an Index or Series to ``.iloc`` with older numpy (:issue:`17193`) +- Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) I/O ^^^ -- Bug in ``pd.read_csv()`` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) - +- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`) +- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`) +- Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) +- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`). +- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). 
+- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) +- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) +- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) Plotting ^^^^^^^^ - +- Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) +- Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) +- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`) Sparse ^^^^^^ +- Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) +- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) Reshaping ^^^^^^^^^ - - +- Joining/Merging with a non unique ``PeriodIndex`` raised a ``TypeError`` (:issue:`16871`) +- Bug in :func:`crosstab` where non-aligned series of integers were casted to float (:issue:`17005`) +- Bug in merging with categorical dtypes with datetimelikes incorrectly raised a ``TypeError`` (:issue:`16900`) +- Bug when using 
:func:`isin` on a large object series and large comparison array (:issue:`16012`) +- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) +- Fixes dtype of result with integer dtype input, from :func:`pivot_table` when called with ``margins=True`` (:issue:`17013`) +- Bug in :func:`crosstab` where passing two ``Series`` with the same name raised a ``KeyError`` (:issue:`13279`) +- :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). Numeric ^^^^^^^ +- Bug in ``.clip()`` with ``axis=1`` and a list-like for ``threshold`` is passed; previously this raised ``ValueError`` (:issue:`15390`) Categorical ^^^^^^^^^^^ +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) Other ^^^^^ +- Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) +- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) +- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1d64f87b15761..4760ba5728c2a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,9 +1,11 @@ import itertools import functools import operator +import warnings +from distutils.version import LooseVersion import numpy as np -from pandas import compat +from pandas import compat, _np_version_under1p9 from pandas._libs import tslib, algos, lib from pandas.core.dtypes.common import ( _get_dtype, @@ -16,15 +18,29 @@ is_datetime_or_timedelta_dtype, is_int_or_datetime_dtype, is_any_int_dtype) from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask -from pandas.core.dtypes.missing import isnull, notnull +from pandas.core.dtypes.missing import isna, notna from 
pandas.core.config import get_option from pandas.core.common import _values_from_object +_BOTTLENECK_INSTALLED = False +_MIN_BOTTLENECK_VERSION = '1.0.0' + try: import bottleneck as bn - _BOTTLENECK_INSTALLED = True + ver = bn.__version__ + _BOTTLENECK_INSTALLED = (LooseVersion(ver) >= + LooseVersion(_MIN_BOTTLENECK_VERSION)) + + if not _BOTTLENECK_INSTALLED: + warnings.warn( + "The installed version of bottleneck {ver} is not supported " + "in pandas and will be not be used\nThe minimum supported " + "version is {min_ver}\n".format( + ver=ver, min_ver=_MIN_BOTTLENECK_VERSION), UserWarning) + except ImportError: # pragma: no cover - _BOTTLENECK_INSTALLED = False + pass + _USE_BOTTLENECK = False @@ -195,7 +211,7 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, if isfinite: mask = _isfinite(values) else: - mask = isnull(values) + mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) @@ -232,7 +248,7 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, def _isfinite(values): if is_datetime_or_timedelta_dtype(values): - return isnull(values) + return isna(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) @@ -328,12 +344,22 @@ def nanmedian(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna) + if not skipna: + return _wrap_results(np.median(values, axis=axis), dtype) + def get_median(x): - mask = notnull(x) + mask = notna(x) if not skipna and not mask.all(): return np.nan return algos.median(_values_from_object(x[mask])) + if _np_version_under1p9: + agg1d = get_median + aggaxis = lambda values, axis: np.apply_along_axis(get_median, axis, values) + else: + agg1d = np.nanmedian + aggaxis = np.nanmedian + if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -347,8 +373,7 @@ def get_median(x): if values.ndim > 1: # there's a non-empty array to 
apply over otherwise numpy raises if notempty: - return _wrap_results( - np.apply_along_axis(get_median, axis, values), dtype) + return _wrap_results(aggaxis(values, axis), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" @@ -361,8 +386,7 @@ def get_median(x): return _wrap_results(ret, dtype) # otherwise return a scalar value - return _wrap_results(get_median(values) if notempty else np.nan, dtype) - + return _wrap_results(agg1d(values) if notempty else np.nan, dtype) def _get_counts_nanvar(mask, axis, ddof, dtype=float): dtype = _get_dtype(dtype) @@ -395,7 +419,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1): values = _values_from_object(values) dtype = values.dtype - mask = isnull(values) + mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -434,7 +458,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1): def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) - mask = isnull(values) + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) @@ -469,24 +493,22 @@ def reduction(values, axis=None, skipna=True): nanmin = _nanminmax('min', fill_value_typ='+inf') nanmax = _nanminmax('max', fill_value_typ='-inf') - +@disallow('O') def nanargmax(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf', - isfinite=True) + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf') result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result - +@disallow('O') def nanargmin(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf', - isfinite=True) + values, mask, 
dtype, _ = _get_values(values, skipna, fill_value_typ='+inf') result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @@ -503,7 +525,7 @@ def nanskew(values, axis=None, skipna=True): """ values = _values_from_object(values) - mask = isnull(values) + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -558,7 +580,7 @@ def nankurt(values, axis=None, skipna=True): """ values = _values_from_object(values) - mask = isnull(values) + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -615,7 +637,7 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') def nanprod(values, axis=None, skipna=True): - mask = isnull(values) + mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 @@ -696,7 +718,7 @@ def nancorr(a, b, method='pearson', min_periods=None): if min_periods is None: min_periods = 1 - valid = notnull(a) & notnull(b) + valid = notna(a) & notna(b) if not valid.all(): a = a[valid] b = b[valid] @@ -740,7 +762,7 @@ def nancov(a, b, min_periods=None): if min_periods is None: min_periods = 1 - valid = notnull(a) & notnull(b) + valid = notna(a) & notna(b) if not valid.all(): a = a[valid] b = b[valid] @@ -778,8 +800,8 @@ def _ensure_numeric(x): def make_nancomp(op): def f(x, y): - xmask = isnull(x) - ymask = isnull(y) + xmask = isna(x) + ymask = isna(y) mask = xmask | ymask with np.errstate(all='ignore'): From 3e2d1ede81d1dfb65e105e7b046af84312125851 Mon Sep 17 00:00:00 2001 From: rohanp Date: Fri, 18 Aug 2017 18:50:21 -0400 Subject: [PATCH 02/13] fixed merge issues --- doc/source/whatsnew/v0.21.0.txt | 4 ---- pandas/core/nanops.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6cc7bba735a9e..8aac5c410d7e7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ 
b/doc/source/whatsnew/v0.21.0.txt @@ -298,13 +298,9 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) -<<<<<<< HEAD - Improved performance of :func:`median()` when bottleneck is not installed but :func:`np.namnmedian` is present (:issue:`16468`) - :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) -======= -- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) ->>>>>>> 34c4ffd7c454848311f71e6869b5cad4bc132449 .. _whatsnew_0210.bug_fixes: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 86f63c11adae2..5da41ea054a8a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -494,7 +494,6 @@ def reduction(values, axis=None, skipna=True): nanmax = _nanminmax('max', fill_value_typ='-inf') - @disallow('O') def nanargmax(values, axis=None, skipna=True): """ @@ -506,7 +505,6 @@ def nanargmax(values, axis=None, skipna=True): return result - @disallow('O') def nanargmin(values, axis=None, skipna=True): """ From 227d8c42d5a26a69aa2dd5f1dc3880709c22bb6e Mon Sep 17 00:00:00 2001 From: rohanp Date: Fri, 18 Aug 2017 18:52:30 -0400 Subject: [PATCH 03/13] pep8 --- pandas/core/nanops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 5da41ea054a8a..aa4aaad01da49 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -388,6 +388,7 @@ def get_median(x): # otherwise return a scalar value return _wrap_results(agg1d(values) if notempty else np.nan, dtype) + def _get_counts_nanvar(mask, axis, ddof, dtype=float): dtype = _get_dtype(dtype) count = _get_counts(mask, axis, dtype=dtype) From 23bbe7332f47b6f57c01efd13a5325477b805c57 Mon Sep 17 00:00:00 2001 From: rohanp Date: Tue, 22 Aug 2017 02:04:16 -0400 Subject: [PATCH 04/13] refactored skipna to switch --- 
pandas/core/nanops.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index aa4aaad01da49..ff10a0d5924dc 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -87,6 +87,21 @@ def _f(*args, **kwargs): return _f +class skipna_switch(object): + + def __init__(self, alt): + self.alt = alt + + def __call__(self, default): + + @functools.wraps(default) + def f(values, axis=None, skipna=True, **kwds): + if skipna: + return default(values, axis, skipna, **kwds) + else: + return self.alt(values, axis, **kwds) + + return f class bottleneck_switch(object): @@ -338,15 +353,13 @@ def nanmean(values, axis=None, skipna=True): return _wrap_results(the_mean, dtype) +@skipna_switch(np.median) @disallow('M8') @bottleneck_switch() def nanmedian(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna) - if not skipna: - return _wrap_results(np.median(values, axis=axis), dtype) - def get_median(x): mask = notna(x) if not skipna and not mask.all(): From c053b7557749d873b7cc3036389d20d1b47ca45e Mon Sep 17 00:00:00 2001 From: rohanp Date: Wed, 23 Aug 2017 00:52:04 -0400 Subject: [PATCH 05/13] removed numpy < 1.9 support --- pandas/core/nanops.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index ff10a0d5924dc..720c60b44baf7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -5,7 +5,7 @@ from distutils.version import LooseVersion import numpy as np -from pandas import compat, _np_version_under1p9 +from pandas import compat from pandas._libs import tslib, algos, lib from pandas.core.dtypes.common import ( _get_dtype, @@ -87,6 +87,7 @@ def _f(*args, **kwargs): return _f + class skipna_switch(object): def __init__(self, alt): @@ -103,6 +104,7 @@ def f(values, axis=None, skipna=True, **kwds): return f + class bottleneck_switch(object): def __init__(self, 
zero_value=None, **kwargs): @@ -366,13 +368,6 @@ def get_median(x): return np.nan return algos.median(_values_from_object(x[mask])) - if _np_version_under1p9: - agg1d = get_median - aggaxis = lambda values, axis: np.apply_along_axis(get_median, axis, values) - else: - agg1d = np.nanmedian - aggaxis = np.nanmedian - if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -386,7 +381,7 @@ def get_median(x): if values.ndim > 1: # there's a non-empty array to apply over otherwise numpy raises if notempty: - return _wrap_results(aggaxis(values, axis), dtype) + return _wrap_results(np.nanmedian(values, axis), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" @@ -399,7 +394,7 @@ def get_median(x): return _wrap_results(ret, dtype) # otherwise return a scalar value - return _wrap_results(agg1d(values) if notempty else np.nan, dtype) + return _wrap_results(np.nanmedian(values) if notempty else np.nan, dtype) def _get_counts_nanvar(mask, axis, ddof, dtype=float): From 63291182d03ee3d13b1bdd3e0bab617f9e17d8d8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Dec 2017 18:35:39 -0500 Subject: [PATCH 06/13] cleanups --- doc/source/whatsnew/v0.21.0.txt | 1 - doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/nanops.py | 33 ++++++++++----------------------- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index dc5bb4cfeafe4..89e2d3006696c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1014,7 +1014,6 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) -- Improved performance of :func:`median()` when bottleneck is not installed but :func:`np.namnmedian` is present (:issue:`16468`) - :attr:`Series.dt` no longer performs frequency inference, yielding a large 
speedup when accessing the attribute (:issue:`17210`) - Improved performance of :meth:`~Series.cat.set_categories` by not materializing the values (:issue:`17508`) - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 93fd218bd7743..6df6a0534a1e2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -235,6 +235,7 @@ Performance Improvements - Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` +- Improved performance of :func:`median()` when bottleneck is not installed but :func:`np.nanmedian` is present (:issue:`16468`) .. _whatsnew_0220.docs: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 21c7b5eed89e1..4fe0813f39897 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -6,7 +6,7 @@ import numpy as np from pandas import compat -from pandas._libs import tslib, algos, lib +from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _get_dtype, is_float, is_scalar, @@ -87,23 +87,6 @@ def _f(*args, **kwargs): return _f -class skipna_switch(object): - - def __init__(self, alt): - self.alt = alt - - def __call__(self, default): - - @functools.wraps(default) - def f(values, axis=None, skipna=True, **kwds): - if skipna: - return default(values, axis, skipna, **kwds) - else: - return self.alt(values, axis, **kwds) - - return f - - class bottleneck_switch(object): def __init__(self, **kwargs): @@ -360,19 +343,17 @@ def nanmean(values, axis=None, skipna=True): return _wrap_results(the_mean, dtype) -@skipna_switch(np.median) @disallow('M8') @bottleneck_switch() def nanmedian(values, axis=None, skipna=True): - values, 
mask, dtype, dtype_max = _get_values(values, skipna) - def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan - return algos.median(_values_from_object(x[mask])) + return np.nanmedian(x[mask]) + values, mask, dtype, dtype_max = _get_values(values, skipna) if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -384,8 +365,14 @@ def get_median(x): # an array from a frame if values.ndim > 1: + # there's a non-empty array to apply over otherwise numpy raises if notempty: + if not skipna: + return _wrap_results( + np.apply_along_axis(get_median, axis, values), dtype) + + # fastpath for the skipna case return _wrap_results(np.nanmedian(values, axis), dtype) # must return the correct shape, but median is not defined for the @@ -399,7 +386,7 @@ def get_median(x): return _wrap_results(ret, dtype) # otherwise return a scalar value - return _wrap_results(np.nanmedian(values) if notempty else np.nan, dtype) + return _wrap_results(get_median(values) if notempty else np.nan, dtype) def _get_counts_nanvar(mask, axis, ddof, dtype=float): From 21160a08fa3cfbb586ecd4a6d2e3a94d42f6a6f4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Dec 2017 19:27:15 -0500 Subject: [PATCH 07/13] doc update --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6df6a0534a1e2..5ed4fddf90b8c 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -235,7 +235,7 @@ Performance Improvements - Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` -- Improved performance of :func:`median()` when bottleneck is not installed but 
:func:`np.nanmedian` is present (:issue:`16468`) +- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) .. _whatsnew_0220.docs: From 21da28fa018dec9d5d779497a4336362c7480b7d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Dec 2017 19:35:13 -0500 Subject: [PATCH 08/13] remove algos/median & kth_smallest - no longer used --- pandas/_libs/algos.pyx | 48 ------------------------------------------ 1 file changed, 48 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3710ddc33c7c5..b06fad124c357 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -166,54 +166,6 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): return result, counts -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil: - cdef: - Py_ssize_t i, j, l, m, n = a.shape[0] - numeric x - - with nogil: - l = 0 - m = n - 1 - - while l < m: - x = a[k] - i = l - j = m - - while 1: - while a[i] < x: i += 1 - while x < a[j]: j -= 1 - if i <= j: - swap(&a[i], &a[j]) - i += 1; j -= 1 - - if i > j: break - - if j < k: l = i - if k < i: m = j - return a[k] - - -cpdef numeric median(numeric[:] arr): - """ - A faster median - """ - cdef Py_ssize_t n = arr.size - - if n == 0: - return np.NaN - - arr = arr.copy() - - if n % 2: - return kth_smallest(arr, n // 2) - else: - return (kth_smallest(arr, n // 2) + - kth_smallest(arr, n // 2 - 1)) / 2 - - # ---------------------------------------------------------------------- # Pairwise correlation/covariance From 1123477a19318dd2432f4b2788dde4f1df61dc20 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Jan 2018 13:27:52 -0500 Subject: [PATCH 09/13] whatsnew --- doc/source/whatsnew/v0.22.0.txt | 124 -------------------------------- doc/source/whatsnew/v0.23.0.txt | 2 + 2 files changed, 2 insertions(+), 124 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt 
b/doc/source/whatsnew/v0.22.0.txt index 411b59f93cf97..d165339cb0de9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -197,131 +197,7 @@ or expanding sum with ``min_periods=0``. Previously this returned ``NaN``, when fewer than ``min_periods`` non-*NA* values were in the window. Now it returns ``0``. -<<<<<<< HEAD -.. _whatsnew_0220.api_breaking.deps: - -Dependencies have increased minimum versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We have updated our minimum supported versions of dependencies (:issue:`15184`). -If installed, we now require: - - +-----------------+-----------------+----------+ - | Package | Minimum Version | Required | - +=================+=================+==========+ - | python-dateutil | 2.5.0 | X | - +-----------------+-----------------+----------+ - | openpyxl | 2.4.0 | | - +-----------------+-----------------+----------+ - - -- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`) -- Building from source now explicity requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - -.. 
_whatsnew_0220.api: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) -- ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) -- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) -- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) -- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). -- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) -- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) -- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) -- ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) -- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) -- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). -- :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. 
(:issue:`12815`) -- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) -- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) -- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). -- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) -- When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) -- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) -- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) -- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`) -- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) -- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) - -.. _whatsnew_0220.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. 
Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). -- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). -- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) - -.. _whatsnew_0220.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- Warnings against the obsolete usage ``Categorical(codes, categories)``, which were emitted for instance when the first two arguments to ``Categorical()`` had different dtypes, and recommended the use of ``Categorical.from_codes``, have now been removed (:issue:`8074`) -- The ``levels`` and ``labels`` attributes of a ``MultiIndex`` can no longer be set directly (:issue:`4039`). -- ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) -- ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) -- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) -- The ``SparseList`` class has been removed (:issue:`14007`) -- The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`) -- ``Categorical.from_array`` has been removed (:issue:`13854`) -- The ``freq`` and ``how`` parameters have been removed from the ``rolling``/``expanding``/``ewm`` methods of DataFrame - and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:18601 & :issue:18668) -- ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`) - -.. 
_whatsnew_0220.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) -- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) -- :class`DateOffset` arithmetic performance is improved (:issue:`18218`) -- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) -- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) -- The overriden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) -- ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) -- Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) -- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) -- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) -- Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` -- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - -.. _whatsnew_0220.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- -- - -.. 
_whatsnew_0220.bug_fixes: - -Bug Fixes -~~~~~~~~~ - - -Conversion -^^^^^^^^^^ - -- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) -- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) -- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) -- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) - - -Indexing -^^^^^^^^ -======= *pandas 0.21.1* ->>>>>>> master .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fe5342c520196..5fa0c1ff9253f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -379,6 +379,8 @@ Performance Improvements - Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) - Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) +- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) + .. 
_whatsnew_0230.docs: From 3025f79c6b7b1b002bacff4ed86a0b3096a1721e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Jan 2018 13:35:51 -0500 Subject: [PATCH 10/13] remove more code --- pandas/_libs/algos.pxd | 13 ------------- pandas/_libs/groupby.pyx | 11 ++++++++++- setup.py | 4 ++-- 3 files changed, 12 insertions(+), 16 deletions(-) delete mode 100644 pandas/_libs/algos.pxd diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd deleted file mode 100644 index 6d80e6f0073eb..0000000000000 --- a/pandas/_libs/algos.pxd +++ /dev/null @@ -1,13 +0,0 @@ -from util cimport numeric -from numpy cimport float64_t, double_t - -cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil - -cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: - cdef numeric t - - # cython doesn't allow pointer dereference so use array syntax - t = a[0] - a[0] = b[0] - b[0] = t - return 0 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9d9ac2ef2f5b1..ac79656f1c975 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -16,7 +16,6 @@ from numpy cimport (ndarray, from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport swap from algos import take_2d_axis1_float64_float64, groupsort_indexer cdef int64_t iNaT = get_nat() @@ -25,6 +24,16 @@ cdef double NaN = np.NaN cdef double nan = NaN +cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: + cdef numeric t + + # cython doesn't allow pointer dereference so use array syntax + t = a[0] + a[0] = b[0] + b[0] = t + return 0 + + # TODO: aggregate multiple columns in single pass # ---------------------------------------------------------------------- # first, nth, last diff --git a/setup.py b/setup.py index 7ade1544ec5cd..246edfa893f42 100755 --- a/setup.py +++ b/setup.py @@ -460,11 +460,11 @@ def pxd(name): ext_data = { '_libs.algos': { 'pyxfile': '_libs/algos', - 'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'], + 'pxdfiles': 
['_libs/src/util', '_libs/hashtable'], 'depends': _pxi_dep['algos']}, '_libs.groupby': { 'pyxfile': '_libs/groupby', - 'pxdfiles': ['_libs/src/util', '_libs/algos'], + 'pxdfiles': ['_libs/src/util'], 'depends': _pxi_dep['groupby']}, '_libs.hashing': { 'pyxfile': '_libs/hashing'}, From 5b71ee1e48c99fd052e39023e01b85d75aadda58 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Jan 2018 13:41:25 -0500 Subject: [PATCH 11/13] import issue --- pandas/_libs/algos.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 42b4897fce299..40c14a014d264 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -8,8 +8,6 @@ from cython cimport Py_ssize_t np.import_array() -cdef float64_t FP_ERR = 1e-13 - cimport util from libc.stdlib cimport malloc, free @@ -24,6 +22,7 @@ from numpy cimport (ndarray, double_t) +cdef float64_t FP_ERR = 1e-13 cdef double NaN = np.NaN cdef double nan = NaN From f25a62addc0156a827c9696baecb5b18c0c72554 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Jan 2018 15:57:51 -0500 Subject: [PATCH 12/13] moar --- pandas/_libs/algos.pxd | 13 ++++++++++++ pandas/_libs/algos.pyx | 33 +++++++++++++++++++++++++++++- pandas/_libs/groupby.pyx | 11 +--------- pandas/_libs/groupby_helper.pxi.in | 2 +- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 pandas/_libs/algos.pxd diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd new file mode 100644 index 0000000000000..6d80e6f0073eb --- /dev/null +++ b/pandas/_libs/algos.pxd @@ -0,0 +1,13 @@ +from util cimport numeric +from numpy cimport float64_t, double_t + +cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil + +cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: + cdef numeric t + + # cython doesn't allow pointer dereference so use array syntax + t = a[0] + a[0] = b[0] + b[0] = t + return 0 diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 
40c14a014d264..9a7af71e74574 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -8,6 +8,8 @@ from cython cimport Py_ssize_t np.import_array() +cdef float64_t FP_ERR = 1e-13 + cimport util from libc.stdlib cimport malloc, free @@ -22,7 +24,6 @@ from numpy cimport (ndarray, double_t) -cdef float64_t FP_ERR = 1e-13 cdef double NaN = np.NaN cdef double nan = NaN @@ -165,6 +166,36 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): return result, counts +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil: + cdef: + Py_ssize_t i, j, l, m, n = a.shape[0] + numeric x + + with nogil: + l = 0 + m = n - 1 + + while l < m: + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: i += 1 + while x < a[j]: j -= 1 + if i <= j: + swap(&a[i], &a[j]) + i += 1; j -= 1 + + if i > j: break + + if j < k: l = i + if k < i: m = j + return a[k] + + # ---------------------------------------------------------------------- # Pairwise correlation/covariance diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ac79656f1c975..9d9ac2ef2f5b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -16,6 +16,7 @@ from numpy cimport (ndarray, from libc.stdlib cimport malloc, free from util cimport numeric, get_nat +from algos cimport swap from algos import take_2d_axis1_float64_float64, groupsort_indexer cdef int64_t iNaT = get_nat() @@ -24,16 +25,6 @@ cdef double NaN = np.NaN cdef double nan = NaN -cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: - cdef numeric t - - # cython doesn't allow pointer dereference so use array syntax - t = a[0] - a[0] = b[0] - b[0] = t - return 0 - - # TODO: aggregate multiple columns in single pass # ---------------------------------------------------------------------- # first, nth, last diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 14d47398ac1df..a751fadaf48cf 100644 --- 
a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -740,7 +740,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) + out[j, i] = median_linear(ptr, size) ptr += size From 027a2b2fe053df0e8909f3ab35d63f9813fc761a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 21 Jan 2018 15:58:16 -0500 Subject: [PATCH 13/13] moar --- pandas/_libs/groupby.pyx | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9d9ac2ef2f5b1..9cc15fb6692d9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -118,7 +118,7 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -cdef inline float64_t _median_linear(float64_t* a, int n) nogil: +cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result cdef float64_t* tmp diff --git a/setup.py b/setup.py index 246edfa893f42..7ade1544ec5cd 100755 --- a/setup.py +++ b/setup.py @@ -460,11 +460,11 @@ def pxd(name): ext_data = { '_libs.algos': { 'pyxfile': '_libs/algos', - 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'], 'depends': _pxi_dep['algos']}, '_libs.groupby': { 'pyxfile': '_libs/groupby', - 'pxdfiles': ['_libs/src/util'], + 'pxdfiles': ['_libs/src/util', '_libs/algos'], 'depends': _pxi_dep['groupby']}, '_libs.hashing': { 'pyxfile': '_libs/hashing'},