diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 9367c42f8d39a..5c5a1df4ea1f8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -423,7 +423,7 @@ class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): self.df.get_dtype_counts() @@ -985,3 +985,14 @@ def setup(self): def time_series_string_vector_slice(self): self.s.str[:5] + + +class frame_quantile_axis1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_quantile_axis1(self): + self.df.quantile([0.1, 0.5], axis=1) diff --git a/codecov.yml b/codecov.yml index edf2d821e07e5..86e7dd55c9550 100644 --- a/codecov.yml +++ b/codecov.yml @@ -9,4 +9,5 @@ coverage: branches: null changes: default: - branches: null + branches: + - master diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 7f837bef5251c..51982c42499ff 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -563,7 +563,6 @@ Performance Improvements - Improved speed of SAS reader (:issue:`12656`, :issue:`12961`) - Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`) - Improved memory usage in ``pd.read_csv()`` when using ``skiprows=an_integer`` (:issue:`13005`) - - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`). 
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 34bd2956319fc..fc342bcc50b61 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -97,6 +97,9 @@ Performance Improvements - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) +- Improved performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) + + @@ -110,6 +113,7 @@ Bug Fixes +- Regression in ``Series.quantile`` with nans (:issue:`13098`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b209b6d6ec543..3bf442349ef04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4989,31 +4989,27 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.5 2.5 55.0 """ self._check_percentile(q) - if not com.is_list_like(q): - q = [q] - squeeze = True - else: - squeeze = False data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) + is_transposed = axis == 1 - def _quantile(series): - res = series.quantile(q, interpolation=interpolation) - return series.name, res - - if axis == 1: + if is_transposed: data = data.T - # unable to use DataFrame.apply, becasuse data may be empty - result = dict(_quantile(s) for (_, s) in data.iteritems()) - result = self._constructor(result, columns=data.columns) - if squeeze: - if result.shape == (1, 1): - result = result.T.iloc[:, 0] # don't want scalar - else: - result = result.T.squeeze() - result.name = None # For groupby, so it can set an index name + result = data._data.quantile(qs=q, + axis=1, + interpolation=interpolation, + transposed=is_transposed) + + if result.ndim == 2: + result = self._constructor(result) + else: + result = 
self._constructor_sliced(result, name=q) + + if is_transposed: + result = result.T + return result def to_timestamp(self, freq=None, how='start', axis=0, copy=True): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index abfc5c989056e..97df81ad6be48 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -40,7 +40,7 @@ from pandas.util.decorators import cache_readonly from pandas.tslib import Timedelta -from pandas import compat +from pandas import compat, _np_version_under1p9 from pandas.compat import range, map, zip, u from pandas.lib import BlockPlacement @@ -84,7 +84,7 @@ def __init__(self, values, placement, ndim=None, fastpath=False): self.mgr_locs = placement self.values = values - if len(self.mgr_locs) != len(self.values): + if ndim and len(self.mgr_locs) != len(self.values): raise ValueError('Wrong number of items passed %d, placement ' 'implies %d' % (len(self.values), len(self.mgr_locs))) @@ -180,6 +180,12 @@ def make_block(self, values, placement=None, ndim=None, **kwargs): return make_block(values, placement=placement, ndim=ndim, **kwargs) + def make_block_scalar(self, values, **kwargs): + """ + Create a ScalarBlock + """ + return ScalarBlock(values) + def make_block_same_class(self, values, placement=None, fastpath=True, **kwargs): """ Wrap given values in a block of same type as self. 
""" @@ -324,7 +330,8 @@ def apply(self, func, mgr=None, **kwargs): """ result = func(self.values, **kwargs) if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result)) + result = self.make_block(values=_block_shape(result, + ndim=self.ndim)) return result @@ -1260,32 +1267,117 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) - def quantile(self, qs, mgr=None, **kwargs): + def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ compute the quantiles of the Parameters ---------- - qs : a scalar or list of the quantiles to be computed + qs: a scalar or list of the quantiles to be computed + interpolation: type of interpolation, default 'linear' + axis: axis to compute, default 0 + + Returns + ------- + tuple of (axis, block) + """ + if _np_version_under1p9: + if interpolation != 'linear': + raise ValueError("Interpolation methods other than linear " + "are not supported in numpy < 1.9.") + + kw = {} + if not _np_version_under1p9: + kw.update({'interpolation': interpolation}) values = self.get_values() - values, mask, _, _ = self._try_coerce_args(values, values) + values, _, _, _ = self._try_coerce_args(values, values) + mask = isnull(self.values) if not lib.isscalar(mask) and mask.any(): - values = values[~mask] - if len(values) == 0: - if com.is_list_like(qs): - result = np.array([self.fill_value]) + # even though this could be a 2-d mask it appears + # as a 1-d result + mask = mask.reshape(values.shape) + result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) + values = _block_shape(values[~mask], ndim=self.ndim) + if self.ndim > 1: + values = values.reshape(result_shape) + + from pandas import Float64Index + is_empty = values.shape[axis] == 0 + if com.is_list_like(qs): + ax = Float64Index(qs) + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + # create the array of na_values + # 2d len(values) * len(qs) + result = 
np.repeat(np.array([self._na_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: - result = self._na_value - elif com.is_list_like(qs): - values = [_quantile(values, x * 100, **kwargs) for x in qs] - result = np.array(values) + + try: + result = _quantile(values, np.array(qs) * 100, + axis=axis, **kw) + except ValueError: + + # older numpies don't handle an array for q + result = [_quantile(values, q * 100, + axis=axis, **kw) for q in qs] + + result = np.array(result, copy=False) + if self.ndim > 1: + result = result.T + else: - result = _quantile(values, qs * 100, **kwargs) - return self._try_coerce_result(result) + if self.ndim == 1: + ax = Float64Index([qs]) + else: + ax = mgr.axes[0] + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + result = np.array([self._na_value] * len(self)) + else: + result = _quantile(values, qs * 100, axis=axis, **kw) + + ndim = getattr(result, 'ndim', None) or 0 + result = self._try_coerce_result(result) + if lib.isscalar(result): + return ax, self.make_block_scalar(result) + return ax, make_block(result, + placement=np.arange(len(result)), + ndim=ndim) + + +class ScalarBlock(Block): + """ + a scalar compat Block + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + + def __init__(self, values): + self.ndim = 0 + self.mgr_locs = [0] + self.values = values + + @property + def dtype(self): + return type(self.values) + + @property + def shape(self): + return tuple([0]) + + def __len__(self): + return 0 class NonConsolidatableMixIn(object): @@ -1378,6 +1470,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] + + mask = mask.reshape(new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1676,6 +1770,7 @@ def convert(self, *args, **kwargs): can return multiple blocks! 
""" + if args: raise NotImplementedError by_item = True if 'by_item' not in kwargs else kwargs['by_item'] @@ -1706,8 +1801,13 @@ def convert(self, *args, **kwargs): for i, rl in enumerate(self.mgr_locs): values = self.iget(i) - values = fn(values.ravel(), **fn_kwargs).reshape(values.shape) - values = _block_shape(values, ndim=self.ndim) + shape = values.shape + values = fn(values.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except AttributeError: + pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -2115,7 +2215,10 @@ def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('M8[ns]') + try: + result = result.astype('M8[ns]') + except ValueError: + pass elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) return result @@ -2219,11 +2322,6 @@ def to_object_block(self, mgr): kwargs['placement'] = [0] return self.make_block(values, klass=ObjectBlock, **kwargs) - def replace(self, *args, **kwargs): - # if we are forced to ObjectBlock, then don't coerce (to UTC) - kwargs['convert'] = False - return super(DatetimeTZBlock, self).replace(*args, **kwargs) - def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): @@ -2246,8 +2344,8 @@ def _try_coerce_args(self, values, other): ------- base-type values, values mask, base-type other, other mask """ - values_mask = isnull(values) - values = values.tz_localize(None).asi8 + values_mask = _block_shape(isnull(values), ndim=self.ndim) + values = _block_shape(values.tz_localize(None).asi8, ndim=self.ndim) other_mask = False if isinstance(other, ABCSeries): @@ -2283,6 +2381,9 @@ def _try_coerce_result(self, result): elif isinstance(result, (np.integer, np.float, np.datetime64)): result = lib.Timestamp(result).tz_localize(self.values.tz) if 
isinstance(result, np.ndarray): + # allow passing of > 1dim if its trivial + if result.ndim > 1: + result = result.reshape(len(result)) result = self._holder(result).tz_localize(self.values.tz) return result @@ -2809,7 +2910,7 @@ def _verify_integrity(self): len(self.items), tot_items)) def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, raw=False, **kwargs): + consolidate=True, **kwargs): """ iterate over the blocks, collect and create a new block manager @@ -2823,7 +2924,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, integrity check consolidate: boolean, default True. Join together blocks having same dtype - raw: boolean, default False. Return the raw returned results Returns ------- @@ -2890,17 +2990,102 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) - if raw: - if self._is_single_block: - return result_blocks[0] - return result_blocks - elif len(result_blocks) == 0: + if len(result_blocks) == 0: return self.make_empty(axes or self.axes) bm = self.__class__(result_blocks, axes or self.axes, do_integrity_check=do_integrity_check) bm._consolidate_inplace() return bm + def reduction(self, f, axis=0, consolidate=True, transposed=False, + **kwargs): + """ + iterate over the blocks, collect and create a new block manager. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + f: the callable or function name to operate on at the block level + axis: reduction axis, default 0 + consolidate: boolean, default True. 
Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + + Returns + ------- + Block Manager (new object) + + """ + + if consolidate: + self._consolidate_inplace() + + axes, blocks = [], [] + for b in self.blocks: + kwargs['mgr'] = self + axe, block = getattr(b, f)(axis=axis, **kwargs) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = set([b.ndim for b in blocks]) + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate( + [ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [b.make_block(b.values.T, + placement=np.arange(b.shape[1]) + ) for b in blocks] + + return self.__class__(blocks, new_axes) + + # 0 ndim + if 0 in ndim and 1 not in ndim: + values = np.array([b.values for b in blocks]) + if len(values) == 1: + return values.item() + blocks = [make_block(values, ndim=1)] + axes = Index([ax[0] for ax in axes]) + + # single block + values = _concat._concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype='int64') + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, + ndim=1, + placement=np.arange(len(values)))], + axes[0]) + def isnull(self, **kwargs): return self.apply('apply', **kwargs) @@ -2911,7 +3096,7 @@ def eval(self, **kwargs): return self.apply('eval', **kwargs) def quantile(self, **kwargs): - return self.apply('quantile', raw=True, **kwargs) + return self.reduction('quantile', **kwargs) def setitem(self, **kwargs): return self.apply('setitem', 
**kwargs) @@ -3068,7 +3253,6 @@ def combine(self, blocks, copy=True): indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_items = self.items.take(indexer) new_blocks = [] for b in blocks: @@ -3077,9 +3261,10 @@ def combine(self, blocks, copy=True): axis=0, allow_fill=False) new_blocks.append(b) - new_axes = list(self.axes) - new_axes[0] = new_items - return self.__class__(new_blocks, new_axes, do_integrity_check=False) + axes = list(self.axes) + axes[0] = self.items.take(indexer) + + return self.__class__(new_blocks, axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): if axis >= self.ndim: @@ -3829,6 +4014,16 @@ def _block(self): def _values(self): return self._block.values + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return @@ -4317,7 +4512,7 @@ def _extend_blocks(result, blocks=None): def _block_shape(values, ndim=1, shape=None): """ guarantee the shape of the values to be at least 1 d """ - if values.ndim <= ndim: + if values.ndim < ndim: if shape is None: shape = values.shape values = values.reshape(tuple((1, ) + shape)) diff --git a/pandas/core/series.py b/pandas/core/series.py index 58e983ad904ba..43b4ba3a51212 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -57,8 +57,6 @@ from pandas.core.config import get_option -from pandas import _np_version_under1p9 - __all__ = ['Series'] _shared_doc_kwargs = dict( @@ -1349,21 +1347,12 @@ def quantile(self, q=0.5, interpolation='linear'): self._check_percentile(q) - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9.") - - kwargs = dict() - 
if not _np_version_under1p9: - kwargs.update({'interpolation': interpolation}) + result = self._data.quantile(qs=q, interpolation=interpolation) - result = self._data.quantile(qs=q, **kwargs) - - if com.is_list_like(result): - # explicitly use Float64Index to coerce empty result to float dtype - index = Float64Index(q) - return self._constructor(result, index=index, name=self.name) + if com.is_list_like(q): + return self._constructor(result, + index=Float64Index(q), + name=self.name) else: # scalar return result diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dff2c6f0df7b1..318fd17b8f88e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3826,24 +3826,24 @@ def write_data(self, chunksize, dropna=False): nrows = self.nrows_expected # if dropna==True, then drop ALL nan rows + masks = [] if dropna: - masks = [] for a in self.values_axes: # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1', copy=False)) + if isinstance(mask, np.ndarray): + masks.append(mask.astype('u1', copy=False)) - # consolidate masks + # consolidate masks + if len(masks): mask = masks[0] for m in masks[1:]: mask = mask & m mask = mask.ravel() - else: - mask = None # broadcast the indexes if needed diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 843031fafa1a9..3ccc1c4f9336c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -642,6 +642,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 + bint seen_datetimetz = 0 bint seen_timedelta = 0 bint seen_int = 0 bint seen_bool = 0 @@ -675,6 +676,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen_null = 1 floats[i] = complexes[i] = fnan + elif val is NaT: + if convert_datetime: + idatetimes[i] = iNaT + seen_datetime = 1 + if 
convert_timedelta: + itimedeltas[i] = iNaT + seen_timedelta = 1 + if not (convert_datetime or convert_timedelta): + seen_object = 1 elif util.is_bool_object(val): seen_bool = 1 bools[i] = val @@ -710,9 +720,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, complexes[i] = val seen_complex = 1 elif PyDateTime_Check(val) or util.is_datetime64_object(val): + + # if we have an tz's attached then return the objects if convert_datetime: - seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + if getattr(val, 'tzinfo', None) is not None: + seen_datetimetz = 1 + break + else: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value else: seen_object = 1 break @@ -731,6 +747,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_numeric = seen_complex or seen_float or seen_int + # we try to coerce datetime w/tz but must all have the same tz + if seen_datetimetz: + if len(set([ getattr(val, 'tz', None) for val in objects ])) == 1: + from pandas import DatetimeIndex + return DatetimeIndex(objects) + seen_object = 1 + if not seen_object: if not safe: diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index d883363812ddb..52e8697abe850 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -28,9 +28,12 @@ def test_quantile(self): q = self.tsframe.quantile(0.1, axis=0) self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + tm.assert_index_equal(q.index, self.tsframe.columns) + q = self.tsframe.quantile(0.9, axis=1) - q = self.intframe.quantile(0.1) - self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + self.assertEqual(q['2000-01-17'], + percentile(self.tsframe.loc['2000-01-17'], 90)) + tm.assert_index_equal(q.index, self.tsframe.index) # test degenerate case q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) @@ -39,13 +42,13 @@ def test_quantile(self): # non-numeric 
exclusion df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) rs = df.quantile(0.5) - xp = df.median() + xp = df.median().rename(0.5) assert_series_equal(rs, xp) # axis df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1) - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) result = df.quantile([.5, .75], axis=1) @@ -59,9 +62,25 @@ def test_quantile(self): df = DataFrame([[1, 2, 3], ['a', 'b', 4]]) result = df.quantile(.5, axis=1) - expected = Series([3., 4.], index=[0, 1]) + expected = Series([3., 4.], index=[0, 1], name=0.5) assert_series_equal(result, expected) + def test_quantile_axis_mixed(self): + + # mixed on axis=1 + df = DataFrame({"A": [1, 2, 3], + "B": [2., 3., 4.], + "C": pd.date_range('20130101', periods=3), + "D": ['foo', 'bar', 'baz']}) + result = df.quantile(.5, axis=1) + expected = Series([1.5, 2.5, 3.5], name=0.5) + assert_series_equal(result, expected) + + # must raise + def f(): + df.quantile(.5, axis=1, numeric_only=False) + self.assertRaises(TypeError, f) + def test_quantile_axis_parameter(self): # GH 9543/9544 @@ -69,7 +88,7 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis=0) - expected = Series([2., 3.], index=["A", "B"]) + expected = Series([2., 3.], index=["A", "B"], name=0.5) assert_series_equal(result, expected) expected = df.quantile(.5, axis="index") @@ -77,7 +96,7 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis=1) - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) result = df.quantile(.5, axis="columns") @@ -107,22 +126,23 @@ def test_quantile_interpolation(self): # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, 
interpolation='nearest') - expected = Series([1, 2, 3], index=[1, 2, 3]) + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) + # cross-check interpolation=nearest results in original dtype exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='int64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64') assert_series_equal(result, expected) # float df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3]) + expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='float64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64') assert_series_equal(result, expected) # axis @@ -217,7 +237,8 @@ def test_quantile_datetime(self): # datetime result = df.quantile(.5, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b']) + index=['a', 'b'], + name=0.5) assert_series_equal(result, expected) # datetime w/ multi @@ -231,7 +252,8 @@ def test_quantile_datetime(self): result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), Timestamp('2011-07-02 12:00:00')], - index=[0, 1]) + index=[0, 1], + name=0.5) assert_series_equal(result, expected) result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) @@ -256,12 +278,13 @@ def test_quantile_box(self): 'C': [pd.Timedelta('1 days'), pd.Timedelta('2 days'), pd.Timedelta('3 days')]}) + res = df.quantile(0.5, numeric_only=False) - # when squeezed, result.name is explicitly reset + exp = pd.Series([pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-02', tz='US/Eastern'), 
pd.Timedelta('2 days')], - name=None, index=['A', 'B', 'C']) + name=0.5, index=['A', 'B', 'C']) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) @@ -305,7 +328,7 @@ def test_quantile_box(self): pd.Timestamp('2011-01-02', tz='US/Eastern'), pd.Timedelta('2 days'), pd.Timedelta('2 days')], - name=None, index=list('AaBbCc')) + name=0.5, index=list('AaBbCc')) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index f538fa4e90401..e0bff7fbd39e4 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -126,6 +126,14 @@ def test_quantile_interpolation_np_lt_1p9(self): interpolation='higher') def test_quantile_nan(self): + + # GH 13098 + s = pd.Series([1, 2, 3, 4, np.nan]) + result = s.quantile(0.5) + expected = 2.5 + self.assertEqual(result, expected) + + # all nan/empty cases = [Series([]), Series([np.nan, np.nan])] for s in cases: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5bd5c80f18386..583b1c7aea270 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2676,7 +2676,7 @@ def f(x, q=None, axis=0): trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected) + assert_series_equal(agg_result, agg_expected, check_names=False) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) @@ -2692,11 +2692,11 @@ def f(x, q=None, axis=0): apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected) + 
assert_frame_equal(agg_result, expected, check_names=False) assert_frame_equal(apply_result, expected) def test_size(self):