From 193f238129648e581ce18cfaa3619b604bd312d7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 25 Apr 2014 09:22:03 -0500 Subject: [PATCH] BUG: Datetime values in DataFrame.quantile() Closes #6965 previously returned nonsense --- doc/source/release.rst | 1 + pandas/core/frame.py | 44 +++++++++++++++++++++++++++----------- pandas/tests/test_frame.py | 19 ++++++++++++++++ 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index b670e6b5cea05..ec8f44f955043 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -491,6 +491,7 @@ Bug Fixes - Bug in ``unstack`` raises ``ValueError`` when ``MultiIndex`` contains ``PeriodIndex`` (:issue:`4342`) - Bug in ``boxplot`` and ``hist`` draws unnecessary axes (:issue:`6769`) - Regression in ``groupby.nth()`` for out-of-bounds indexers (:issue:`6621`) +- Bug in ``quantile`` with datetime values (:issue:`6965`) pandas 0.13.1 ------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4345154437bf5..66ba061ab35ef 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4188,23 +4188,43 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): """ per = np.asarray(q) * 100 + if not com.is_list_like(per): + per = [per] + q = [q] + squeeze = True + else: + squeeze = False + def f(arr, per): - arr = arr.values - if arr.dtype != np.float_: - arr = arr.astype(float) - arr = arr[notnull(arr)] - if len(arr) == 0: + if arr._is_datelike_mixed_type: + values = _values_from_object(arr).view('i8') + else: + values = arr.astype(float) + values = values[notnull(values)] + if len(values) == 0: return NA else: - return _quantile(arr, per) + return _quantile(values, per) data = self._get_numeric_data() if numeric_only else self - if com.is_list_like(per): - from pandas.tools.merge import concat - return concat([data.apply(f, axis=axis, args=(x,)) for x in per], - axis=1, keys=per/100.).T - else: - return data.apply(f, axis=axis, args=(per,)) + + # need to know which cols are timestamp going in so that we can + # map timestamp over them after getting the quantile. + is_dt_col = data.dtypes.map(com.is_datetime64_dtype) + is_dt_col = is_dt_col[is_dt_col].index + + quantiles = [[f(vals, x) for x in per] + for (_, vals) in data.iteritems()] + result = DataFrame(quantiles, index=data._info_axis, columns=q).T + if len(is_dt_col) > 0: + result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp) + if squeeze: + if result.shape == (1, 1): + result = result.T.iloc[:, 0] # don't want scalar + else: + result = result.T.squeeze() + result.name = None # For groupby, so it can set an index name + return result def rank(self, axis=0, numeric_only=None, method='average', na_option='keep', ascending=True, pct=False): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1e803a46d76de..7365e4be187b0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10994,6 +10994,25 @@ def test_quantile_multi(self): index=[.1, .9]) assert_frame_equal(result, expected) + def test_quantile_datetime(self): + df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]}) + + # exclude datetime + result = df.quantile(.5) + expected = Series([2.5], index=['b']) + + # datetime + result = df.quantile(.5, numeric_only=False) + expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], + index=['a', 'b']) + assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantile([.5], numeric_only=False) + expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], + index=[.5], columns=['a', 'b']) + assert_frame_equal(result, expected) + def test_cumsum(self): self.tsframe.ix[5:10, 0] = nan self.tsframe.ix[10:15, 1] = nan