diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a81ab6ed0311c..7132a25f72870 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -63,7 +63,7 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) - +- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`). - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac29..a96ee64b4cfb6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,7 +6,6 @@ from collections import defaultdict import numpy as np -from numpy import percentile as _quantile from pandas.core.base import PandasObject @@ -1315,16 +1314,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) - mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): - # even though this could be a 2-d mask it appears - # as a 1-d result - mask = mask.reshape(values.shape) - result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) - values = _block_shape(values[~mask], ndim=self.ndim) - if self.ndim > 1: - values = values.reshape(result_shape) + def _nanpercentile1D(values, mask, q, **kw): + values = values[~mask] + + if len(values) == 0: + if is_scalar(q): + return self._na_value + else: + return np.array([self._na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, **kw) + + def _nanpercentile(values, q, axis, **kw): + + mask = isnull(self.values) + if not is_scalar(mask) and mask.any(): + if self.ndim == 1: + return _nanpercentile1D(values, mask, q, **kw) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return np.percentile(values, q, axis=axis, **kw) from pandas import Float64Index is_empty = values.shape[axis] == 0 @@ -1343,13 +1364,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: try: - result = _quantile(values, np.array(qs) * 100, - axis=axis, **kw) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, **kw) except ValueError: # older numpies don't handle an array for q - result = [_quantile(values, q * 100, - axis=axis, **kw) for q in qs] + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] result = np.array(result, copy=False) if self.ndim > 1: @@ -1368,7 +1389,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: result = np.array([self._na_value] * len(self)) else: - result = _quantile(values, qs * 100, axis=axis, **kw) + result = _nanpercentile(values, qs * 100, axis=axis, **kw) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 52e8697abe850..22414a6ba8a53 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -262,6 +262,11 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + def test_quantile_invalid(self): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: @@ -340,3 +345,95 @@ def test_quantile_box(self): pd.Timedelta('2 days')]], index=[0.5], columns=list('AaBbCc')) tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df['b'] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, + index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=['a'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame({'a': [pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03')], + 'b': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], + name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], + columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty(self): + + # floats + df = DataFrame(columns=['a', 'b'], dtype='float64') + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=['a', 'b'], dtype='int64') + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=['a', 'b'], dtype='datetime64') + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 7d2517987e526..76db6c90a685f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -184,3 +184,35 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + + def test_quantile_empty(self): + + # floats + s = Series([], dtype='float64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # int + s = Series([], dtype='int64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # datetime + s = Series([], dtype='datetime64[ns]') + + res = s.quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = s.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5]) + tm.assert_series_equal(res, exp)