Skip to content

Commit 4c42422

Browse files
[Backport #14536] BUG: DataFrame.quantile with NaNs (GH14357) (#14536)
(cherry picked from commit 52f31d4)
1 parent e7d7872 commit 4c42422

File tree

4 files changed

+166
-16
lines changed

4 files changed

+166
-16
lines changed

doc/source/whatsnew/v0.19.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Bug Fixes
6666

6767

6868
- Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`)
69-
69+
- Regression in ``DataFrame.quantile`` when missing values were present in some columns (:issue:`14357`).
7070
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
7171
- Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`)
7272
- Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`)

pandas/core/internals.py

+36-15
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from collections import defaultdict
77

88
import numpy as np
9-
from numpy import percentile as _quantile
109

1110
from pandas.core.base import PandasObject
1211

@@ -1315,16 +1314,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
13151314

13161315
values = self.get_values()
13171316
values, _, _, _ = self._try_coerce_args(values, values)
1318-
mask = isnull(self.values)
1319-
if not lib.isscalar(mask) and mask.any():
13201317

1321-
# even though this could be a 2-d mask it appears
1322-
# as a 1-d result
1323-
mask = mask.reshape(values.shape)
1324-
result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1))
1325-
values = _block_shape(values[~mask], ndim=self.ndim)
1326-
if self.ndim > 1:
1327-
values = values.reshape(result_shape)
1318+
def _nanpercentile1D(values, mask, q, **kw):
1319+
values = values[~mask]
1320+
1321+
if len(values) == 0:
1322+
if is_scalar(q):
1323+
return self._na_value
1324+
else:
1325+
return np.array([self._na_value] * len(q),
1326+
dtype=values.dtype)
1327+
1328+
return np.percentile(values, q, **kw)
1329+
1330+
def _nanpercentile(values, q, axis, **kw):
1331+
1332+
mask = isnull(self.values)
1333+
if not is_scalar(mask) and mask.any():
1334+
if self.ndim == 1:
1335+
return _nanpercentile1D(values, mask, q, **kw)
1336+
else:
1337+
# for nonconsolidatable blocks mask is 1D, but values 2D
1338+
if mask.ndim < values.ndim:
1339+
mask = mask.reshape(values.shape)
1340+
if axis == 0:
1341+
values = values.T
1342+
mask = mask.T
1343+
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
1344+
in zip(list(values), list(mask))]
1345+
result = np.array(result, dtype=values.dtype, copy=False).T
1346+
return result
1347+
else:
1348+
return np.percentile(values, q, axis=axis, **kw)
13281349

13291350
from pandas import Float64Index
13301351
is_empty = values.shape[axis] == 0
@@ -1343,13 +1364,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
13431364
else:
13441365

13451366
try:
1346-
result = _quantile(values, np.array(qs) * 100,
1347-
axis=axis, **kw)
1367+
result = _nanpercentile(values, np.array(qs) * 100,
1368+
axis=axis, **kw)
13481369
except ValueError:
13491370

13501371
# older numpies don't handle an array for q
1351-
result = [_quantile(values, q * 100,
1352-
axis=axis, **kw) for q in qs]
1372+
result = [_nanpercentile(values, q * 100,
1373+
axis=axis, **kw) for q in qs]
13531374

13541375
result = np.array(result, copy=False)
13551376
if self.ndim > 1:
@@ -1368,7 +1389,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
13681389
else:
13691390
result = np.array([self._na_value] * len(self))
13701391
else:
1371-
result = _quantile(values, qs * 100, axis=axis, **kw)
1392+
result = _nanpercentile(values, qs * 100, axis=axis, **kw)
13721393

13731394
ndim = getattr(result, 'ndim', None) or 0
13741395
result = self._try_coerce_result(result)

pandas/tests/frame/test_quantile.py

+97
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,11 @@ def test_quantile_datetime(self):
262262
index=[0.5], columns=[0, 1])
263263
assert_frame_equal(result, expected)
264264

265+
# empty when numeric_only=True
266+
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
267+
# result = df[['a', 'c']].quantile(.5)
268+
# result = df[['a', 'c']].quantile([.5])
269+
265270
def test_quantile_invalid(self):
266271
msg = 'percentiles should all be in the interval \\[0, 1\\]'
267272
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
@@ -340,3 +345,95 @@ def test_quantile_box(self):
340345
pd.Timedelta('2 days')]],
341346
index=[0.5], columns=list('AaBbCc'))
342347
tm.assert_frame_equal(res, exp)
348+
349+
def test_quantile_nan(self):
350+
351+
# GH 14357 - float block where some cols have missing values
352+
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
353+
df.iloc[-1, 1] = np.nan
354+
355+
res = df.quantile(0.5)
356+
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
357+
tm.assert_series_equal(res, exp)
358+
359+
res = df.quantile([0.5, 0.75])
360+
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
361+
tm.assert_frame_equal(res, exp)
362+
363+
res = df.quantile(0.5, axis=1)
364+
exp = Series(np.arange(1.0, 6.0), name=0.5)
365+
tm.assert_series_equal(res, exp)
366+
367+
res = df.quantile([0.5, 0.75], axis=1)
368+
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
369+
tm.assert_frame_equal(res, exp)
370+
371+
# full-nan column
372+
df['b'] = np.nan
373+
374+
res = df.quantile(0.5)
375+
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
376+
tm.assert_series_equal(res, exp)
377+
378+
res = df.quantile([0.5, 0.75])
379+
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
380+
index=[0.5, 0.75])
381+
tm.assert_frame_equal(res, exp)
382+
383+
def test_quantile_nat(self):
384+
385+
# full NaT column
386+
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
387+
388+
res = df.quantile(0.5, numeric_only=False)
389+
exp = Series([pd.NaT], index=['a'], name=0.5)
390+
tm.assert_series_equal(res, exp)
391+
392+
res = df.quantile([0.5], numeric_only=False)
393+
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
394+
tm.assert_frame_equal(res, exp)
395+
396+
# mixed non-null / full null column
397+
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
398+
pd.Timestamp('2012-01-02'),
399+
pd.Timestamp('2012-01-03')],
400+
'b': [pd.NaT, pd.NaT, pd.NaT]})
401+
402+
res = df.quantile(0.5, numeric_only=False)
403+
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
404+
name=0.5)
405+
tm.assert_series_equal(res, exp)
406+
407+
res = df.quantile([0.5], numeric_only=False)
408+
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
409+
columns=['a', 'b'])
410+
tm.assert_frame_equal(res, exp)
411+
412+
def test_quantile_empty(self):
413+
414+
# floats
415+
df = DataFrame(columns=['a', 'b'], dtype='float64')
416+
417+
res = df.quantile(0.5)
418+
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
419+
tm.assert_series_equal(res, exp)
420+
421+
res = df.quantile([0.5])
422+
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
423+
tm.assert_frame_equal(res, exp)
424+
425+
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
426+
# res = df.quantile(0.5, axis=1)
427+
# res = df.quantile([0.5], axis=1)
428+
429+
# ints
430+
df = DataFrame(columns=['a', 'b'], dtype='int64')
431+
432+
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
433+
# res = df.quantile(0.5)
434+
435+
# datetimes
436+
df = DataFrame(columns=['a', 'b'], dtype='datetime64')
437+
438+
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
439+
# res = df.quantile(0.5, numeric_only=False)

pandas/tests/series/test_quantile.py

+32
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,35 @@ def test_quantile_nat(self):
184184

185185
res = Series([pd.NaT, pd.NaT]).quantile([0.5])
186186
tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5]))
187+
188+
def test_quantile_empty(self):
189+
190+
# floats
191+
s = Series([], dtype='float64')
192+
193+
res = s.quantile(0.5)
194+
self.assertTrue(np.isnan(res))
195+
196+
res = s.quantile([0.5])
197+
exp = Series([np.nan], index=[0.5])
198+
tm.assert_series_equal(res, exp)
199+
200+
# int
201+
s = Series([], dtype='int64')
202+
203+
res = s.quantile(0.5)
204+
self.assertTrue(np.isnan(res))
205+
206+
res = s.quantile([0.5])
207+
exp = Series([np.nan], index=[0.5])
208+
tm.assert_series_equal(res, exp)
209+
210+
# datetime
211+
s = Series([], dtype='datetime64[ns]')
212+
213+
res = s.quantile(0.5)
214+
self.assertTrue(res is pd.NaT)
215+
216+
res = s.quantile([0.5])
217+
exp = Series([pd.NaT], index=[0.5])
218+
tm.assert_series_equal(res, exp)

0 commit comments

Comments
 (0)