From 55806cffc8d6e2497dca1d4273d9deda19d98357 Mon Sep 17 00:00:00 2001 From: Luca Scarabello Date: Fri, 17 Feb 2017 16:07:21 +0100 Subject: [PATCH 1/6] BUG: pd.cut with bins=1 and input all 0s The special case of running pd.cut() with bins=1 and an input containing all 0s raises a ValueError --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/tools/test_tile.py | 5 +++++ pandas/tools/tile.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ece9ff4a1adff..cd30923e738ea 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -670,7 +670,7 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`, :issue:`15424`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - +- Bug in ``pd.cut()`` single bin on all 0s array raises ``ValueError`` (:issue:`15428`) - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index de44eadc15751..6da4a78c5c91a 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -297,6 +297,11 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + # issue 15428 + s = Series([0., 0.]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index feb4d4bfd5044..f953bf4525b56 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -104,8 +104,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, mn, mx = 
[mi + 0.0 for mi in rng] if mn == mx: # adjust end points before binning - mn -= .001 * abs(mn) - mx += .001 * abs(mx) + mn -= .001 * abs(mn) if mn != 0 else .001 + mx += .001 * abs(mx) if mx != 0 else .001 bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) From f56a27fe0ee23fea61a7d00846b528d44846fe49 Mon Sep 17 00:00:00 2001 From: Luca Scarabello Date: Fri, 17 Feb 2017 19:35:40 +0100 Subject: [PATCH 2/6] Issue #15431 BUG: pd.qcut with q=1 and input with identical values --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/tools/test_tile.py | 48 +++++++++++++++++++++++++++++++++ pandas/tools/tile.py | 7 +++++ 3 files changed, 56 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cd30923e738ea..eeeeac4c21d88 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -671,6 +671,7 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a numpy array (:issue:`15434`) - Bug in ``Rolling.quantile`` function that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in ``pd.cut()`` single bin on all 0s array raises ``ValueError`` (:issue:`15428`) +- Bug in ``pd.qcut()`` single quantile and array with identical values raises ``ValueError`` (:issue:`15431`) - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 6da4a78c5c91a..7803f08d28776 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -285,6 +285,36 @@ def test_qcut_duplicates_bin(self): # invalid self.assertRaises(ValueError, qcut, values, 3, duplicates='foo') + def test_single_quantile(self): + # issue 15431 + expected = Series([0, 0]) + + s = Series([9., 9.]) + result = qcut(s, 1, labels=False) + 
tm.assert_series_equal(result, expected) + + s = Series([-9., -9.]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([0., 0.]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + expected = Series([0]) + + s = Series([9]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([-9]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([0]) + result = qcut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) @@ -297,11 +327,29 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + expected = Series([0]) + + s = Series([9]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + + s = Series([-9]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + # issue 15428 + expected = Series([0, 0]) + s = Series([0., 0.]) result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + expected = Series([0]) + + s = Series([0]) + result = cut(s, 1, labels=False) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index f953bf4525b56..d38f6b0641502 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -189,6 +189,13 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): else: quantiles = q bins = algos.quantile(x, quantiles) + + # fix special case: q=1 and all identical values + if q == 1 and len(bins) == 2 and bins[0] == bins[1]: + bins = np.asarray(bins, np.float64) + bins[0] -= .001 * abs(bins[0]) if bins[0] != 0 else .001 + bins[1] += .001 * abs(bins[1]) if bins[1] != 0 else .001 + fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, 
include_lowest=True, dtype=dtype, duplicates=duplicates) From b7d92dc28b2c81b58b38d1e7fef442fae6422df8 Mon Sep 17 00:00:00 2001 From: Luca Date: Sat, 18 Feb 2017 14:28:52 +0100 Subject: [PATCH 3/6] Added 'allow' duplicates option to _bins_to_cuts --- pandas/tests/tools/test_tile.py | 28 +++++++++++++++++++++++++--- pandas/tools/tile.py | 13 +++++-------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 7803f08d28776..11b242bc06e15 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,7 +3,7 @@ import numpy as np from pandas.compat import zip -from pandas import Series, Index +from pandas import Series, Index, Categorical import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com @@ -239,7 +239,6 @@ def test_qcut_binning_issues(self): self.assertTrue(ep <= sn) def test_cut_return_categorical(self): - from pandas import Categorical s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = cut(s, 3) exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -249,7 +248,6 @@ def test_cut_return_categorical(self): tm.assert_series_equal(res, exp) def test_qcut_return_categorical(self): - from pandas import Categorical s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = qcut(s, [0, 0.333, 0.666, 1]) exp = Series(Categorical.from_codes([0, 0, 0, 1, 1, 1, 2, 2, 2], @@ -292,28 +290,52 @@ def test_single_quantile(self): s = Series([9., 9.]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[9, 9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) s = Series([-9., -9.]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[-9, -9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) s = 
Series([0., 0.]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0, 0], ["[0, 0]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) expected = Series([0]) s = Series([9]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[9, 9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) s = Series([-9]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[-9, -9]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) s = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) + result = qcut(s, 1) + exp_lab = Series(Categorical.from_codes([0], ["[0, 0]"], + ordered=True)) + tm.assert_series_equal(result, exp_lab) def test_single_bin(self): # issue 14652 diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index d38f6b0641502..63d4cf4acafb7 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -186,16 +186,13 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): if is_integer(q): quantiles = np.linspace(0, 1, q + 1) + + if q == 1: + duplicates = 'allow' else: quantiles = q bins = algos.quantile(x, quantiles) - # fix special case: q=1 and all identical values - if q == 1 and len(bins) == 2 and bins[0] == bins[1]: - bins = np.asarray(bins, np.float64) - bins[0] -= .001 * abs(bins[0]) if bins[0] != 0 else .001 - bins[1] += .001 * abs(bins[1]) if bins[1] != 0 else .001 - fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype, duplicates=duplicates) @@ -208,7 +205,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): - if duplicates not in ['raise', 'drop']: + if duplicates not in 
['raise', 'drop', 'allow']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") @@ -218,7 +215,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, raise ValueError("Bin edges must be unique: {}.\nYou " "can drop duplicate edges by setting " "the 'duplicates' kwarg".format(repr(bins))) - else: + elif duplicates == 'drop': bins = unique_bins side = 'left' if right else 'right' From 692503a4f2b1937bbd10b4a831a8bf02b03d9d39 Mon Sep 17 00:00:00 2001 From: Luca Scarabello Date: Thu, 2 Mar 2017 09:59:38 +0100 Subject: [PATCH 4/6] Improved solution: using same approach as pd.cut --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/tile.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index eeeeac4c21d88..c405615b00bca 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -677,6 +677,7 @@ Bug Fixes + - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series` (:issue:`14320`) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 63d4cf4acafb7..7a77b32656f41 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -185,14 +185,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): x, dtype = _coerce_to_type(x) if is_integer(q): - quantiles = np.linspace(0, 1, q + 1) + if x.size == 0: + raise ValueError('Cannot qcut empty array') - if q == 1: + rng = (nanops.nanmin(x), nanops.nanmax(x)) + if rng[0] == rng[1] and q == 1: duplicates = 'allow' + + quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype, duplicates=duplicates) From 
def84bac25be0428d1d0978418031a9fe573c2ee Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 8 Mar 2017 00:40:52 +0100 Subject: [PATCH 5/6] Yet another implementation attempt --- pandas/tools/tile.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 7a77b32656f41..034ecd72bd41e 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -185,13 +185,6 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): x, dtype = _coerce_to_type(x) if is_integer(q): - if x.size == 0: - raise ValueError('Cannot qcut empty array') - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - if rng[0] == rng[1] and q == 1: - duplicates = 'allow' - quantiles = np.linspace(0, 1, q + 1) else: quantiles = q @@ -208,17 +201,17 @@ def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): - if duplicates not in ['raise', 'drop', 'allow']: + if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") unique_bins = algos.unique(bins) - if len(unique_bins) < len(bins): + if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == 'raise': raise ValueError("Bin edges must be unique: {}.\nYou " "can drop duplicate edges by setting " "the 'duplicates' kwarg".format(repr(bins))) - elif duplicates == 'drop': + else: bins = unique_bins side = 'left' if right else 'right' From 12489876c0c3c125b209886f5ccd43c461090698 Mon Sep 17 00:00:00 2001 From: Luca Date: Wed, 8 Mar 2017 00:47:13 +0100 Subject: [PATCH 6/6] rebased on master --- doc/source/whatsnew/v0.20.0.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c405615b00bca..8dbc5673f1bcd 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -672,12 +672,10 @@ Bug Fixes - Bug in ``Rolling.quantile`` function 
that caused a segmentation fault when called with a quantile value outside of the range [0, 1] (:issue:`15463`) - Bug in ``pd.cut()`` single bin on all 0s array raises ``ValueError`` (:issue:`15428`) - Bug in ``pd.qcut()`` single quantile and array with identical values raises ``ValueError`` (:issue:`15431`) - - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`) - - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series` (:issue:`14320`)