From 87fa4ea4b702bbff917ea06f3d56a313bc0baa13 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Mon, 16 Sep 2019 17:17:37 +0530 Subject: [PATCH 01/13] Updated Reshape.py to validate `columns` in `get_dummies` Added validation for the argument passed to columns --- pandas/core/reshape/reshape.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c32ca47c19160..1ed3ac1a17d77 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -863,6 +863,8 @@ def get_dummies( # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes(include=dtypes_to_encode) + elif not is_list_like(columns): + raise TypeError("Input must be a list-like of list-likes") else: data_to_encode = data[columns] From 09a4f5ef2098a1839ebd9e4753b865c990b3f0dd Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Mon, 16 Sep 2019 20:35:29 +0530 Subject: [PATCH 02/13] added tests for the column validation The code is inspired from a similar tests in `RaiseError`: https://github.com/pandas-dev/pandas/blob/master/pandas/tests/arrays/categorical/test_operators.py#L97 Valid `list_like`: https://github.com/pandas-dev/pandas/blob/master/pandas/tests/reshape/test_pivot.py#L706 --- pandas/tests/reshape/test_reshape.py | 66 ++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 149930059d868..b7deda4ccbcc6 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -608,6 +608,72 @@ def test_get_dummies_all_sparse(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "values", + [ + ["baz", "zoo"], + np.array(["baz", "zoo"]), + pd.Series(["baz", "zoo"]), + pd.Index(["baz", "zoo"]), + ], + ) + @pytest.mark.parametrize("method", [True]) + def 
test_get_dummies_with_list_like_values(self, values, method): + # issue #17160 + df = pd.DataFrame( + { + "bar": [1, 2, 3, 4, 5, 6], + "foo": ["one", "one", "one", "two", "two", "two"], + "baz": ["A", "B", "C", "A", "B", "C"], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + if method: + result = pd.get_dummies(df, columns=values,dtype="int64") + else: + result = pd.get_dummies(df, columns=values,dtype="int64") + + data = [[1, 'one', 1, 0, 0, 0, 0, 0, 1, 0, 0], + [2, 'one', 0, 1, 0, 0, 0, 0, 0, 1, 0], + [3, 'one', 0, 0, 1, 0, 0, 0, 0, 0, 1], + [4, 'two', 1, 0, 0, 1, 0, 0, 0, 0, 0], + [5, 'two', 0, 1, 0, 0, 0, 1, 0, 0, 0], + [6, 'two', 0, 0, 1, 0, 1, 0, 0, 0, 0]] + columns = ['bar', 'foo', 'baz_A', 'baz_B', 'baz_C', 'zoo_q', + 'zoo_t', 'zoo_w', 'zoo_x', 'zoo_y', 'zoo_z'] + expected = DataFrame(data=data, columns=columns) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + "baz", + "zoo", + ], + ) + @pytest.mark.parametrize("method", [True]) + def test_get_dummies_with_string_values(self, values, method): + # issue #17160 + df = pd.DataFrame( + { + "bar": [1, 2, 3, 4, 5, 6], + "foo": ["one", "one", "one", "two", "two", "two"], + "baz": ["A", "B", "C", "A", "B", "C"], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + msg="Input must be a list-like of list-likes" + + with pytest.raises(TypeError, match=msg): + if method: + result = pd.get_dummies(df, columns=values) + else: + result = pd.get_dummies(df, columns=values) + + + class TestCategoricalReshape: def test_reshaping_multi_index_categorical(self): From ef2d57737c771abd2335255661c09f70b4130d26 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Mon, 16 Sep 2019 20:43:50 +0530 Subject: [PATCH 03/13] updated test_reshape.py with PEP-8 code rec --- pandas/tests/reshape/test_reshape.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py 
index b7deda4ccbcc6..2487fda35f260 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -630,9 +630,9 @@ def test_get_dummies_with_list_like_values(self, values, method): ) if method: - result = pd.get_dummies(df, columns=values,dtype="int64") + result = pd.get_dummies(df, columns=values, dtype="int64") else: - result = pd.get_dummies(df, columns=values,dtype="int64") + result = pd.get_dummies(df, columns=values, dtype="int64") data = [[1, 'one', 1, 0, 0, 0, 0, 0, 1, 0, 0], [2, 'one', 0, 1, 0, 0, 0, 0, 0, 1, 0], @@ -640,8 +640,8 @@ def test_get_dummies_with_list_like_values(self, values, method): [4, 'two', 1, 0, 0, 1, 0, 0, 0, 0, 0], [5, 'two', 0, 1, 0, 0, 0, 1, 0, 0, 0], [6, 'two', 0, 0, 1, 0, 1, 0, 0, 0, 0]] - columns = ['bar', 'foo', 'baz_A', 'baz_B', 'baz_C', 'zoo_q', - 'zoo_t', 'zoo_w', 'zoo_x', 'zoo_y', 'zoo_z'] + columns = ['bar', 'foo', 'baz_A', 'baz_B', 'baz_C', 'zoo_q', 'zoo_t', 'zoo_w' + , 'zoo_x', 'zoo_y', 'zoo_z'] expected = DataFrame(data=data, columns=columns) tm.assert_frame_equal(result, expected) @@ -664,15 +664,13 @@ def test_get_dummies_with_string_values(self, values, method): } ) - msg="Input must be a list-like of list-likes" + msg = "Input must be a list-like of list-likes" with pytest.raises(TypeError, match=msg): if method: - result = pd.get_dummies(df, columns=values) + pd.get_dummies(df, columns=values) else: - result = pd.get_dummies(df, columns=values) - - + pd.get_dummies(df, columns=values) class TestCategoricalReshape: From faa8f076b23dc6973b3f0d4be7981d40d8fe9e2a Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Mon, 16 Sep 2019 22:34:39 +0530 Subject: [PATCH 04/13] code formatted using black --- pandas/tests/reshape/test_reshape.py | 37 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2487fda35f260..9077596cc2f52 100644 --- 
a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -634,24 +634,31 @@ def test_get_dummies_with_list_like_values(self, values, method): else: result = pd.get_dummies(df, columns=values, dtype="int64") - data = [[1, 'one', 1, 0, 0, 0, 0, 0, 1, 0, 0], - [2, 'one', 0, 1, 0, 0, 0, 0, 0, 1, 0], - [3, 'one', 0, 0, 1, 0, 0, 0, 0, 0, 1], - [4, 'two', 1, 0, 0, 1, 0, 0, 0, 0, 0], - [5, 'two', 0, 1, 0, 0, 0, 1, 0, 0, 0], - [6, 'two', 0, 0, 1, 0, 1, 0, 0, 0, 0]] - columns = ['bar', 'foo', 'baz_A', 'baz_B', 'baz_C', 'zoo_q', 'zoo_t', 'zoo_w' - , 'zoo_x', 'zoo_y', 'zoo_z'] + data = [ + [1, "one", 1, 0, 0, 0, 0, 0, 1, 0, 0], + [2, "one", 0, 1, 0, 0, 0, 0, 0, 1, 0], + [3, "one", 0, 0, 1, 0, 0, 0, 0, 0, 1], + [4, "two", 1, 0, 0, 1, 0, 0, 0, 0, 0], + [5, "two", 0, 1, 0, 0, 0, 1, 0, 0, 0], + [6, "two", 0, 0, 1, 0, 1, 0, 0, 0, 0], + ] + columns = [ + "bar", + "foo", + "baz_A", + "baz_B", + "baz_C", + "zoo_q", + "zoo_t", + "zoo_w", + "zoo_x", + "zoo_y", + "zoo_z", + ] expected = DataFrame(data=data, columns=columns) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "values", - [ - "baz", - "zoo", - ], - ) + @pytest.mark.parametrize("values", ["baz", "zoo"]) @pytest.mark.parametrize("method", [True]) def test_get_dummies_with_string_values(self, values, method): # issue #17160 From 81c2597df7180eaa4f7c8c69ef5c9f7122c6c5d2 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Tue, 17 Sep 2019 19:41:46 +0530 Subject: [PATCH 05/13] updated test_reshape.py Removed method as there is only 1 type of call for the function `get_dummies` --- pandas/tests/reshape/test_reshape.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 9077596cc2f52..efb3567ef8892 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -617,9 +617,8 @@ def 
test_get_dummies_all_sparse(self): pd.Index(["baz", "zoo"]), ], ) - @pytest.mark.parametrize("method", [True]) - def test_get_dummies_with_list_like_values(self, values, method): - # issue #17160 + def test_get_dummies_with_list_like_values(self, values): + # issue #28383 df = pd.DataFrame( { "bar": [1, 2, 3, 4, 5, 6], @@ -629,10 +628,7 @@ def test_get_dummies_with_list_like_values(self, values, method): } ) - if method: - result = pd.get_dummies(df, columns=values, dtype="int64") - else: - result = pd.get_dummies(df, columns=values, dtype="int64") + result = pd.get_dummies(df, columns=values, dtype="int64") data = [ [1, "one", 1, 0, 0, 0, 0, 0, 1, 0, 0], @@ -659,9 +655,8 @@ def test_get_dummies_with_list_like_values(self, values, method): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("values", ["baz", "zoo"]) - @pytest.mark.parametrize("method", [True]) - def test_get_dummies_with_string_values(self, values, method): - # issue #17160 + def test_get_dummies_with_string_values(self, values): + # issue #28383 df = pd.DataFrame( { "bar": [1, 2, 3, 4, 5, 6], @@ -674,10 +669,7 @@ def test_get_dummies_with_string_values(self, values, method): msg = "Input must be a list-like of list-likes" with pytest.raises(TypeError, match=msg): - if method: - pd.get_dummies(df, columns=values) - else: - pd.get_dummies(df, columns=values) + pd.get_dummies(df, columns=values) class TestCategoricalReshape: From 3aa8749744e8b983232195ee0247a506bf5203f4 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Tue, 17 Sep 2019 19:49:36 +0530 Subject: [PATCH 06/13] updated file based on the changes in PR Asserting list-like values for `columns` parameter in `get_dummies` --- doc/source/whatsnew/v1.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c78e27f098f13..907cb432ed60e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ 
b/doc/source/whatsnew/v1.0.0.rst @@ -218,7 +218,8 @@ Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) -- +- Added test to assert the :func:`get_dummies` raises TypeError message when the value to `columns` isn't a list-like value (:issue:`28383`) +- Sparse ^^^^^^ From af99037159802101b67fc139c78fb06a1aa04b00 Mon Sep 17 00:00:00 2001 From: Rajat <22280243+R1j1t@users.noreply.github.com> Date: Tue, 17 Sep 2019 22:51:38 +0530 Subject: [PATCH 07/13] Update doc/source/whatsnew/v1.0.0.rst Co-Authored-By: Tom Augspurger --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 907cb432ed60e..18c1c93992cb0 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -218,7 +218,7 @@ Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) -- Added test to assert the :func:`get_dummies` raises TypeError message when the value to `columns` isn't a list-like value (:issue:`28383`) +- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) - Sparse From 235346261f0b0f23d3b0ca2c79af6f241ea024a2 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Tue, 17 Sep 2019 22:52:44 +0530 Subject: [PATCH 08/13] updated error message based on comments received --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1ed3ac1a17d77..15da09b5a8135 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -864,7 +864,7 @@ def get_dummies( if columns is None: data_to_encode = data.select_dtypes(include=dtypes_to_encode) elif not is_list_like(columns): - raise TypeError("Input must 
be a list-like of list-likes") + raise TypeError("Input must be a list-like for parameter `columns`") else: data_to_encode = data[columns] From 7d681819c6821ac03156f4115d03c47ac8cba688 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Wed, 18 Sep 2019 00:23:05 +0530 Subject: [PATCH 09/13] updated the test with the new error message --- pandas/tests/reshape/test_reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index efb3567ef8892..5712b0553f528 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -666,7 +666,7 @@ def test_get_dummies_with_string_values(self, values): } ) - msg = "Input must be a list-like of list-likes" + msg = "Input must be a list-like for parameter `columns`" with pytest.raises(TypeError, match=msg): pd.get_dummies(df, columns=values) From ef1b9cd729ee82bf8198e58acc9fec7d74177341 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Fri, 20 Sep 2019 12:50:13 +0530 Subject: [PATCH 10/13] Removed list-like object check based on the comments, i have removed the list-like objects check --- pandas/tests/reshape/test_reshape.py | 46 ---------------------------- 1 file changed, 46 deletions(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 5712b0553f528..465529a197d7a 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -608,52 +608,6 @@ def test_get_dummies_all_sparse(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "values", - [ - ["baz", "zoo"], - np.array(["baz", "zoo"]), - pd.Series(["baz", "zoo"]), - pd.Index(["baz", "zoo"]), - ], - ) - def test_get_dummies_with_list_like_values(self, values): - # issue #28383 - df = pd.DataFrame( - { - "bar": [1, 2, 3, 4, 5, 6], - "foo": ["one", "one", "one", "two", "two", "two"], - 
"baz": ["A", "B", "C", "A", "B", "C"], - "zoo": ["x", "y", "z", "q", "w", "t"], - } - ) - - result = pd.get_dummies(df, columns=values, dtype="int64") - - data = [ - [1, "one", 1, 0, 0, 0, 0, 0, 1, 0, 0], - [2, "one", 0, 1, 0, 0, 0, 0, 0, 1, 0], - [3, "one", 0, 0, 1, 0, 0, 0, 0, 0, 1], - [4, "two", 1, 0, 0, 1, 0, 0, 0, 0, 0], - [5, "two", 0, 1, 0, 0, 0, 1, 0, 0, 0], - [6, "two", 0, 0, 1, 0, 1, 0, 0, 0, 0], - ] - columns = [ - "bar", - "foo", - "baz_A", - "baz_B", - "baz_C", - "zoo_q", - "zoo_t", - "zoo_w", - "zoo_x", - "zoo_y", - "zoo_z", - ] - expected = DataFrame(data=data, columns=columns) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("values", ["baz", "zoo"]) def test_get_dummies_with_string_values(self, values): # issue #28383 From 4da4149468b98cfe30bbdbc1fb2924921db6d232 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 23 Sep 2019 13:17:35 -0700 Subject: [PATCH 11/13] whitespace fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9fff3bd769e04..cc1096941391d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -279,7 +279,7 @@ Reshaping - Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`) - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) -- +- Sparse ^^^^^^ From 9cd8d40a4dadd1a48a6d79b8aedbd6ff35d4c486 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Sat, 12 Oct 2019 12:34:19 +0530 Subject: [PATCH 12/13] Removed trailing space --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 
14dec1baefe27..e6c2db152d75c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -346,7 +346,7 @@ Reshaping - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). -- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) +- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) Sparse ^^^^^^ From f266f47dc25aa23f8f20a3bc3637ae0c0e8d59d5 Mon Sep 17 00:00:00 2001 From: R1j1t <22280243+R1j1t@users.noreply.github.com> Date: Sat, 12 Oct 2019 12:34:46 +0530 Subject: [PATCH 13/13] Removed redundant value in test --- pandas/tests/reshape/test_reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2bd4fd4e2bd9c..21f4be8f188f4 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -608,7 +608,7 @@ def test_get_dummies_all_sparse(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("values", ["baz", "zoo"]) + @pytest.mark.parametrize("values", ["baz"]) def test_get_dummies_with_string_values(self, values): # issue #28383 df = pd.DataFrame(