From 0997bc70a8c2ad66d9eea8cbac847e77234f8084 Mon Sep 17 00:00:00 2001 From: saucoide Date: Thu, 22 Apr 2021 02:33:56 +0200 Subject: [PATCH 1/9] BUG: Check for null values when infering excel in read_clipboard GH41108 Stripping whitespace on read_clipboard causes data copied from excel to lose the tab separator between the first column and the nexts, shifting the data into the wrong column This just removes the .lstrip() except for the first line containing headers, to avoid breaking header-less index in csv files --- pandas/io/clipboards.py | 5 ++++- pandas/tests/io/test_clipboard.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 00a99eb8a4480..3c48c53671a85 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -58,7 +58,10 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = {x.lstrip().count("\t") for x in lines} + counts = { + x.count("\t") if i > 0 else x.lstrip().count("\t") + for i, x in enumerate(lines) + } if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = "\t" diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 45d9ad430aa43..8b2d6f65ac3bb 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -220,6 +220,21 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): # excel data is parsed correctly assert df.iloc[1][1] == "Harry Carney" + # a null on the first column works + text = dedent( + """ + John James Charlie Mingus + 1 2 + Harry Carney + 7 Carl Miney + """.strip() + ) + mock_clipboard[request.node.name] = text + df = read_clipboard(**clip_kwargs) + + # excel data is parsed correctly + assert df.iloc[1][1] == "Harry Carney" + # having diff tab counts doesn't trigger it text = dedent( """ From 8310492aba3877ddd3d55c9f63a6f6a1b858f1b0 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sat, 24 Apr 2021 00:08:08 +0200 Subject: [PATCH 2/9] separated test & added whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/io/test_clipboard.py | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d357e4a633347..0675dbb827bab 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -925,7 +925,7 @@ I/O - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) -- +- Bug in :func:`read_clipboard` when copying from excel and the first column contains null values (:issue:`41108`) Period ^^^^^^ diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 8b2d6f65ac3bb..e4367341bfb82 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -220,21 +220,6 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): # excel data is parsed correctly assert df.iloc[1][1] == "Harry Carney" - # a null on the first column works - text = dedent( - """ - John James Charlie Mingus - 1 2 - Harry Carney - 7 Carl Miney - """.strip() - ) - mock_clipboard[request.node.name] = text - df = read_clipboard(**clip_kwargs) - - # excel data is parsed correctly - assert df.iloc[1][1] == "Harry Carney" - # having diff tab counts doesn't trigger it text = dedent( """ @@ -258,6 +243,17 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): tm.assert_frame_equal(res, exp) + # Tests that nulls on the first column do not trip infering excel format + def test_infer_excel_with_nulls(self, request, mock_clipboard): + # GH41108 + text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen" + + mock_clipboard[request.node.name] = text + df = read_clipboard() + + # excel data is parsed correctly + assert df.iloc[1][1] == "blue" + def test_invalid_encoding(self, df): msg = "clipboard only supports utf-8 encoding" # test case for testing invalid encoding From a08502df6c19e8dedfbc72f9e01142aeb1d2f3a3 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sat, 24 Apr 2021 13:09:48 +0200 Subject: [PATCH 3/9] test compares the whole dataframe & reformatted --- pandas/io/clipboards.py | 3 +-- pandas/tests/io/test_clipboard.py | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 3c48c53671a85..6d1990686d402 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -59,8 +59,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # 1 3 4 counts = { - x.count("\t") if i > 0 else x.lstrip().count("\t") - for i, x in enumerate(lines) + x.count("\t") if i > 0 else x.lstrip().count("\t") for i, x in enumerate(lines) } if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = "\t" diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index e4367341bfb82..37f3aa8aed906 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -243,16 +243,19 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): tm.assert_frame_equal(res, exp) - # Tests that nulls on the first column do not trip infering excel format def test_infer_excel_with_nulls(self, request, mock_clipboard): # GH41108 text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen" mock_clipboard[request.node.name] = text df = read_clipboard() + df_expected = DataFrame( + data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]} + ) # excel data is parsed correctly assert df.iloc[1][1] == "blue" + assert df.equals(df_expected) def test_invalid_encoding(self, df): msg = "clipboard only supports utf-8 encoding" From d468588c677debe9e2bda5dd1a1d57f772b252b9 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sat, 24 Apr 2021 23:32:21 +0200 Subject: [PATCH 4/9] change test to use tm.assert_frame_equal --- pandas/tests/io/test_clipboard.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 37f3aa8aed906..13499a9e28ec6 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -254,8 +254,7 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): ) # excel data is parsed correctly - assert df.iloc[1][1] == "blue" - assert df.equals(df_expected) + tm.assert_frame_equal(df, df_expected) def test_invalid_encoding(self, df): msg = "clipboard only supports utf-8 encoding" From 7438ab92ef3cd5b9f21b27f5f230894f10e3f307 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sun, 25 Apr 2021 06:41:09 +0200 Subject: [PATCH 5/9] Check for index columns and pass through the kwarg to read_csv Use the number of leading tab's in the first row when we are copying from excel as the value for `index_col` --- pandas/io/clipboards.py | 10 +++++++--- pandas/tests/io/test_clipboard.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 6d1990686d402..cf75b6304cd7f 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -58,11 +58,15 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = { - x.count("\t") if i > 0 else x.lstrip().count("\t") for i, x in enumerate(lines) - } + counts = {x.lstrip(" ").count("\t") for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = "\t" + # check the number of leading tabs in the first line + # to account for index columns + index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) + if index_length != 0: + print(index_length) + kwargs.setdefault("index_col", [0, index_length - 1]) # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get("delim_whitespace") is None: diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 13499a9e28ec6..73b583d6764b2 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + MultiIndex, get_option, read_clipboard, ) @@ -256,6 +257,21 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): # excel data is parsed correctly tm.assert_frame_equal(df, df_expected) + def test_infer_excel_with_multiindex(self, request, mock_clipboard): + # GH41108 + text = "\t\tcol1\tcol2\nA\t0\t1\tred\nA\t1\t\tblue\nB\t0\t2\tgreen" + + mock_clipboard[request.node.name] = text + df = read_clipboard() + multiindex = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0)]) + df_expected = DataFrame( + data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, + index=multiindex, + ) + + # excel data is parsed correctly + tm.assert_frame_equal(df, df_expected) + def test_invalid_encoding(self, df): msg = "clipboard only supports utf-8 encoding" # test case for testing invalid encoding From efab5a8aa112c91350103abf3ae400b727e3de70 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sun, 25 Apr 2021 06:52:05 +0200 Subject: [PATCH 6/9] removing print --- pandas/io/clipboards.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index cf75b6304cd7f..a1a9130f2b7af 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -65,7 +65,6 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # to account for index columns index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) if index_length != 0: - print(index_length) kwargs.setdefault("index_col", [0, index_length - 1]) # Edge case where sep is specified to be None, return to default From 1603426d89abfc8da4dc28e565440b37a36deed7 Mon Sep 17 00:00:00 2001 From: saucoide Date: Tue, 11 May 2021 00:11:37 +0200 Subject: [PATCH 7/9] parametrized multiindex test & fixed the args to index_col --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/io/clipboards.py | 2 +- pandas/tests/io/test_clipboard.py | 29 +++++++++++++++++++++++------ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0675dbb827bab..9ebb8f4337507 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -925,7 +925,7 @@ I/O - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) -- Bug in :func:`read_clipboard` when copying from excel and the first column contains null values (:issue:`41108`) +- Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) Period ^^^^^^ diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index a1a9130f2b7af..a6940c08198b0 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -65,7 +65,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # to account for index columns index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) if index_length != 0: - kwargs.setdefault("index_col", [0, index_length - 1]) + kwargs.setdefault("index_col", list(range(index_length))) # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get("delim_whitespace") is None: diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 73b583d6764b2..380536f9efcce 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -5,7 +5,6 @@ from pandas import ( DataFrame, - MultiIndex, get_option, read_clipboard, ) @@ -257,16 +256,34 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): # excel data is parsed correctly tm.assert_frame_equal(df, df_expected) - def test_infer_excel_with_multiindex(self, request, mock_clipboard): + @pytest.mark.parametrize( + "multiindex", + [ + ( + """\t\t\tcol1\tcol2 + A\t0\tTrue\t1\tred + A\t1\tTrue\t\tblue + B\t0\tFalse\t2\tgreen""", + [["A", "A", "B"], [0, 1, 0], [True, True, False]], + ), + ( + """\t\tcol1\tcol2 + A\t0\t1\tred + A\t1\t\tblue + B\t0\t2\tgreen""", + [["A", "A", "B"], [0, 1, 0]], + ), + ], + ) + def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex): # GH41108 - text = "\t\tcol1\tcol2\nA\t0\t1\tred\nA\t1\t\tblue\nB\t0\t2\tgreen" - mock_clipboard[request.node.name] = text + # the `.replace()` is because `.dedent()` does not like the leading `\t` + mock_clipboard[request.node.name] = multiindex[0].replace(" ", "") df = read_clipboard() - multiindex = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0)]) df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, - index=multiindex, + index=multiindex[1], ) # excel data is parsed correctly From 0b1e21ae12b4fa0610460e7c5f45ff4e569e5de1 Mon Sep 17 00:00:00 2001 From: saucoide Date: Sat, 22 May 2021 00:33:55 +0200 Subject: [PATCH 8/9] Remove .replace() and use implicit str concatenation instead --- pandas/tests/io/test_clipboard.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 380536f9efcce..f1f635ad98d3b 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -260,17 +260,21 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): "multiindex", [ ( - """\t\t\tcol1\tcol2 - A\t0\tTrue\t1\tred - A\t1\tTrue\t\tblue - B\t0\tFalse\t2\tgreen""", + ( # Can't use `dedent` here as it will remove the leading `\t` + "\t\t\tcol1\tcol2\n" + "A\t0\tTrue\t1\tred\n" + "A\t1\tTrue\t\tblue\n" + "B\t0\tFalse\t2\tgreen\n" + ), [["A", "A", "B"], [0, 1, 0], [True, True, False]], ), ( - """\t\tcol1\tcol2 - A\t0\t1\tred - A\t1\t\tblue - B\t0\t2\tgreen""", + ( + "\t\tcol1\tcol2\n" + "A\t0\t1\tred\n" + "A\t1\t\tblue\n" + "B\t0\t2\tgreen\n" + ), [["A", "A", "B"], [0, 1, 0]], ), ], @@ -278,8 +282,7 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex): # GH41108 - # the `.replace()` is because `.dedent()` does not like the leading `\t` - mock_clipboard[request.node.name] = multiindex[0].replace(" ", "") + mock_clipboard[request.node.name] = multiindex[0] df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, From 030939e8d2016f07ab68ceb65cb8eb054d643d8f Mon Sep 17 00:00:00 2001 From: saucoide Date: Sat, 22 May 2021 00:45:46 +0200 Subject: [PATCH 9/9] make the concatenation explicit --- pandas/tests/io/test_clipboard.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index f1f635ad98d3b..40b2eb1f4114b 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -259,21 +259,20 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): @pytest.mark.parametrize( "multiindex", [ - ( - ( # Can't use `dedent` here as it will remove the leading `\t` - "\t\t\tcol1\tcol2\n" - "A\t0\tTrue\t1\tred\n" - "A\t1\tTrue\t\tblue\n" - "B\t0\tFalse\t2\tgreen\n" + ( # Can't use `dedent` here as it will remove the leading `\t` + "\n".join( + [ + "\t\t\tcol1\tcol2", + "A\t0\tTrue\t1\tred", + "A\t1\tTrue\t\tblue", + "B\t0\tFalse\t2\tgreen", + ] ), [["A", "A", "B"], [0, 1, 0], [True, True, False]], ), ( - ( - "\t\tcol1\tcol2\n" - "A\t0\t1\tred\n" - "A\t1\t\tblue\n" - "B\t0\t2\tgreen\n" + "\n".join( + ["\t\tcol1\tcol2", "A\t0\t1\tred", "A\t1\t\tblue", "B\t0\t2\tgreen"] ), [["A", "A", "B"], [0, 1, 0]], ),