diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index d357e4a633347..9ebb8f4337507 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -925,7 +925,7 @@ I/O
 - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
 - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
 - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`)
--
+- Bug in :func:`read_clipboard` where copying from an Excel file shifted values into the wrong column if there were null values in the first column (:issue:`41108`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
index 00a99eb8a4480..a6940c08198b0 100644
--- a/pandas/io/clipboards.py
+++ b/pandas/io/clipboards.py
@@ -58,9 +58,18 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover
     # 0 1 2
     # 1 3 4
 
-    counts = {x.lstrip().count("\t") for x in lines}
+    counts = {x.lstrip(" ").count("\t") for x in lines}
     if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
         sep = "\t"
+        # Each leading tab in the first line is an empty header cell, i.e. an
+        # index column. Count only the tabs: leading spaces are padding (they
+        # are stripped before computing ``counts`` above) and must not be
+        # mistaken for additional index levels.
+        header = lines[0]
+        leading = header[: len(header) - len(header.lstrip(" \t"))]
+        index_length = leading.count("\t")
+        if index_length != 0:
+            kwargs.setdefault("index_col", list(range(index_length)))
 
     # Edge case where sep is specified to be None, return to default
     if sep is None and kwargs.get("delim_whitespace") is None:
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index 45d9ad430aa43..40b2eb1f4114b 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -243,6 +243,54 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard):
 
         tm.assert_frame_equal(res, exp)
 
+    def test_infer_excel_with_nulls(self, request, mock_clipboard):
+        # GH41108
+        text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen"
+
+        mock_clipboard[request.node.name] = text
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}
+        )
+
+        # excel data is parsed correctly
+        tm.assert_frame_equal(df, df_expected)
+
+    @pytest.mark.parametrize(
+        "multiindex",
+        [
+            ( # Can't use `dedent` here as it will remove the leading `\t`
+                "\n".join(
+                    [
+                        "\t\t\tcol1\tcol2",
+                        "A\t0\tTrue\t1\tred",
+                        "A\t1\tTrue\t\tblue",
+                        "B\t0\tFalse\t2\tgreen",
+                    ]
+                ),
+                [["A", "A", "B"], [0, 1, 0], [True, True, False]],
+            ),
+            (
+                "\n".join(
+                    ["\t\tcol1\tcol2", "A\t0\t1\tred", "A\t1\t\tblue", "B\t0\t2\tgreen"]
+                ),
+                [["A", "A", "B"], [0, 1, 0]],
+            ),
+        ],
+    )
+    def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex):
+        # GH41108
+
+        mock_clipboard[request.node.name] = multiindex[0]
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]},
+            index=multiindex[1],
+        )
+
+        # excel data is parsed correctly
+        tm.assert_frame_equal(df, df_expected)
+
     def test_invalid_encoding(self, df):
         msg = "clipboard only supports utf-8 encoding"
         # test case for testing invalid encoding