BUG: Check for null values when inferring excel in read_clipboard (pandas-dev#41109)

saucoide · TLouf · commit 5d8189433ebf · 2021-06-01T18:01:59.000+02:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -935,7 +935,7 @@ I/O
 - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
 - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
 - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`)
--
+- Bug in :func:`read_clipboard` where copying from an Excel file shifted values into the wrong column if there were null values in the first column (:issue:`41108`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
@@ -58,9 +58,14 @@ def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
     # 0  1  2
     # 1  3  4
 
-    counts = {x.lstrip().count("\t") for x in lines}
+    counts = {x.lstrip(" ").count("\t") for x in lines}
     if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
         sep = "\t"
+        # check the number of leading tabs in the first line
+        # to account for index columns
+        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
+        if index_length != 0:
+            kwargs.setdefault("index_col", list(range(index_length)))
 
     # Edge case where sep is specified to be None, return to default
     if sep is None and kwargs.get("delim_whitespace") is None:
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
@@ -243,6 +243,54 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard):
 
         tm.assert_frame_equal(res, exp)
 
+    def test_infer_excel_with_nulls(self, request, mock_clipboard):
+        # GH41108: the third line starts with a tab, i.e. its col1 cell is empty
+        text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen"
+
+        mock_clipboard[request.node.name] = text
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}
+        )
+
+        # empty cell stays in col1 as a missing value; "blue" is not shifted left
+        tm.assert_frame_equal(df, df_expected)
+
+    @pytest.mark.parametrize(
+        "multiindex",
+        [
+            (  # Can't use `dedent` here as it will remove the leading `\t`
+                "\n".join(
+                    [
+                        "\t\t\tcol1\tcol2",
+                        "A\t0\tTrue\t1\tred",
+                        "A\t1\tTrue\t\tblue",
+                        "B\t0\tFalse\t2\tgreen",
+                    ]
+                ),
+                [["A", "A", "B"], [0, 1, 0], [True, True, False]],
+            ),
+            (
+                "\n".join(
+                    ["\t\tcol1\tcol2", "A\t0\t1\tred", "A\t1\t\tblue", "B\t0\t2\tgreen"]
+                ),
+                [["A", "A", "B"], [0, 1, 0]],
+            ),
+        ],
+    )
+    def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex):
+        # GH41108: multiindex is a (clipboard text, expected index arrays) pair
+
+        mock_clipboard[request.node.name] = multiindex[0]
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]},
+            index=multiindex[1],
+        )
+
+        # leading tabs in the header row are read as index columns, not data
+        tm.assert_frame_equal(df, df_expected)
+
     def test_invalid_encoding(self, df):
         msg = "clipboard only supports utf-8 encoding"
         # test case for testing invalid encoding