diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index bb9f48d17b2e1..b8fdd884448eb 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -681,6 +681,7 @@ MultiIndex
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
 - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
 - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
+- :func:`read_csv` now handles empty values in :class:`MultiIndex` columns and indexes consistently, replacing them with empty strings instead of "Unnamed: ..." when uniqueness can be ensured (:issue:`59560`)
 -
 
 I/O
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index e263c69376d05..a92e98fe37b09 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -239,6 +239,19 @@ def extract(r):
             return tuple(r[i] for i in range(field_count) if i not in sic)
 
         columns = list(zip(*(extract(r) for r in header)))
+        # Replace None, empty strings, or column names starting with 'Unnamed: '
+        # (used as placeholders in multi-index headers) with empty strings.
+        columns = [
+            tuple(
+                ""
+                if level is None
+                or str(level).strip() == ""
+                or (isinstance(level, str) and level.startswith("Unnamed: "))
+                else level
+                for level in col
+            )
+            for col in columns
+        ]
         names = columns.copy()
         for single_ic in sorted(ic):
             names.insert(single_ic, single_ic)
@@ -357,7 +370,7 @@ def _agg_index(self, index) -> Index:
                     )
                 else:
                     col_na_values, col_na_fvalues = set(), set()
-
+            col_na_values = set(col_na_values) - {""}
             cast_type = None
             index_converter = False
             if self.index_names is not None:
@@ -694,8 +707,11 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
 
         # Only clean index names that were placeholders.
         for i, name in enumerate(index_names):
-            if isinstance(name, str) and name in self.unnamed_cols:
-                index_names[i] = None
+            if isinstance(name, str):
+                if name.strip() == "":
+                    index_names[i] = ""
+                elif name in self.unnamed_cols:
+                    index_names[i] = None
 
         return index_names, columns, index_col
 
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index 9977e2b8e1a1d..919dbb56b7dbb 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -375,3 +375,24 @@ def test_multiindex_columns_not_leading_index_col(all_parsers):
     )
     expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_multiindex_empty_values_handling(all_parsers):
+    # GH#59560
+    parser = all_parsers
+    if parser.engine == "pyarrow":
+        pytest.skip(
+            "PyArrow engine does not support multiple header rows for MultiIndex cols."
+        )
+
+    data = ", ,a,b,b\n, ,, ,b2\ni1,,0,1,2\ni2,,3,4,5\n"
+    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
+    expected_columns = MultiIndex.from_tuples(
+        [("a", ""), ("b", ""), ("b", "b2")], names=[None, None]
+    )
+    expected = DataFrame(
+        [[0, 1, 2], [3, 4, 5]],
+        index=MultiIndex.from_tuples([("i1", ""), ("i2", "")]),
+        columns=expected_columns,
+    )
+    tm.assert_frame_equal(result, expected)