diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 67c7ce150132a..0f2c9c4756987 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -64,6 +64,7 @@ Bug Fixes **I/O** +- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) - - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8f98732c92f..65df2bffb4abf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3209,12 +3209,22 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): col = columns[k] if is_integer(k) else k dtype[col] = v - if index_col is None or index_col is False: + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic empty Index. 
+ if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: data = [Series([], dtype=dtype[name]) for name in index_names] index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() + for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 2b7ff1f5a9879..b39122e5e7906 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -238,6 +238,21 @@ def test_csv_mixed_type(self): out = self.read_csv(StringIO(data)) tm.assert_frame_equal(out, expected) + def test_read_csv_low_memory_no_rows_with_index(self): + if self.engine == "c" and not self.low_memory: + pytest.skip("This is a low-memory specific test") + + # see gh-21141 + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + out = self.read_csv(StringIO(data), low_memory=True, + index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(out, expected) + def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0,