Skip to content

fix for 61123 read_excel nrows param reads extra rows #61129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,7 @@ def f(skiprows: Sequence, x: int) -> bool:
# the number of rows read from file
return None

# Parse the requested sheet(s) of the workbook into DataFrame objects
def parse(
self,
sheet_name: str | int | list[int] | list[str] | None = 0,
Expand Down Expand Up @@ -748,13 +749,15 @@ def parse(
if verbose:
print(f"Reading sheet {asheetname}")

# Get the sheet object based on name or index
if isinstance(asheetname, str):
sheet = self.get_sheet_by_name(asheetname)
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, file_rows_needed)

if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
Expand All @@ -764,6 +767,11 @@ def parse(
output[asheetname] = DataFrame()
continue

# Ensure we don't process more rows than requested with nrows
# This is a safeguard in case get_sheet_data returns more rows than requested
if nrows is not None and len(data) > nrows:
data = data[:nrows + (0 if header is None else header + 1)]

output = self._parse_sheet(
data=data,
output=output,
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,10 @@ def get_sheet_data(
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]
if file_rows_needed is None:
# Only trim trailing empty rows when file_rows_needed is None
# to ensure we return exactly file_rows_needed rows when specified
data = data[: last_row_with_data + 1]

if len(data) > 0:
# extend rows to max width
Expand Down
5 changes: 5 additions & 0 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,9 @@ def get_sheet_data(
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
]

# Ensure we return exactly file_rows_needed rows if specified
if file_rows_needed is not None and len(data) > file_rows_needed:
data = data[:file_rows_needed]

return data
1 change: 1 addition & 0 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = time(
cell_contents.hour,
cell_contents.minute,
# xlrd implementation already correctly limits rows to file_rows_needed
cell_contents.second,
cell_contents.microsecond,
)
Expand Down
74 changes: 74 additions & 0 deletions pandas/tests/io/excel/run_nrows_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Standalone script to test nrows parameter with adjacent tables in Excel files.
This script can be run directly with Python without using pytest.

Usage:
python pandas/tests/io/excel/run_nrows_test.py
"""
import os
import tempfile
import pandas as pd


def run_test():
    """
    Check that ``nrows`` stops at the first of two stacked tables.

    Two workbooks are written to a temporary directory, each holding an
    upper and a lower table on ``Sheet1``:

    * ``with_blank.xlsx`` - one blank row separates the tables
    * ``no_blank.xlsx``   - the lower header directly follows the upper data

    Both workbooks are read back with ``nrows=3`` and each result must equal
    the upper table alone.  An ``AssertionError`` is raised on any mismatch.
    """
    upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

    # (file name, sheet row where the lower table's header is written).
    # The upper table fills rows 0-3 (header + 3 data rows), so starting the
    # lower table at row 5 leaves row 4 blank, while row 4 leaves no gap.
    layouts = (("with_blank.xlsx", 5), ("no_blank.xlsx", 4))

    with tempfile.TemporaryDirectory() as tmp_dir:
        print("Creating Excel files...")
        workbook_paths = []
        for file_name, lower_start in layouts:
            workbook = os.path.join(tmp_dir, file_name)
            with pd.ExcelWriter(workbook) as writer:
                upper.to_excel(writer, sheet_name="Sheet1", index=False)
                lower.to_excel(
                    writer,
                    sheet_name="Sheet1",
                    startrow=lower_start,
                    index=False,
                )
            workbook_paths.append(workbook)

        print("Reading Excel files with nrows=3...")
        # Only the upper table should come back from either workbook.
        with_gap, without_gap = (
            pd.read_excel(workbook, nrows=3) for workbook in workbook_paths
        )

        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        print("Verifying results...")
        pd.testing.assert_frame_equal(with_gap, expected)
        pd.testing.assert_frame_equal(without_gap, expected)

        # Shape checks give a short, readable message if extra rows leak in.
        assert with_gap.shape == (3, 2), f"Expected (3, 2) but got {with_gap.shape}"
        assert without_gap.shape == (
            3,
            2,
        ), f"Expected (3, 2) but got {without_gap.shape}"

        # The final row must be upper-table data, not the lower table's header.
        last_a = without_gap.iloc[-1, 0]
        last_b = without_gap.iloc[-1, 1]
        assert last_a == 3, f"Expected 3 but got {last_a}"
        assert last_b == 6, f"Expected 6 but got {last_b}"

        print("All tests passed!")


# Run the check only when this file is executed directly as a script;
# importing the module must not trigger any Excel I/O.
if __name__ == "__main__":
    run_test()
65 changes: 65 additions & 0 deletions pandas/tests/io/excel/test_excel_adjacent_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Tests for reading Excel files with adjacent tables.
"""
import pytest
import pandas as pd
import pandas._testing as tm


# Skip every test in this module when openpyxl is not installed.
# ``pytest.importorskip`` either returns the imported module or raises a
# module-level skip itself, so wrapping it in a ``skipif`` marker (whose
# ``... is None`` condition could never be true) is unnecessary.
pytest.importorskip("openpyxl", reason="openpyxl not installed")


class TestExcelAdjacentTables:
    """Reading a sheet that holds two vertically stacked tables."""

    @pytest.mark.parametrize("engine", ["openpyxl"])
    def test_nrows_with_adjacent_tables(self, engine, tmp_path):
        """
        ``nrows`` must return only the upper table of two stacked tables.

        GH-61123: with ``nrows`` set, ``read_excel`` has to stop after the
        requested number of data rows whether or not a blank row separates
        the upper table from the one below it.
        """
        upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        def write_stacked(file_name, lower_start):
            # Write both tables to Sheet1, with the lower table's header
            # placed on sheet row ``lower_start``; return the workbook path.
            target = tmp_path / file_name
            with pd.ExcelWriter(target, engine=engine) as writer:
                upper.to_excel(writer, sheet_name="Sheet1", index=False)
                lower.to_excel(
                    writer,
                    sheet_name="Sheet1",
                    startrow=lower_start,
                    index=False,
                )
            return target

        # The upper table fills rows 0-3 (header + 3 data rows): starting the
        # lower table at row 5 leaves a blank row, starting at row 4 does not.
        gapped_path = write_stacked("test1.xlsx", 5)
        adjacent_path = write_stacked("test2.xlsx", 4)

        # One header row followed by exactly three data rows.
        read_kwargs = {"header": 0, "nrows": 3, "engine": engine}
        gapped = pd.read_excel(gapped_path, **read_kwargs)
        adjacent = pd.read_excel(adjacent_path, **read_kwargs)

        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        tm.assert_frame_equal(gapped, expected)
        tm.assert_frame_equal(adjacent, expected)

        # Shape checks give a short message if an extra row was read.
        assert gapped.shape == (3, 2), f"Expected (3, 2) but got {gapped.shape}"
        assert adjacent.shape == (3, 2), f"Expected (3, 2) but got {adjacent.shape}"

        # The last row must be upper-table data, not the lower table's header.
        last_a = adjacent.iloc[-1, 0]
        last_b = adjacent.iloc[-1, 1]
        assert last_a == 3, f"Expected 3 but got {last_a}"
        assert last_b == 6, f"Expected 6 but got {last_b}"
Loading
Loading