-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: IndexError when header rows have unequal column counts #43102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
Here's the entire stacktrace: <ipython-input-5-1bf8982a1449> in <module>
----> 1 pandas.read_csv(StringIO(data), sep=",", header=[0, 1], on_bad_lines="skip")
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
480
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
483
484 if chunksize or iterator:
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
809 self.options["has_index_names"] = kwds["has_index_names"]
810
--> 811 self._engine = self._make_engine(self.engine)
812
813 def close(self):
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
1041
1042 def _failover_to_python(self):
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
90 self.col_names,
91 passed_names,
---> 92 ) = self._extract_multi_indexer_columns(
93 self._reader.header,
94 self.index_names, # type: ignore[has-type]
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py in _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names)
345 return tuple(r[i] for i in range(field_count) if i not in sic)
346
--> 347 columns = list(zip(*(extract(r) for r in header)))
348 names = ic + columns
349
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py in <genexpr>(.0)
345 return tuple(r[i] for i in range(field_count) if i not in sic)
346
--> 347 columns = list(zip(*(extract(r) for r in header)))
348 names = ic + columns
349
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py in extract(r)
343
344 def extract(r):
--> 345 return tuple(r[i] for i in range(field_count) if i not in sic)
346
347 columns = list(zip(*(extract(r) for r in header)))
~/.local/share/virtualenvs/py3.9playground--W4ZB8cD/lib/python3.9/site-packages/pandas/io/parsers/base_parser.py in <genexpr>(.0)
343
344 def extract(r):
--> 345 return tuple(r[i] for i in range(field_count) if i not in sic)
346
347 columns = list(zip(*(extract(r) for r in header)))
IndexError: list index out of range
|
So, just to confirm: an exception needs to be thrown if the length of the row ( def extract(r):
try:
return tuple(r[i] for i in range(field_count) if i not in sic)
except IndexError:
raise Exception("Error") from None This changes the stack trace to: File "<ipython-input-4-1bf8982a1449>", line 1, in <module>
pandas.read_csv(StringIO(data), sep=",", header=[0, 1], on_bad_lines="skip")
File "pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "pandas\io\parsers\readers.py", line 482, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "pandas\io\parsers\readers.py", line 811, in __init__
self._engine = self._make_engine(self.engine)
File "pandas\io\parsers\readers.py", line 1040, in _make_engine
return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
File "pandas\io\parsers\c_parser_wrapper.py", line 92, in __init__
) = self._extract_multi_indexer_columns(
File "pandas\io\parsers\base_parser.py", line 350, in _extract_multi_indexer_columns
columns = list(zip(*(extract(r) for r in header)))
File "pandas\io\parsers\base_parser.py", line 350, in <genexpr>
columns = list(zip(*(extract(r) for r in header)))
File "pandas\io\parsers\base_parser.py", line 348, in extract
raise Exception("Error") from None
Exception: Error If someone can guide me as to which exception I should raise and if there's anything else that needs to be added, I would love to work on this issue. Also, I did confirm; the bug also exists on the master branch |
It'd be good if the exception could provide information regarding the header row on which the error occurred |
@quantumalaviya
|
@phofl Something like this? #check if header lengths are equal
for l in range(len(header)):
if len(header[l])!=field_count:
raise ParserError(f"Header rows must have equal number of columns") |
@MrCreosote How would you suggest dealing with it in cases where there is more than one discrepancy? For example, if 3 rows are chosen to be headers and all have different values? |
Personally I'd probably just throw an error for the first discrepancy found, top to bottom, and ignore the others, but I'm a brand new Pandas user and not a dev, so the weight of my opinion is pretty low. |
Although I would definitely want the row index of the first offending row in the exception |
Have a look at the PR and let me know if it's satisfactory. The traceback now looks like this: File "<ipython-input-4-1bf8982a1449>", line 1, in <module>
pandas.read_csv(StringIO(data), sep=",", header=[0, 1], on_bad_lines="skip")
File "pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "pandas\io\parsers\readers.py", line 482, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "pandas\io\parsers\readers.py", line 811, in __init__
self._engine = self._make_engine(self.engine)
File "pandas\io\parsers\readers.py", line 1040, in _make_engine
return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
File "pandas\io\parsers\c_parser_wrapper.py", line 92, in __init__
) = self._extract_multi_indexer_columns(
File "pandas\io\parsers\base_parser.py", line 347, in _extract_multi_indexer_columns
raise ParserError("Header rows must have equal number of columns."
ParserError: Header rows must have equal number of columns. Mismatch found at row 1 |
Shouldn't the minimum mismatch number be 2, assuming I'm understanding this correctly? You need at least 2 rows to have a mismatch |
I made it 0-indexed. |
Hmm, that's not what I would expect for rows in a file, but shrug |
I couldn't find it with a search on
indexerror read_csv
anyway.
I have confirmed this bug exists on the latest version of pandas.
(optional) I have confirmed this bug exists on the master branch of pandas.
Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.
Code Sample, a copy-pastable example
Problem description
A generic IndexError is thrown rather than a pandas-specific error, or skipping the row as per `on_bad_lines`.
Expected Output
I would expect one of two things:
pandas.errors.ParserError
describing the problem.
Maybe there are other reasonable responses as well.
Output of
pd.show_versions()
INSTALLED VERSIONS
commit : 5f648bf
python : 3.9.6.final.0
python-bits : 64
OS : Linux
OS-release : 4.15.0-154-generic
Version : #161-Ubuntu SMP Fri Jul 30 13:04:17 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.3.2
numpy : 1.21.2
pytz : 2021.1
dateutil : 2.8.2
pip : 20.3.3
setuptools : 51.3.3
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 7.26.0
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : None
numexpr : None
odfpy : None
openpyxl : 3.0.7
pandas_gbq : None
pyarrow : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : 2.0.1
xlwt : None
numba : None
The text was updated successfully, but these errors were encountered: