Skip to content

Commit b0bc80a

Browse files
committed
Initial commit to fix csv loading bug pandas-dev#56929
1 parent f459437 commit b0bc80a

File tree

3 files changed, with 34 additions and 11 deletions.

pandas/_libs/parsers.pyi

File mode changed from 100644 to 100755.

pandas/_libs/parsers.pyx

File mode changed from 100644 to 100755.
+17 -10
Original file line number | Diff line number | Diff line change
@@ -656,52 +656,58 @@ cdef class TextReader:
656656

657657
# Header is in the file
658658
for level, hr in enumerate(prelim_header):
659+
print(f"Processing level {level}, header row {hr}")
659660

660661
this_header = []
661662

662663
if self.parser.lines < hr + 1:
664+
print("Tokenizing rows...")
663665
self._tokenize_rows(hr + 2)
664666

665667
if self.parser.lines == 0:
668+
print("No lines in parser.")
666669
field_count = 0
667670
start = self.parser.line_start[0]
668671

669672
# e.g., if header=3 and file only has 2 lines
670-
elif (self.parser.lines < hr + 1
671-
and not isinstance(self.orig_header, list)) or (
672-
self.parser.lines < hr):
673+
elif (self.parser.lines < hr + 1 and not isinstance(self.orig_header, list)) or (self.parser.lines < hr):
673674
msg = self.orig_header
674675
if isinstance(msg, list):
675676
joined = ",".join(str(m) for m in msg)
676677
msg = f"[{joined}], len of {len(msg)},"
677-
raise ParserError(
678-
f"Passed header={msg} but only "
679-
f"{self.parser.lines} lines in file")
678+
raise ParserError(f"Passed header={msg} but only {self.parser.lines} lines in file")
680679

681680
else:
682681
field_count = self.parser.line_fields[hr]
683682
start = self.parser.line_start[hr]
683+
print(f"Field count: {field_count}, Start: {start}")
684684

685685
unnamed_count = 0
686686
unnamed_col_indices = []
687687

688688
for i in range(field_count):
689689
word = self.parser.words[start + i]
690690

691-
name = PyUnicode_DecodeUTF8(word, strlen(word),
692-
self.encoding_errors)
691+
name = PyUnicode_DecodeUTF8(word, strlen(word), self.encoding_errors)
692+
print(f"Word {i}: '{word}', Decoded name: '{name}'")
693693

694694
if name == "":
695+
print("Empty name found.")
695696
if self.has_mi_columns:
696697
name = f"Unnamed: {i}_level_{level}"
698+
print(f"Setting multi-index column name: {name}")
697699
else:
698700
name = f"Unnamed: {i}"
701+
print(f"Setting unnamed column name: {name}")
699702

700703
unnamed_count += 1
701704
unnamed_col_indices.append(i)
702705

703706
this_header.append(name)
704707

708+
print(f"This header: {this_header}")
709+
710+
705711
if not self.has_mi_columns:
706712
# Ensure that regular columns are used before unnamed ones
707713
# to keep given names and mangle unnamed columns
@@ -744,9 +750,10 @@ cdef class TextReader:
744750
lc = len(this_header)
745751
ic = (len(self.index_col) if self.index_col
746752
is not None else 0)
753+
print(f"LC {lc}, IC {ic} Unnamed Count {unnamed_count}")
747754

748755
# if wrong number of blanks or no index, not our format
749-
if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
756+
if (lc != unnamed_count and lc - ic >= unnamed_count) or ic == 0:
750757
hr -= 1
751758
self.parser_start -= 1
752759
this_header = [None] * lc
@@ -2156,4 +2163,4 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int:
21562163
else:
21572164
memo[val] = val
21582165

2159-
return na_count
2166+
return na_count

pandas/tests/io/formats/test_to_csv.py

File mode changed from 100644 to 100755.
+17 -1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
compat,
1515
)
1616
import pandas._testing as tm
17-
17+
from pandas.testing import assert_frame_equal
1818

1919
class TestToCSV:
2020
def test_to_csv_with_single_column(self):
@@ -368,6 +368,22 @@ def test_to_csv_multi_index(self):
368368
exp = tm.convert_rows_list_to_csv_str(exp_rows)
369369
assert df.to_csv(index=False) == exp
370370

371+
def test_to_csv_multi_index_nan(self):
372+
# Create a MultiIndex DataFrame
373+
columns = pd.MultiIndex.from_tuples([('Level 1', 'Level 2')], names=['level1', 'level2'])
374+
data = [[np.nan], [0.1], [0.4]]
375+
df_complex = pd.DataFrame(data, columns=columns)
376+
377+
# Expected DataFrame
378+
expected_df = pd.DataFrame(data, columns=columns, index=range(3))
379+
380+
# Save and load the DataFrame as a CSV
381+
with tm.ensure_clean("complex_data.csv") as path:
382+
df_complex.to_csv(path)
383+
loaded_df_complex = pd.read_csv(path, header=[0, 1], index_col=0)
384+
385+
assert_frame_equal(loaded_df_complex, expected_df)
386+
371387
@pytest.mark.parametrize(
372388
"ind,expected",
373389
[

0 commit comments

Comments
 (0)