Skip to content

Commit 67dea69

Browse files
committed
Alternative colspan/rowspan algorithm
1 parent 17e04b4 commit 67dea69

File tree

1 file changed

+53
-38
lines changed

1 file changed

+53
-38
lines changed

pandas/io/html.py

+53 -38
Original file line number | Diff line number | Diff line change
@@ -443,58 +443,73 @@ def _expand_colspan_rowspan(self, rows):
443443
"""
444444

445445
all_texts = [] # list of rows, each a list of str
446-
remainder = [] # list of (index, text, nrows)
446+
# saved_rowspans: list of None or (text, n_rows_remaining) as long as
447+
# the longest row. The index determines the column number.
448+
saved_rowspans = []
449+
450+
def pop_saved_rowspan(index):
451+
if saved_rowspans[index]:
452+
text, rowspan = saved_rowspans[index]
453+
if rowspan > 1:
454+
saved_rowspans[index] = (text, rowspan - 1)
455+
else:
456+
saved_rowspans[index] = None
457+
return text
458+
else:
459+
return ''
447460

448461
for tr in rows:
449-
texts = [] # the output for this row
450-
next_remainder = []
451-
452-
index = 0
453462
tds = self._parse_td(tr)
454-
for td in tds:
455-
# Append texts from previous rows with rowspan>1 that come
456-
# before this <td>
457-
while remainder and remainder[0][0] <= index:
458-
prev_i, prev_text, prev_rowspan = remainder.pop(0)
459-
texts.append(prev_text)
460-
if prev_rowspan > 1:
461-
next_remainder.append((prev_i, prev_text,
462-
prev_rowspan - 1))
463-
index += 1
464-
465-
# Append the text from this <td>, colspan times
466-
text = _remove_whitespace(self._text_getter(td))
467-
rowspan = int(self._attr_getter(td, 'rowspan') or 1)
468-
colspan = int(self._attr_getter(td, 'colspan') or 1)
463+
td_texts = [_remove_whitespace(self._text_getter(td))
464+
for td in tds]
465+
colspans = [int(self._attr_getter(td, 'colspan') or '1')
466+
for td in tds]
467+
rowspans = [int(self._attr_getter(td, 'rowspan') or '1')
468+
for td in tds]
469+
470+
# Make sure saved_rowspans is at least as wide as this row, so we
471+
# can index into it safely
472+
row_length = sum(colspans) + len([x for x in saved_rowspans if x])
473+
n_new_columns = row_length - len(saved_rowspans)
474+
if n_new_columns > 0:
475+
saved_rowspans.extend([None] * n_new_columns)
476+
477+
# Iterate over this row's text+colspan+rowspan <td>s...
478+
texts = [] # the output for this row
479+
index = 0 # len(texts) -- that is, the current column index
469480

481+
for text, colspan, rowspan in zip(td_texts, colspans, rowspans):
482+
# Handle colspan: just treat it as though we had `colspan`
483+
# cells, each with identical `text` and `rowspan`.
470484
for _ in range(colspan):
485+
# If saved_rowspans has text at this index, that's text
486+
# from the previous row and it belongs _before_ the text
487+
# in the <td> we're inspecting right now.
488+
while saved_rowspans[index]:
489+
texts.append(pop_saved_rowspan(index))
490+
index += 1
491+
# Now, saved_rowspans[index] is None
492+
471493
texts.append(text)
472494
if rowspan > 1:
473-
next_remainder.append((index, text, rowspan - 1))
495+
saved_rowspans[index] = (text, rowspan - 1)
474496
index += 1
475497

476-
# Append texts from previous rows at the final position
477-
for prev_i, prev_text, prev_rowspan in remainder:
478-
texts.append(prev_text)
479-
if prev_rowspan > 1:
480-
next_remainder.append((prev_i, prev_text,
481-
prev_rowspan - 1))
498+
# Copy all final values for this row from saved_rowspans. They
499+
# may be all-None, which would make us add harmless empty
500+
# strings.
501+
while index < len(saved_rowspans):
502+
texts.append(pop_saved_rowspan(index))
503+
index += 1
482504

505+
# Done with this row
483506
all_texts.append(texts)
484-
remainder = next_remainder
485507

486508
# Append rows that only appear because the previous row had non-1
487509
# rowspan
488-
while remainder:
489-
next_remainder = []
490-
texts = []
491-
for prev_i, prev_text, prev_rowspan in remainder:
492-
texts.append(prev_text)
493-
if prev_rowspan > 1:
494-
next_remainder.append((prev_i, prev_text,
495-
prev_rowspan - 1))
496-
all_texts.append(texts)
497-
remainder = next_remainder
510+
while any(x for x in saved_rowspans):
511+
all_texts.append([pop_saved_rowspan(i)
512+
for i in range(len(saved_rowspans))])
498513

499514
# ignore all-empty-text rows
500515
no_empty = [row for row in all_texts

0 commit comments

Comments
 (0)