@@ -443,58 +443,73 @@ def _expand_colspan_rowspan(self, rows):
443
443
"""
444
444
445
445
all_texts = [] # list of rows, each a list of str
446
- remainder = [] # list of (index, text, nrows)
446
+ # saved_rowspans: list of None or (text, n_rows_remaining) as long as
447
+ # the longest row. The index determines the column number.
448
+ saved_rowspans = []
449
+
450
def pop_saved_rowspan(index):
    """Consume one row's worth of the rowspan saved at column ``index``.

    Reads the enclosing ``saved_rowspans`` list. If a ``(text,
    n_rows_remaining)`` entry is stored at ``index``, its text is
    returned and the entry is either decremented (more than one row
    remaining) or cleared to ``None`` (this was the last row). If
    nothing is stored at ``index``, an empty string is returned and
    the list is left untouched.
    """
    pending = saved_rowspans[index]
    if not pending:
        # No cell from an earlier row spans into this column.
        return ''

    text, rows_left = pending
    # Keep the entry alive only while further rows still need it.
    saved_rowspans[index] = (text, rows_left - 1) if rows_left > 1 else None
    return text
447
460
448
461
for tr in rows :
449
- texts = [] # the output for this row
450
- next_remainder = []
451
-
452
- index = 0
453
462
tds = self ._parse_td (tr )
454
- for td in tds :
455
- # Append texts from previous rows with rowspan>1 that come
456
- # before this <td>
457
- while remainder and remainder [0 ][0 ] <= index :
458
- prev_i , prev_text , prev_rowspan = remainder .pop (0 )
459
- texts .append (prev_text )
460
- if prev_rowspan > 1 :
461
- next_remainder .append ((prev_i , prev_text ,
462
- prev_rowspan - 1 ))
463
- index += 1
464
-
465
- # Append the text from this <td>, colspan times
466
- text = _remove_whitespace (self ._text_getter (td ))
467
- rowspan = int (self ._attr_getter (td , 'rowspan' ) or 1 )
468
- colspan = int (self ._attr_getter (td , 'colspan' ) or 1 )
463
+ td_texts = [_remove_whitespace (self ._text_getter (td ))
464
+ for td in tds ]
465
+ colspans = [int (self ._attr_getter (td , 'colspan' ) or '1' )
466
+ for td in tds ]
467
+ rowspans = [int (self ._attr_getter (td , 'rowspan' ) or '1' )
468
+ for td in tds ]
469
+
470
+ # Make sure saved_rowspans is at least as wide as this row, so we
471
+ # can index into it safely
472
+ row_length = sum (colspans ) + len ([x for x in saved_rowspans if x ])
473
+ n_new_columns = row_length - len (saved_rowspans )
474
+ if n_new_columns > 0 :
475
+ saved_rowspans .extend ([None ] * n_new_columns )
476
+
477
+ # Iterate over this row's text+colspan+rowspan <td>s...
478
+ texts = [] # the output for this row
479
+ index = 0 # len(texts) -- that is, the current column index
469
480
481
+ for text , colspan , rowspan in zip (td_texts , colspans , rowspans ):
482
+ # Handle colspan: just treat it as though we had `colspan`
483
+ # cells, each with identical `text` and `rowspan`.
470
484
for _ in range (colspan ):
485
+ # If saved_rowspans has text at this index, that's text
486
+ # from the previous row and it belongs _before_ the text
487
+ # in the <td> we're inspecting right now.
488
+ while saved_rowspans [index ]:
489
+ texts .append (pop_saved_rowspan (index ))
490
+ index += 1
491
+ # Now, saved_rowspans[index] is None
492
+
471
493
texts .append (text )
472
494
if rowspan > 1 :
473
- next_remainder . append (( index , text , rowspan - 1 ) )
495
+ saved_rowspans [ index ] = ( text , rowspan - 1 )
474
496
index += 1
475
497
476
- # Append texts from previous rows at the final position
477
- for prev_i , prev_text , prev_rowspan in remainder :
478
- texts . append ( prev_text )
479
- if prev_rowspan > 1 :
480
- next_remainder .append (( prev_i , prev_text ,
481
- prev_rowspan - 1 ))
498
+ # Copy all final values for this row from saved_rowspans. They
499
+ # may be all-None, which would make us add harmless empty
500
+ # strings.
501
+ while index < len ( saved_rowspans ) :
502
+ texts .append (pop_saved_rowspan ( index ))
503
+ index += 1
482
504
505
+ # Done with this row
483
506
all_texts .append (texts )
484
- remainder = next_remainder
485
507
486
508
# Append rows that only appear because the previous row had non-1
487
509
# rowspan
488
- while remainder :
489
- next_remainder = []
490
- texts = []
491
- for prev_i , prev_text , prev_rowspan in remainder :
492
- texts .append (prev_text )
493
- if prev_rowspan > 1 :
494
- next_remainder .append ((prev_i , prev_text ,
495
- prev_rowspan - 1 ))
496
- all_texts .append (texts )
497
- remainder = next_remainder
510
+ while any (x for x in saved_rowspans ):
511
+ all_texts .append ([pop_saved_rowspan (i )
512
+ for i in range (len (saved_rowspans ))])
498
513
499
514
# ignore all-empty-text rows
500
515
no_empty = [row for row in all_texts
0 commit comments