Skip to content

Commit 7c23b24

Browse files
abmyii authored and yehoshuadimarsky committed
BUG: Convert <br> to space in pd.read_html (pandas-dev#45972)
1 parent c73bf75 commit 7c23b24

File tree

3 files changed

+41
-11
lines changed

3 files changed

+41
-11
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ I/O
563563
- Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`)
564564
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
565565
- Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
566+
- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)
566567
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
567568
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
568569
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)

pandas/io/html.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,13 @@ def _build_doc(self):
622622
else:
623623
udoc = bdoc
624624
from_encoding = self.encoding
625-
return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
625+
626+
soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
627+
628+
for br in soup.find_all("br"):
629+
br.replace_with("\n" + br.text)
630+
631+
return soup
626632

627633

628634
def _build_xpath_expr(attrs) -> str:
@@ -759,6 +765,10 @@ def _build_doc(self):
759765
else:
760766
if not hasattr(r, "text_content"):
761767
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
768+
769+
for br in r.xpath("*//br"):
770+
br.tail = "\n" + (br.tail or "")
771+
762772
return r
763773

764774
def _parse_thead_tr(self, table):

pandas/tests/io/test_html.py

+29-10
Original file line numberDiff line numberDiff line change
@@ -627,17 +627,17 @@ def try_remove_ws(x):
627627
)
628628
assert df.shape == ground_truth.shape
629629
old = [
630-
"First Vietnamese American BankIn Vietnamese",
631-
"Westernbank Puerto RicoEn Espanol",
632-
"R-G Premier Bank of Puerto RicoEn Espanol",
633-
"EurobankEn Espanol",
634-
"Sanderson State BankEn Espanol",
635-
"Washington Mutual Bank(Including its subsidiary Washington "
630+
"First Vietnamese American Bank In Vietnamese",
631+
"Westernbank Puerto Rico En Espanol",
632+
"R-G Premier Bank of Puerto Rico En Espanol",
633+
"Eurobank En Espanol",
634+
"Sanderson State Bank En Espanol",
635+
"Washington Mutual Bank (Including its subsidiary Washington "
636636
"Mutual Bank FSB)",
637-
"Silver State BankEn Espanol",
638-
"AmTrade International BankEn Espanol",
639-
"Hamilton Bank, NAEn Espanol",
640-
"The Citizens Savings BankPioneer Community Bank, Inc.",
637+
"Silver State Bank En Espanol",
638+
"AmTrade International Bank En Espanol",
639+
"Hamilton Bank, NA En Espanol",
640+
"The Citizens Savings Bank Pioneer Community Bank, Inc.",
641641
]
642642
new = [
643643
"First Vietnamese American Bank",
@@ -1302,3 +1302,22 @@ def test_parse_path_object(self, datapath):
13021302
df1 = self.read_html(file_path_string)[0]
13031303
df2 = self.read_html(file_path)[0]
13041304
tm.assert_frame_equal(df1, df2)
1305+
1306+
def test_parse_br_as_space(self):
1307+
# GH 29528: pd.read_html() convert <br> to space
1308+
result = self.read_html(
1309+
"""
1310+
<table>
1311+
<tr>
1312+
<th>A</th>
1313+
</tr>
1314+
<tr>
1315+
<td>word1<br>word2</td>
1316+
</tr>
1317+
</table>
1318+
"""
1319+
)[0]
1320+
1321+
expected = DataFrame(data=[["word1 word2"]], columns=["A"])
1322+
1323+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)