diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 6cbee83247692..e4e46299d101b 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -497,6 +497,7 @@ I/O
- Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`)
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
- Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
+- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements(:issue:`45598`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index efcbb3c588ce9..b58a408806b54 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -622,7 +622,13 @@ def _build_doc(self):
else:
udoc = bdoc
from_encoding = self.encoding
- return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+ soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+ for br in soup.find_all("br"):
+ br.replace_with("\n" + br.text)
+
+ return soup
def _build_xpath_expr(attrs) -> str:
@@ -759,6 +765,10 @@ def _build_doc(self):
else:
if not hasattr(r, "text_content"):
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+
+ for br in r.xpath("*//br"):
+ br.tail = "\n" + (br.tail or "")
+
return r
def _parse_thead_tr(self, table):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 99fa31726445a..b0009a45edd72 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -627,17 +627,17 @@ def try_remove_ws(x):
)
assert df.shape == ground_truth.shape
old = [
- "First Vietnamese American BankIn Vietnamese",
- "Westernbank Puerto RicoEn Espanol",
- "R-G Premier Bank of Puerto RicoEn Espanol",
- "EurobankEn Espanol",
- "Sanderson State BankEn Espanol",
- "Washington Mutual Bank(Including its subsidiary Washington "
+ "First Vietnamese American Bank In Vietnamese",
+ "Westernbank Puerto Rico En Espanol",
+ "R-G Premier Bank of Puerto Rico En Espanol",
+ "Eurobank En Espanol",
+ "Sanderson State Bank En Espanol",
+ "Washington Mutual Bank (Including its subsidiary Washington "
"Mutual Bank FSB)",
- "Silver State BankEn Espanol",
- "AmTrade International BankEn Espanol",
- "Hamilton Bank, NAEn Espanol",
- "The Citizens Savings BankPioneer Community Bank, Inc.",
+ "Silver State Bank En Espanol",
+ "AmTrade International Bank En Espanol",
+ "Hamilton Bank, NA En Espanol",
+ "The Citizens Savings Bank Pioneer Community Bank, Inc.",
]
new = [
"First Vietnamese American Bank",
@@ -1302,3 +1302,22 @@ def test_parse_path_object(self, datapath):
df1 = self.read_html(file_path_string)[0]
df2 = self.read_html(file_path)[0]
tm.assert_frame_equal(df1, df2)
+
+ def test_parse_br_as_space(self):
+ # GH 29528: pd.read_html() convert <br> to space
+ result = self.read_html(
+ """
+
A | +
---|
word1 word2 |
+