diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6cbee83247692..e4e46299d101b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -497,6 +497,7 @@ I/O - Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) +- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements(:issue:`45598`) diff --git a/pandas/io/html.py b/pandas/io/html.py index efcbb3c588ce9..b58a408806b54 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -622,7 +622,13 @@ def _build_doc(self): else: udoc = bdoc from_encoding = self.encoding - return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup def _build_xpath_expr(attrs) -> str: @@ -759,6 +765,10 @@ def _build_doc(self): else: if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or "") + return r def _parse_thead_tr(self, table): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 99fa31726445a..b0009a45edd72 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -627,17 +627,17 @@ def try_remove_ws(x): ) assert df.shape == ground_truth.shape old = [ - "First Vietnamese American BankIn Vietnamese", - "Westernbank Puerto RicoEn Espanol", - "R-G Premier Bank of Puerto RicoEn Espanol", - "EurobankEn Espanol", - "Sanderson State BankEn Espanol", - "Washington Mutual Bank(Including its subsidiary Washington " + "First Vietnamese American Bank In Vietnamese", + "Westernbank Puerto Rico En Espanol", + "R-G Premier Bank of Puerto Rico En Espanol", + "Eurobank En Espanol", + "Sanderson State Bank En Espanol", + "Washington Mutual Bank (Including its subsidiary Washington " "Mutual Bank FSB)", - "Silver State BankEn Espanol", - "AmTrade International BankEn Espanol", - "Hamilton Bank, NAEn Espanol", - "The Citizens Savings 
BankPioneer Community Bank, Inc.", + "Silver State Bank En Espanol", + "AmTrade International Bank En Espanol", + "Hamilton Bank, NA En Espanol", + "The Citizens Savings Bank Pioneer Community Bank, Inc.", ] new = [ "First Vietnamese American Bank", @@ -1302,3 +1302,22 @@ def test_parse_path_object(self, datapath): df1 = self.read_html(file_path_string)[0] df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) + + def test_parse_br_as_space(self): + # GH 29528: pd.read_html() convert <br> to space + result = self.read_html( + """ + + + + + + + +
A
word1
word2
+ """ + )[0] + + expected = DataFrame(data=[["word1 word2"]], columns=["A"]) + + tm.assert_frame_equal(result, expected)