From d8ff169e4c0a7990216af727e875d727cafaea4b Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Wed, 21 Jun 2023 22:11:27 -0400 Subject: [PATCH 1/5] Updating documentation and adding deprecation logic for read_html. --- doc/source/user_guide/io.rst | 3 +- doc/source/whatsnew/v2.1.0.rst | 3 +- pandas/io/html.py | 15 +++++ pandas/tests/io/test_html.py | 117 ++++++++++++++++----------------- 4 files changed, 77 insertions(+), 61 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 84a78ace8d7c7..de6e05ec9d922 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2652,6 +2652,7 @@ Links can be extracted from cells along with the text using ``extract_links="all .. ipython:: python + from io import StringIO html_table = """ @@ -2664,7 +2665,7 @@ Links can be extracted from cells along with the text using ``extract_links="all """ df = pd.read_html( - html_table, + StringIO(html_table), extract_links="all" )[0] df diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 511e5793608bc..6abb965e23382 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -305,7 +305,8 @@ Deprecations - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) - +- Deprecated literal string/bytes input to :func:`read_excel`, :func:`read_html`, :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) +- .. 
--------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/io/html.py b/pandas/io/html.py index 510ff1fa95868..85dfb2832c4af 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -6,6 +6,8 @@ from __future__ import annotations +import os.path +import warnings from collections import abc import numbers import re @@ -24,6 +26,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1023,6 +1026,9 @@ def read_html( lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be returned. Unless the HTML is extremely simple you will probably need to @@ -1178,6 +1184,15 @@ def read_html( io = stringify_path(io) + if isinstance(io, str) and "\n" in io: + warnings.warn( + "Passing literal html to 'read_html' is deprecated and " + "will be removed in a future version. 
To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _parse( flavor=flavor, io=io, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 5c6c33de5ac5f..15c2b3bb81d42 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -134,7 +134,7 @@ def test_to_html_compat(self): .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] + res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) def test_dtype_backend(self, string_storage, dtype_backend): @@ -163,7 +163,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - result = self.read_html(out, dtype_backend=dtype_backend)[0] + result = self.read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -351,8 +351,8 @@ def test_string(self, spam_data): with open(spam_data, encoding="UTF-8") as f: data = f.read() - df1 = self.read_html(data, match=".*Water.*") - df2 = self.read_html(data, match="Unit") + df1 = self.read_html(StringIO(data), match=".*Water.*") + df2 = self.read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) @@ -493,14 +493,14 @@ def test_empty_tables(self):
""" - result = self.read_html(html) + result = self.read_html(StringIO(html)) assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. result = self.read_html( - """ + StringIO("""
@@ -520,7 +520,7 @@ def test_multiple_tbody(self):
A
""" - )[0] + ))[0] expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) @@ -532,7 +532,7 @@ def test_header_and_one_column(self): as described in issue #9178 """ result = self.read_html( - """ + StringIO("""
@@ -544,7 +544,7 @@ def test_header_and_one_column(self):
Header
""" - )[0] + ))[0] expected = DataFrame(data={"Header": "first"}, index=[0]) @@ -555,7 +555,7 @@ def test_thead_without_tr(self): Ensure parser adds within on malformed HTML. """ result = self.read_html( - """ + StringIO("""
@@ -571,7 +571,7 @@ def test_thead_without_tr(self):
Country
""" - )[0] + ))[0] expected = DataFrame( data=[["Ukraine", "Odessa", 1944]], @@ -612,8 +612,8 @@ def test_tfoot_read(self): data1 = data_template.format(footer="") data2 = data_template.format(footer="footAfootB") - result1 = self.read_html(data1)[0] - result2 = self.read_html(data2)[0] + result1 = self.read_html(StringIO(data1))[0] + result2 = self.read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) @@ -622,7 +622,7 @@ def test_parse_header_of_non_string_column(self): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str result = self.read_html( - """ + StringIO(""" @@ -633,7 +633,7 @@ def test_parse_header_of_non_string_column(self):
S1944
- """, + """), header=0, )[0] @@ -703,7 +703,7 @@ def test_gold_canyon(self, banklist_data): def test_different_number_of_cols(self): expected = self.read_html( - """ + StringIO("""
@@ -732,12 +732,12 @@ def test_different_number_of_cols(self): -
0.222
""", + """), index_col=0, )[0] result = self.read_html( - """ + StringIO("""
@@ -763,7 +763,7 @@ def test_different_number_of_cols(self): -
0.222
""", + """), index_col=0, )[0] @@ -772,7 +772,7 @@ def test_different_number_of_cols(self): def test_colspan_rowspan_1(self): # GH17054 result = self.read_html( - """ + StringIO(""" @@ -786,7 +786,7 @@ def test_colspan_rowspan_1(self):
A
""" - )[0] + ))[0] expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) @@ -801,7 +801,7 @@ def test_colspan_rowspan_copy_values(self): # A B b z C result = self.read_html( - """ + StringIO(""" @@ -815,7 +815,7 @@ def test_colspan_rowspan_copy_values(self):
XC
- """, + """), header=0, )[0] @@ -834,7 +834,7 @@ def test_colspan_rowspan_both_not_1(self): # a b b b D result = self.read_html( - """ + StringIO(""" @@ -845,7 +845,7 @@ def test_colspan_rowspan_both_not_1(self):
AD
- """, + """), header=0, )[0] @@ -864,7 +864,7 @@ def test_rowspan_at_end_of_row(self): # C b result = self.read_html( - """ + StringIO(""" @@ -874,7 +874,7 @@ def test_rowspan_at_end_of_row(self):
AC
- """, + """), header=0, )[0] @@ -886,14 +886,14 @@ def test_rowspan_only_rows(self): # GH17054 result = self.read_html( - """ + StringIO("""
A B
- """, + """), header=0, )[0] @@ -904,7 +904,7 @@ def test_rowspan_only_rows(self): def test_header_inferred_from_rows_with_only_th(self): # GH17054 result = self.read_html( - """ + StringIO(""" @@ -919,7 +919,7 @@ def test_header_inferred_from_rows_with_only_th(self):
A2
- """ + """) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) @@ -930,9 +930,9 @@ def test_header_inferred_from_rows_with_only_th(self): def test_parse_dates_list(self): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() - res = self.read_html(expected, parse_dates=[1], index_col=0) + res = self.read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=["date"], index_col=0) + res = self.read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self): @@ -944,7 +944,7 @@ def test_parse_dates_combine(self): } ) res = self.read_html( - df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1 + StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) @@ -969,7 +969,7 @@ def test_wikipedia_states_multiindex(self, datapath): def test_parser_error_on_empty_header_row(self): result = self.read_html( - """ + StringIO(""" @@ -979,7 +979,7 @@ def test_parser_error_on_empty_header_row(self):
ab
- """, + """), header=[0, 1], ) expected = DataFrame( @@ -993,7 +993,7 @@ def test_parser_error_on_empty_header_row(self): def test_decimal_rows(self): # GH 12907 result = self.read_html( - """ + StringIO(""" @@ -1008,7 +1008,7 @@ def test_decimal_rows(self):
- """, + """), decimal="#", )[0] @@ -1031,7 +1031,7 @@ def test_bool_header_arg(self, spam_data, arg): def test_converters(self): # GH 13461 result = self.read_html( - """ + StringIO("""
@@ -1045,7 +1045,7 @@ def test_converters(self): -
a 0.244
""", + """), converters={"a": str}, )[0] @@ -1056,7 +1056,7 @@ def test_converters(self): def test_na_values(self): # GH 13461 result = self.read_html( - """ + StringIO("""
@@ -1070,7 +1070,7 @@ def test_na_values(self): -
a 0.244
""", + """), na_values=[0.244], )[0] @@ -1096,16 +1096,16 @@ def test_keep_default_na(self): """ expected_df = DataFrame({"a": ["N/A", "NA"]}) - html_df = self.read_html(html_data, keep_default_na=False)[0] + html_df = self.read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) - html_df = self.read_html(html_data, keep_default_na=True)[0] + html_df = self.read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_preserve_empty_rows(self): result = self.read_html( - """ + StringIO(""" @@ -1121,7 +1121,7 @@ def test_preserve_empty_rows(self):
A
""" - )[0] + ))[0] expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) @@ -1129,7 +1129,7 @@ def test_preserve_empty_rows(self): def test_ignore_empty_rows_when_inferring_header(self): result = self.read_html( - """ + StringIO(""" @@ -1141,7 +1141,7 @@ def test_ignore_empty_rows_when_inferring_header(self):
""" - )[0] + ))[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) @@ -1158,7 +1158,7 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) - html_df = self.read_html(html)[0] + html_df = self.read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): @@ -1208,8 +1208,7 @@ def test_to_html_borderless(self): ) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 - data = StringIO( - """ + data = """ @@ -1228,9 +1227,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
""" - ) - dfs = self.read_html(data, displayed_only=displayed_only) + + dfs = self.read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: @@ -1256,7 +1255,7 @@ def test_displayed_only_with_many_elements(self, displayed_only): """ - result = read_html(html_table, displayed_only=displayed_only)[0] + result = read_html(StringIO(html_table), displayed_only=displayed_only)[0] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @@ -1382,7 +1381,7 @@ def test_parse_path_object(self, datapath): def test_parse_br_as_space(self): # GH 29528: pd.read_html() convert
to space result = self.read_html( - """ + StringIO(""" @@ -1392,7 +1391,7 @@ def test_parse_br_as_space(self):
A
""" - )[0] + ))[0] expected = DataFrame(data=[["word1 word2"]], columns=["A"]) @@ -1462,7 +1461,7 @@ def test_extract_links(self, arg): elif arg == "header": head_exp = gh_13141_expected["head_extract"] - result = self.read_html(gh_13141_data, extract_links=arg)[0] + result = self.read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan, downcast=False) tm.assert_frame_equal(result, expected) @@ -1486,7 +1485,7 @@ def test_extract_links_all_no_header(self): """ - result = self.read_html(data, extract_links="all")[0] + result = self.read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) @@ -1519,6 +1518,6 @@ def test_style_tag(self): """ - result = self.read_html(data)[0] + result = self.read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) From 8750e7b3069c4c7c6fec4c5b35df12456a842e75 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Wed, 21 Jun 2023 22:14:30 -0400 Subject: [PATCH 2/5] Updating documentation and adding deprecation logic for read_html. --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/io/html.py | 3 +- pandas/tests/io/test_html.py | 119 ++++++++++++++++++++++----------- 3 files changed, 81 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6abb965e23382..0e5056e156000 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,6 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. 
Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) +- Deprecated literal string/bytes input to :func:`read_excel`, :func:`read_html`, :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) @@ -305,7 +306,6 @@ Deprecations - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) -- Deprecated literal string/bytes input to :func:`read_excel`, :func:`read_html`, :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) - .. --------------------------------------------------------------------------- .. 
_whatsnew_210.performance: diff --git a/pandas/io/html.py b/pandas/io/html.py index 85dfb2832c4af..8cc1889c2f542 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -6,8 +6,6 @@ from __future__ import annotations -import os.path -import warnings from collections import abc import numbers import re @@ -19,6 +17,7 @@ Sequence, cast, ) +import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 15c2b3bb81d42..59eb11d8bfd3f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -134,7 +134,9 @@ def test_to_html_compat(self): .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[0] + res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[ + 0 + ] tm.assert_frame_equal(res, df) def test_dtype_backend(self, string_storage, dtype_backend): @@ -500,7 +502,8 @@ def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. result = self.read_html( - StringIO(""" + StringIO( + """
@@ -520,7 +523,8 @@ def test_multiple_tbody(self):
A
""" - ))[0] + ) + )[0] expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) @@ -532,7 +536,8 @@ def test_header_and_one_column(self): as described in issue #9178 """ result = self.read_html( - StringIO(""" + StringIO( + """
@@ -544,7 +549,8 @@ def test_header_and_one_column(self):
Header
""" - ))[0] + ) + )[0] expected = DataFrame(data={"Header": "first"}, index=[0]) @@ -555,7 +561,8 @@ def test_thead_without_tr(self): Ensure parser adds within on malformed HTML. """ result = self.read_html( - StringIO(""" + StringIO( + """
@@ -571,7 +578,8 @@ def test_thead_without_tr(self):
Country
""" - ))[0] + ) + )[0] expected = DataFrame( data=[["Ukraine", "Odessa", 1944]], @@ -622,7 +630,8 @@ def test_parse_header_of_non_string_column(self): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str result = self.read_html( - StringIO(""" + StringIO( + """ @@ -633,7 +642,8 @@ def test_parse_header_of_non_string_column(self):
S1944
- """), + """ + ), header=0, )[0] @@ -703,7 +713,8 @@ def test_gold_canyon(self, banklist_data): def test_different_number_of_cols(self): expected = self.read_html( - StringIO(""" + StringIO( + """
@@ -732,12 +743,14 @@ def test_different_number_of_cols(self): -
0.222
"""), + """ + ), index_col=0, )[0] result = self.read_html( - StringIO(""" + StringIO( + """
@@ -763,7 +776,8 @@ def test_different_number_of_cols(self): -
0.222
"""), + """ + ), index_col=0, )[0] @@ -772,7 +786,8 @@ def test_different_number_of_cols(self): def test_colspan_rowspan_1(self): # GH17054 result = self.read_html( - StringIO(""" + StringIO( + """ @@ -786,7 +801,8 @@ def test_colspan_rowspan_1(self):
A
""" - ))[0] + ) + )[0] expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) @@ -801,7 +817,8 @@ def test_colspan_rowspan_copy_values(self): # A B b z C result = self.read_html( - StringIO(""" + StringIO( + """ @@ -815,7 +832,8 @@ def test_colspan_rowspan_copy_values(self):
XC
- """), + """ + ), header=0, )[0] @@ -834,7 +852,8 @@ def test_colspan_rowspan_both_not_1(self): # a b b b D result = self.read_html( - StringIO(""" + StringIO( + """ @@ -845,7 +864,8 @@ def test_colspan_rowspan_both_not_1(self):
AD
- """), + """ + ), header=0, )[0] @@ -864,7 +884,8 @@ def test_rowspan_at_end_of_row(self): # C b result = self.read_html( - StringIO(""" + StringIO( + """ @@ -874,7 +895,8 @@ def test_rowspan_at_end_of_row(self):
AC
- """), + """ + ), header=0, )[0] @@ -886,14 +908,16 @@ def test_rowspan_only_rows(self): # GH17054 result = self.read_html( - StringIO(""" + StringIO( + """
A B
- """), + """ + ), header=0, )[0] @@ -904,7 +928,8 @@ def test_rowspan_only_rows(self): def test_header_inferred_from_rows_with_only_th(self): # GH17054 result = self.read_html( - StringIO(""" + StringIO( + """ @@ -919,7 +944,8 @@ def test_header_inferred_from_rows_with_only_th(self):
A2
- """) + """ + ) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) @@ -969,7 +995,8 @@ def test_wikipedia_states_multiindex(self, datapath): def test_parser_error_on_empty_header_row(self): result = self.read_html( - StringIO(""" + StringIO( + """ @@ -979,7 +1006,8 @@ def test_parser_error_on_empty_header_row(self):
ab
- """), + """ + ), header=[0, 1], ) expected = DataFrame( @@ -993,7 +1021,8 @@ def test_parser_error_on_empty_header_row(self): def test_decimal_rows(self): # GH 12907 result = self.read_html( - StringIO(""" + StringIO( + """ @@ -1008,7 +1037,8 @@ def test_decimal_rows(self):
- """), + """ + ), decimal="#", )[0] @@ -1031,7 +1061,8 @@ def test_bool_header_arg(self, spam_data, arg): def test_converters(self): # GH 13461 result = self.read_html( - StringIO(""" + StringIO( + """
@@ -1045,7 +1076,8 @@ def test_converters(self): -
a 0.244
"""), + """ + ), converters={"a": str}, )[0] @@ -1056,7 +1088,8 @@ def test_converters(self): def test_na_values(self): # GH 13461 result = self.read_html( - StringIO(""" + StringIO( + """
@@ -1070,7 +1103,8 @@ def test_na_values(self): -
a 0.244
"""), + """ + ), na_values=[0.244], )[0] @@ -1105,7 +1139,8 @@ def test_keep_default_na(self): def test_preserve_empty_rows(self): result = self.read_html( - StringIO(""" + StringIO( + """ @@ -1121,7 +1156,8 @@ def test_preserve_empty_rows(self):
A
""" - ))[0] + ) + )[0] expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) @@ -1129,7 +1165,8 @@ def test_preserve_empty_rows(self): def test_ignore_empty_rows_when_inferring_header(self): result = self.read_html( - StringIO(""" + StringIO( + """ @@ -1141,7 +1178,8 @@ def test_ignore_empty_rows_when_inferring_header(self):
""" - ))[0] + ) + )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) @@ -1228,7 +1266,6 @@ def test_displayed_only(self, displayed_only, exp0, exp1): """ - dfs = self.read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) @@ -1381,7 +1418,8 @@ def test_parse_path_object(self, datapath): def test_parse_br_as_space(self): # GH 29528: pd.read_html() convert
to space result = self.read_html( - StringIO(""" + StringIO( + """ @@ -1391,7 +1429,8 @@ def test_parse_br_as_space(self):
A
""" - ))[0] + ) + )[0] expected = DataFrame(data=[["word1 word2"]], columns=["A"]) From 4f0c654ad797358c95279dfd9a89c0a09fa4e2f9 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Wed, 21 Jun 2023 23:04:46 -0400 Subject: [PATCH 3/5] Adding unit test and fixing documentation errors. --- doc/source/whatsnew/v0.24.0.rst | 5 +++-- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/tests/io/test_html.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fa4ae8f25cc05..a23aded7ef512 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -286,7 +286,8 @@ value. (:issue:`17054`) .. ipython:: python - result = pd.read_html(""" + from io import StringIO + result = pd.read_html(StringIO(""" @@ -298,7 +299,7 @@ value. (:issue:`17054`) -
12
""") + """)) *Previous behavior*: diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0e5056e156000..383c4abffb732 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -307,6 +307,7 @@ Deprecations - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) - + .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 59eb11d8bfd3f..9b1ee2dcce24e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -38,6 +38,39 @@ import pandas.io.html +def test_literal_html_deprecation(): + # GH 53785 + msg = ( + "Passing literal html to 'read_html' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + read_html( + """ + + + + + + + + + + + + + + + + + + +
AB
12
34
""" + ) + + @pytest.fixture( params=[ "chinese_utf-16.html", From c0bdf32810a27052339efd599a08a0662a2037d6 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Wed, 21 Jun 2023 23:40:36 -0400 Subject: [PATCH 4/5] Fixing failing unit test --- pandas/tests/io/test_html.py | 65 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9b1ee2dcce24e..02c1646dc1a8d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -38,39 +38,6 @@ import pandas.io.html -def test_literal_html_deprecation(): - # GH 53785 - msg = ( - "Passing literal html to 'read_html' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) - - with tm.assert_produces_warning(FutureWarning, match=msg): - read_html( - """ - - - - - - - - - - - - - - - - - - -
AB
12
34
""" - ) - - @pytest.fixture( params=[ "chinese_utf-16.html", @@ -141,6 +108,38 @@ def test_same_ordering(datapath): ], ) class TestReadHtml: + def test_literal_html_deprecation(self): + # GH 53785 + msg = ( + "Passing literal html to 'read_html' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + self.read_html( + """ + + + + + + + + + + + + + + + + + + +
AB
12
34
 """ + ) + @pytest.fixture def spam_data(self, datapath): return datapath("io", "data", "html", "spam.html") From ca9a34134cadb9110334b07640f6f5425dec959d Mon Sep 17 00:00:00 2001 From: rmhowe425 <45905457+rmhowe425@users.noreply.github.com> Date: Thu, 22 Jun 2023 12:33:27 -0400 Subject: [PATCH 5/5] Update v2.1.0.rst --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 383c4abffb732..20e0e88ea11dc 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,7 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) -- Deprecated literal string/bytes input to :func:`read_excel`, :func:`read_html`, :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) +- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)