From c8cade5bfab37a04af826b810f0fc868d2728113 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Thu, 22 Jun 2023 19:05:56 -0400 Subject: [PATCH 1/5] Updating documentation and adding deprecation logic for read_html. --- doc/source/whatsnew/v2.1.0.rst | 2 + pandas/io/html.py | 14 +++ pandas/tests/io/test_html.py | 172 +++++++++++++++++++++++---------- 3 files changed, 137 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2436d91690ed3..005ee45d641f5 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -305,6 +305,8 @@ Deprecations - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) +- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) +- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/io/html.py b/pandas/io/html.py index 510ff1fa95868..8cc1889c2f542 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,6 +17,7 @@ Sequence, cast, ) +import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -24,6 +25,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1023,6 +1025,9 @@ def read_html( lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be returned. Unless the HTML is extremely simple you will probably need to @@ -1178,6 +1183,15 @@ def read_html( io = stringify_path(io) + if isinstance(io, str) and "\n" in io: + warnings.warn( + "Passing literal html to 'read_html' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _parse( flavor=flavor, io=io, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 5c6c33de5ac5f..02c1646dc1a8d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -108,6 +108,38 @@ def test_same_ordering(datapath): ], ) class TestReadHtml: + def test_literal_html_deprecation(self): + # GH 53785 + msg = ( + "Passing literal html to 'read_html' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + self.read_html( + """ + + + + + + + + + + + + + + + + + + +
AB
12
34
""" + ) + @pytest.fixture def spam_data(self, datapath): return datapath("io", "data", "html", "spam.html") @@ -134,7 +166,9 @@ def test_to_html_compat(self): .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] + res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[ + 0 + ] tm.assert_frame_equal(res, df) def test_dtype_backend(self, string_storage, dtype_backend): @@ -163,7 +197,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - result = self.read_html(out, dtype_backend=dtype_backend)[0] + result = self.read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -351,8 +385,8 @@ def test_string(self, spam_data): with open(spam_data, encoding="UTF-8") as f: data = f.read() - df1 = self.read_html(data, match=".*Water.*") - df2 = self.read_html(data, match="Unit") + df1 = self.read_html(StringIO(data), match=".*Water.*") + df2 = self.read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) @@ -493,14 +527,15 @@ def test_empty_tables(self): """ - result = self.read_html(html) + result = self.read_html(StringIO(html)) assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. result = self.read_html( - """ + StringIO( + """
@@ -520,6 +555,7 @@ def test_multiple_tbody(self):
A
""" + ) )[0] expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) @@ -532,7 +568,8 @@ def test_header_and_one_column(self): as described in issue #9178 """ result = self.read_html( - """ + StringIO( + """
@@ -544,6 +581,7 @@ def test_header_and_one_column(self):
Header
""" + ) )[0] expected = DataFrame(data={"Header": "first"}, index=[0]) @@ -555,7 +593,8 @@ def test_thead_without_tr(self): Ensure parser adds within on malformed HTML. """ result = self.read_html( - """ + StringIO( + """
@@ -571,6 +610,7 @@ def test_thead_without_tr(self):
Country
""" + ) )[0] expected = DataFrame( @@ -612,8 +652,8 @@ def test_tfoot_read(self): data1 = data_template.format(footer="") data2 = data_template.format(footer="footAfootB") - result1 = self.read_html(data1)[0] - result2 = self.read_html(data2)[0] + result1 = self.read_html(StringIO(data1))[0] + result2 = self.read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) @@ -622,7 +662,8 @@ def test_parse_header_of_non_string_column(self): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str result = self.read_html( - """ + StringIO( + """ @@ -633,7 +674,8 @@ def test_parse_header_of_non_string_column(self):
S1944
- """, + """ + ), header=0, )[0] @@ -703,7 +745,8 @@ def test_gold_canyon(self, banklist_data): def test_different_number_of_cols(self): expected = self.read_html( - """ + StringIO( + """
@@ -732,12 +775,14 @@ def test_different_number_of_cols(self): -
0.222
""", + """ + ), index_col=0, )[0] result = self.read_html( - """ + StringIO( + """
@@ -763,7 +808,8 @@ def test_different_number_of_cols(self): -
0.222
""", + """ + ), index_col=0, )[0] @@ -772,7 +818,8 @@ def test_different_number_of_cols(self): def test_colspan_rowspan_1(self): # GH17054 result = self.read_html( - """ + StringIO( + """ @@ -786,6 +833,7 @@ def test_colspan_rowspan_1(self):
A
""" + ) )[0] expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) @@ -801,7 +849,8 @@ def test_colspan_rowspan_copy_values(self): # A B b z C result = self.read_html( - """ + StringIO( + """ @@ -815,7 +864,8 @@ def test_colspan_rowspan_copy_values(self):
XC
- """, + """ + ), header=0, )[0] @@ -834,7 +884,8 @@ def test_colspan_rowspan_both_not_1(self): # a b b b D result = self.read_html( - """ + StringIO( + """ @@ -845,7 +896,8 @@ def test_colspan_rowspan_both_not_1(self):
AD
- """, + """ + ), header=0, )[0] @@ -864,7 +916,8 @@ def test_rowspan_at_end_of_row(self): # C b result = self.read_html( - """ + StringIO( + """ @@ -874,7 +927,8 @@ def test_rowspan_at_end_of_row(self):
AC
- """, + """ + ), header=0, )[0] @@ -886,14 +940,16 @@ def test_rowspan_only_rows(self): # GH17054 result = self.read_html( - """ + StringIO( + """
A B
- """, + """ + ), header=0, )[0] @@ -904,7 +960,8 @@ def test_rowspan_only_rows(self): def test_header_inferred_from_rows_with_only_th(self): # GH17054 result = self.read_html( - """ + StringIO( + """ @@ -920,6 +977,7 @@ def test_header_inferred_from_rows_with_only_th(self):
A
""" + ) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) @@ -930,9 +988,9 @@ def test_header_inferred_from_rows_with_only_th(self): def test_parse_dates_list(self): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() - res = self.read_html(expected, parse_dates=[1], index_col=0) + res = self.read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=["date"], index_col=0) + res = self.read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self): @@ -944,7 +1002,7 @@ def test_parse_dates_combine(self): } ) res = self.read_html( - df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1 + StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) @@ -969,7 +1027,8 @@ def test_wikipedia_states_multiindex(self, datapath): def test_parser_error_on_empty_header_row(self): result = self.read_html( - """ + StringIO( + """ @@ -979,7 +1038,8 @@ def test_parser_error_on_empty_header_row(self):
ab
- """, + """ + ), header=[0, 1], ) expected = DataFrame( @@ -993,7 +1053,8 @@ def test_parser_error_on_empty_header_row(self): def test_decimal_rows(self): # GH 12907 result = self.read_html( - """ + StringIO( + """ @@ -1008,7 +1069,8 @@ def test_decimal_rows(self):
- """, + """ + ), decimal="#", )[0] @@ -1031,7 +1093,8 @@ def test_bool_header_arg(self, spam_data, arg): def test_converters(self): # GH 13461 result = self.read_html( - """ + StringIO( + """
@@ -1045,7 +1108,8 @@ def test_converters(self): -
a 0.244
""", + """ + ), converters={"a": str}, )[0] @@ -1056,7 +1120,8 @@ def test_converters(self): def test_na_values(self): # GH 13461 result = self.read_html( - """ + StringIO( + """
@@ -1070,7 +1135,8 @@ def test_na_values(self): -
a 0.244
""", + """ + ), na_values=[0.244], )[0] @@ -1096,16 +1162,17 @@ def test_keep_default_na(self): """ expected_df = DataFrame({"a": ["N/A", "NA"]}) - html_df = self.read_html(html_data, keep_default_na=False)[0] + html_df = self.read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) - html_df = self.read_html(html_data, keep_default_na=True)[0] + html_df = self.read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_preserve_empty_rows(self): result = self.read_html( - """ + StringIO( + """ @@ -1121,6 +1188,7 @@ def test_preserve_empty_rows(self):
A
""" + ) )[0] expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) @@ -1129,7 +1197,8 @@ def test_preserve_empty_rows(self): def test_ignore_empty_rows_when_inferring_header(self): result = self.read_html( - """ + StringIO( + """ @@ -1141,6 +1210,7 @@ def test_ignore_empty_rows_when_inferring_header(self):
""" + ) )[0] columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) @@ -1158,7 +1228,7 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) - html_df = self.read_html(html)[0] + html_df = self.read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): @@ -1208,8 +1278,7 @@ def test_to_html_borderless(self): ) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 - data = StringIO( - """ + data = """ @@ -1228,9 +1297,8 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
""" - ) - dfs = self.read_html(data, displayed_only=displayed_only) + dfs = self.read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: @@ -1256,7 +1324,7 @@ def test_displayed_only_with_many_elements(self, displayed_only): """ - result = read_html(html_table, displayed_only=displayed_only)[0] + result = read_html(StringIO(html_table), displayed_only=displayed_only)[0] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @@ -1382,7 +1450,8 @@ def test_parse_path_object(self, datapath): def test_parse_br_as_space(self): # GH 29528: pd.read_html() convert
to space result = self.read_html( - """ + StringIO( + """ @@ -1392,6 +1461,7 @@ def test_parse_br_as_space(self):
A
""" + ) )[0] expected = DataFrame(data=[["word1 word2"]], columns=["A"]) @@ -1462,7 +1532,7 @@ def test_extract_links(self, arg): elif arg == "header": head_exp = gh_13141_expected["head_extract"] - result = self.read_html(gh_13141_data, extract_links=arg)[0] + result = self.read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan, downcast=False) tm.assert_frame_equal(result, expected) @@ -1486,7 +1556,7 @@ def test_extract_links_all_no_header(self): """ - result = self.read_html(data, extract_links="all")[0] + result = self.read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) @@ -1519,6 +1589,6 @@ def test_style_tag(self): """ - result = self.read_html(data)[0] + result = self.read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) From 6c39ea835e924d7fa17054003e393d6b45e3fa93 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Thu, 22 Jun 2023 20:17:29 -0400 Subject: [PATCH 2/5] Fixing formatting errors --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 84a78ace8d7c7..0084e885db2b5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2664,7 +2664,7 @@ Links can be extracted from cells along with the text using ``extract_links="all """ df = pd.read_html( - html_table, + StringIO(html_table), extract_links="all" )[0] df diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 005ee45d641f5..d7fda196c1aea 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,6 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) +- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) @@ -305,7 +306,6 @@ Deprecations - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) -- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) - .. --------------------------------------------------------------------------- From cdd27e26beb2e77e7487b91a10a40ec4c14f1731 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Thu, 22 Jun 2023 21:07:31 -0400 Subject: [PATCH 3/5] Fixing documentation errors --- doc/source/whatsnew/v0.24.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fa4ae8f25cc05..7f5025e6ce60b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -286,7 +286,7 @@ value. (:issue:`17054`) .. ipython:: python - result = pd.read_html(""" + result = pd.read_html(StringIO(""" @@ -298,7 +298,7 @@ value. (:issue:`17054`) -
12
""") + """)) *Previous behavior*: From f6f22b435f03a19b11b833c235b53c75c0d7731e Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 18:42:54 -0400 Subject: [PATCH 4/5] Updating deprecation logic and documentation per reviewer recommendations. --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/io/html.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 89ce9f07f465d..391d46a00a881 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,7 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) -- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) +- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 8cc1889c2f542..8fd5f6f9c2e31 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -38,6 +38,7 @@ from pandas.io.common import ( file_exists, get_handle, + is_file_like, is_url, stringify_path, urlopen, @@ -1027,6 +1028,7 @@ def read_html( .. deprecated:: 2.1.0 Passing html literal strings is deprecated. + Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be @@ -1183,7 +1185,7 @@ def read_html( io = stringify_path(io) - if isinstance(io, str) and "\n" in io: + if isinstance(io, str) and not is_file_like(io) and "\n" in io: warnings.warn( "Passing literal html to 'read_html' is deprecated and " "will be removed in a future version. To read from a " From 0e6176e26be6a5e57ef289d15f7852ec78b7ff2d Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Tue, 27 Jun 2023 18:44:05 -0400 Subject: [PATCH 5/5] Updating implementation per reviewer recommendations. --- pandas/io/html.py | 10 +++++++++- pandas/tests/io/test_html.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 8fd5f6f9c2e31..606bdc5a326a2 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -39,6 +39,7 @@ file_exists, get_handle, is_file_like, + is_fsspec_url, is_url, stringify_path, urlopen, @@ -1185,7 +1186,14 @@ def read_html( io = stringify_path(io) - if isinstance(io, str) and not is_file_like(io) and "\n" in io: + if isinstance(io, str) and not any( + [ + is_file_like(io), + file_exists(io), + is_url(io), + is_fsspec_url(io), + ] + ): warnings.warn( "Passing literal html to 'read_html' is deprecated and " "will be removed in a future version. To read from a " diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 02c1646dc1a8d..dfd2389b7a097 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -87,7 +87,7 @@ def test_invalid_flavor(): msg = r"\{" + flavor + r"\} is not a valid set of flavors" with pytest.raises(ValueError, match=msg): - read_html(url, match="google", flavor=flavor) + read_html(StringIO(url), match="google", flavor=flavor) @td.skip_if_no("bs4")