From 54d47e4530d2955019dc6b4f17c99b87debf4850 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Mon, 30 Apr 2018 18:18:34 -0700 Subject: [PATCH 1/2] Read from multiple <tbody> within a <table> refs #20690 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/html.py | 22 +++++++++++++--------- pandas/tests/io/test_html.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 567a720fa5e32..35683701efdfd 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -443,6 +443,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports itk rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) +- :func:`read_html` now reads all ``<tbody>`` elements in a ``<table>``, not just the first. (:issue:`20690`) - :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) - :class:`pandas.tseries.api.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). diff --git a/pandas/io/html.py b/pandas/io/html.py index ba5da1b4e3a76..03d87e322ba03 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -324,7 +324,7 @@ def _parse_thead(self, table): raise com.AbstractMethodError(self) def _parse_tbody(self, table): - """Return the body of the table. + """Return the list of tbody elements from the parsed table element. Parameters ---------- @@ -333,8 +333,8 @@ def _parse_tbody(self, table): Returns ------- - tbody : node-like - A <tbody>...</tbody> element. + tbodys : list of node-like + A list of <tbody>...</tbody>
elements """ raise com.AbstractMethodError(self) @@ -388,13 +388,17 @@ def _parse_raw_tfoot(self, table): np.array(res).squeeze()) if res and len(res) == 1 else res def _parse_raw_tbody(self, table): - tbody = self._parse_tbody(table) + tbodies = self._parse_tbody(table) - try: - res = self._parse_tr(tbody[0]) - except IndexError: - res = self._parse_tr(table) - return self._parse_raw_data(res) + raw_data = [] + + if len(tbodies) > 0: + for tbody in tbodies: + raw_data.extend(self._parse_tr(tbody)) + else: + raw_data.extend(self._parse_tr(table)) + + return self._parse_raw_data(raw_data) def _handle_hidden_tables(self, tbl_list, attr_name): """Returns list of tables, potentially removing hidden elements diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 078b5f8448d46..77420ef86c6e0 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -396,6 +396,34 @@ def test_empty_tables(self): res2 = self.read_html(StringIO(data2)) assert_framelist_equal(res1, res2) + def test_multiple_tbody(self): + """ + Read all tbody tags within a single table. + """ + data = '''<table> + <thead> + <tr> + <th>A</th> + <th>B</th> + </tr> + </thead> + <tbody> + <tr> + <td>1</td> + <td>2</td> + </tr> + </tbody> + <tbody> + <tr> + <td>3</td> + <td>4</td> + </tr> + </tbody> + </table>''' + expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) + result = self.read_html(StringIO(data)) + tm.assert_frame_equal(result[0], expected) + def test_header_and_one_column(self): """ Don't fail with bs4 when there is a header and only one column From a881c602933585fe3177c39a5f6f5df8e5c8d9a1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 May 2018 09:09:24 -0500 Subject: [PATCH 2/2] Updates --- pandas/io/html.py | 2 +- pandas/tests/io/test_html.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 03d87e322ba03..8fd876e85889f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -392,7 +392,7 @@ def _parse_raw_tbody(self, table): raw_data = [] - if len(tbodies) > 0: + if tbodies: for tbody in tbodies: raw_data.extend(self._parse_tr(tbody)) else: diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 77420ef86c6e0..a56946b82b027 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -397,9 +397,8 @@ def test_empty_tables(self): assert_framelist_equal(res1, res2) def test_multiple_tbody(self): - """ - Read all tbody tags within a single table. - """ + # GH-20690 + # Read all tbody tags within a single table. data = '''<table> <thead> <tr> @@ -421,8 +420,8 @@ def test_multiple_tbody(self): </tbody> </table>''' expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) - result = self.read_html(StringIO(data)) - tm.assert_frame_equal(result[0], expected) + result = self.read_html(StringIO(data))[0] + tm.assert_frame_equal(result, expected) def test_header_and_one_column(self): """