From d17fe216a44e66914ae029537ca0f2cde8a06259 Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Mon, 9 Nov 2020 22:40:45 -0800 Subject: [PATCH 1/9] BUG: read_html - file path cannot be pathlib.Path type --- pandas/io/html.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 1534e42d8fb5a..27e357e3fff4e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -9,6 +9,7 @@ import os import re from typing import Dict, List, Optional, Pattern, Sequence, Tuple, Union +from pathlib import Path from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency @@ -20,7 +21,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.io.common import is_url, urlopen, validate_header_arg +from pandas.io.common import is_url, urlopen, validate_header_arg, stringify_path from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -1080,6 +1081,10 @@ def read_html( "data (you passed a negative value)" ) validate_header_arg(header) + + if isinstance(io, Path): + io = stringify_path(io) + return _parse( flavor=flavor, io=io, From e06477590bb15e6eb1455a50686dc870bf49867b Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 00:18:45 -0800 Subject: [PATCH 2/9] BUG: read_html - file path cannot be pathlib.Path type --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/html.py | 6 ++---- pandas/tests/io/test_html.py | 9 +++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e488ca52be8a0..c98128a39fd8a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -507,6 +507,7 @@ I/O - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) +- Bug in :func:`read_html` when parsing ``pathlib.Path`` object as html path (:issue:`37705`) Plotting ^^^^^^^^ diff --git a/pandas/io/html.py b/pandas/io/html.py index 27e357e3fff4e..334a3dab6c13a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -9,7 +9,6 @@ import os import re from typing import Dict, List, Optional, Pattern, Sequence, Tuple, Union -from pathlib import Path from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency @@ -21,7 +20,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.io.common import is_url, urlopen, validate_header_arg, stringify_path +from pandas.io.common import is_url, stringify_path, urlopen, validate_header_arg from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -1082,8 +1081,7 @@ def read_html( ) validate_header_arg(header) - if isinstance(io, Path): - io = stringify_path(io) + io = stringify_path(io) return _parse( flavor=flavor, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f929d4ac31484..41907ccc62d40 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -2,6 +2,7 @@ from importlib import reload from io import BytesIO, StringIO import os +from pathlib import Path import re import threading from urllib.error import URLError @@ -1233,3 +1234,11 @@ def run(self): while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err + + def test_parse_Path_object(self): + file_path_string = r'pandas/tests/io/data/html/spam.html' + file_path = Path(file_path_string) + df1 = pd.read_html(file_path_string)[0] + df2 = pd.read_html(file_path)[0] + tm.assert_frame_equal(df1, df2) + From 8e1f8a86b532a0c80d4b5e147bf0835d5c80029d Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 00:25:15 -0800 Subject: [PATCH 3/9] BUG: read_html - file path cannot be pathlib.Path type --- pandas/tests/io/test_html.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 41907ccc62d40..a8df8773be7df 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1238,7 +1238,6 @@ def run(self): def test_parse_Path_object(self): file_path_string = r'pandas/tests/io/data/html/spam.html' file_path = Path(file_path_string) - df1 = pd.read_html(file_path_string)[0] + df1 = pd.read_html(file_path_string)[0] df2 = pd.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - From 57814896557f86a602fd9eeaf514adab75adae20 Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 00:39:25 -0800 Subject: [PATCH 4/9] closes #37705 --- pandas/tests/io/test_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a8df8773be7df..292d2c4c98397 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1236,8 +1236,8 @@ def run(self): assert None is helper_thread1.err is helper_thread2.err def test_parse_Path_object(self): - file_path_string = r'pandas/tests/io/data/html/spam.html' + file_path_string = r"pandas/tests/io/data/html/spam.html" file_path = Path(file_path_string) - df1 = pd.read_html(file_path_string)[0] - df2 = pd.read_html(file_path)[0] + df1 = self.read_html(file_path_string)[0] + df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) From e88ff154132854e5804b3883604ae2e1576f94c5 Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 09:43:06 -0800 Subject: [PATCH 5/9] Add comments closes #37705 --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/io/test_html.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c98128a39fd8a..012000fd717fb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -507,7 +507,7 @@ I/O - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) -- Bug in :func:`read_html` when parsing ``pathlib.Path`` object as html path (:issue:`37705`) +- Bug in :func:`read_html` caused a ``TypeError`` when parsing ``pathlib.Path`` as html path since it was not converted to string (:issue:`37705`) Plotting ^^^^^^^^ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 292d2c4c98397..b5bf1545d4acc 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1236,6 +1236,11 @@ def run(self): assert None is helper_thread1.err is helper_thread2.err def test_parse_Path_object(self): + """ + read_html should be able to cope with Path + + GH 37705 + """ file_path_string = r"pandas/tests/io/data/html/spam.html" file_path = Path(file_path_string) df1 = self.read_html(file_path_string)[0] From 72261b9b74c63020d06f002b51996eb41ef6f444 Mon Sep 17 00:00:00 2001 From: "Sixuan (Cherie) Wu" <73203695+inspurwusixuan@users.noreply.github.com> Date: Tue, 10 Nov 2020 15:11:02 -0800 Subject: [PATCH 6/9] Update doc/source/whatsnew/v1.2.0.rst Co-authored-by: William Ayd --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 012000fd717fb..5ffdd959eefbd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -507,7 +507,7 @@ I/O - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) -- Bug in :func:`read_html` caused a ``TypeError`` when parsing ``pathlib.Path`` as html path since it was not converted to string (:issue:`37705`) +- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) Plotting ^^^^^^^^ From 88bc2ee492633ab6a94be78384c5ac7524043322 Mon Sep 17 00:00:00 2001 From: "Sixuan (Cherie) Wu" <73203695+inspurwusixuan@users.noreply.github.com> Date: Tue, 10 Nov 2020 15:11:33 -0800 Subject: [PATCH 7/9] Update pandas/tests/io/test_html.py Co-authored-by: William Ayd --- pandas/tests/io/test_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b5bf1545d4acc..921edc731e61e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1235,7 +1235,7 @@ def run(self): pass assert None is helper_thread1.err is helper_thread2.err - def test_parse_Path_object(self): + def test_parse_path_object(self): """ read_html should be able to cope with Path From 2fbea2a2cd980dad7688575748d5b5b413e7a6c2 Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 15:23:45 -0800 Subject: [PATCH 8/9] Fix comments closes #37705 --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/io/test_html.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 012000fd717fb..afa297397aa7c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -507,7 +507,7 @@ I/O - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) -- Bug in :func:`read_html` caused a ``TypeError`` when parsing ``pathlib.Path`` as html path since it was not converted to string (:issue:`37705`) +- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) Plotting ^^^^^^^^ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b5bf1545d4acc..eb704ccf1e594 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1235,13 +1235,9 @@ def run(self): pass assert None is helper_thread1.err is helper_thread2.err - def test_parse_Path_object(self): - """ - read_html should be able to cope with Path - - GH 37705 - """ - file_path_string = r"pandas/tests/io/data/html/spam.html" + def test_parse_path_object(self, datapath): + # GH 37705 + file_path_string = datapath("io", "data", "html", "spam.html") file_path = Path(file_path_string) df1 = self.read_html(file_path_string)[0] df2 = self.read_html(file_path)[0] From 51ab6a2907f020789e1e29e2d29d74bafb2855e9 Mon Sep 17 00:00:00 2001 From: inspurwusixuan Date: Tue, 10 Nov 2020 15:33:59 -0800 Subject: [PATCH 9/9] Fix merge issue closes #37705 --- doc/source/whatsnew/v1.2.0.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cdc14ba19a8a8..5ffdd959eefbd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -507,11 +507,7 @@ I/O - Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) -<<<<<<< HEAD -- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) -======= - Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) ->>>>>>> 88bc2ee492633ab6a94be78384c5ac7524043322 Plotting ^^^^^^^^