From 48120327633a70d88a593ecf7fbb321066b6f2ce Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 15 Dec 2022 21:33:11 +0100 Subject: [PATCH 1/7] ENH: Add use_nullable_dtypes for read_html --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/_libs/parsers.pyx | 1 + pandas/io/html.py | 9 +++++ pandas/tests/io/test_html.py | 66 ++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5c635f2d9d3be..58306638ed396 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -37,6 +37,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_csv` * :func:`read_excel` +* :func:`read_html` * :func:`read_sql` Additionally a new global configuration, ``mode.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions @@ -44,6 +45,7 @@ to select the nullable dtypes implementation. * :func:`read_csv` (with ``engine="pyarrow"``) * :func:`read_excel` +* :func:`read_html` * :func:`read_parquet` * :func:`read_orc` diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1941cfde4acb9..6150c2b262fc8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1384,6 +1384,7 @@ STR_NA_VALUES = { "nan", "-nan", "", + "None", } _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) diff --git a/pandas/io/html.py b/pandas/io/html.py index 4f6e43a1639a5..a2635dbe879c6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1043,6 +1043,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, + use_nullable_dtypes: bool = False, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1143,6 +1144,13 @@ def read_html( .. 
versionadded:: 1.5.0 + use_nullable_dtypes : bool = False + Whether to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 + Returns ------- dfs @@ -1218,4 +1226,5 @@ def read_html( keep_default_na=keep_default_na, displayed_only=displayed_only, extract_links=extract_links, + use_nullable_dtypes=use_nullable_dtypes, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 4bf79733b1957..a090cda9fa6ba 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -17,7 +17,9 @@ from pandas.compat import is_platform_windows import pandas.util._test_decorators as td +import pandas as pd from pandas import ( + NA, DataFrame, MultiIndex, Series, @@ -27,6 +29,10 @@ to_datetime, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) from pandas.io.common import file_path_to_url import pandas.io.html @@ -132,6 +138,66 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) + @pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"]) + @pytest.mark.parametrize("storage", ["python", "pyarrow"]) + def test_use_nullable_dtypes(self, storage, nullable_backend): + # GH# + df = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) + + string_array: StringArray | ArrowStringArray + string_array_na: StringArray | ArrowStringArray + if storage == "python": + string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) + string_array_na = StringArray(np.array(["a", "b", NA], 
dtype=np.object_)) + + else: + pa = pytest.importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + + out = df.to_html(index=False) + with pd.option_context("mode.string_storage", storage): + with pd.option_context("mode.nullable_backend", nullable_backend): + result = self.read_html(out, use_nullable_dtypes=True)[0] + + expected = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": Series([True, False, NA], dtype="boolean"), + "f": Series([True, False, True], dtype="boolean"), + "g": string_array, + "h": string_array_na, + } + ) + + if nullable_backend == "pyarrow": + import pyarrow as pa + + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.network @tm.network( url=( From 431e6e7e31b0a287b7da50d1614f345f9e42a29b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 15 Dec 2022 21:35:04 +0100 Subject: [PATCH 2/7] Add gh ref --- pandas/tests/io/test_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a090cda9fa6ba..7aaf9a0dc0df6 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -141,7 +141,7 @@ def test_to_html_compat(self): @pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"]) @pytest.mark.parametrize("storage", ["python", "pyarrow"]) def test_use_nullable_dtypes(self, storage, nullable_backend): - # GH# + # GH#50286 df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), From e1c43288300d0060780f2846807e8e5a7ff60092 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 15 Dec 2022 
23:31:22 +0100 Subject: [PATCH 3/7] Fix test --- pandas/tests/io/parser/test_na_values.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9fb096bfeb346..0ca4884153b59 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -110,6 +110,7 @@ def test_default_na_values(all_parsers): "-nan", "#N/A N/A", "", + "None", } assert _NA_VALUES == STR_NA_VALUES From a6df2c8700972dacd7550e70a76d12d7c90a8615 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Dec 2022 00:17:29 +0100 Subject: [PATCH 4/7] Fix test --- pandas/tests/io/test_html.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 7aaf9a0dc0df6..83cc7fc9b30d9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -155,8 +155,6 @@ def test_use_nullable_dtypes(self, storage, nullable_backend): } ) - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray if storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) From 3156f9b2bf48e3f6a856e80ec361c5e122914025 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Dec 2022 16:40:41 +0100 Subject: [PATCH 5/7] Add whatsnew --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v2.0.0.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 67441554c16fe..263a58bdf6726 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1146,7 +1146,7 @@ To completely override the default values that are recognized as missing, specif .. 
_io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None', '']``. Let us consider some examples: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 58306638ed396..ad7f80aff580c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -478,6 +478,7 @@ Other API changes - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`) - Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to
datetimelike (:issue:`49842`) - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`) From c7fb7dc5f104a69059ab3c71ef1bb82325b9f3bc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 23 Dec 2022 18:48:13 +0100 Subject: [PATCH 6/7] Address review --- pandas/tests/io/test_html.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 83cc7fc9b30d9..4d0f99b93ac5f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -138,9 +138,9 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) - @pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"]) + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) @pytest.mark.parametrize("storage", ["python", "pyarrow"]) - def test_use_nullable_dtypes(self, storage, nullable_backend): + def test_use_nullable_dtypes(self, storage, dtype_backend): # GH#50286 df = DataFrame( { @@ -166,7 +166,7 @@ def test_use_nullable_dtypes(self, storage, nullable_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", storage): - with pd.option_context("mode.nullable_backend", nullable_backend): + with pd.option_context("mode.dtype_backend", dtype_backend): result = self.read_html(out, use_nullable_dtypes=True)[0] expected = DataFrame( @@ -182,7 +182,7 @@ def test_use_nullable_dtypes(self, storage, nullable_backend): } ) - if nullable_backend == "pyarrow": + if dtype_backend == "pyarrow": import pyarrow as pa from pandas.arrays import ArrowExtensionArray From abd64ad3f47bd3bd35d8beceef2fa31ed4edf527 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 23 Dec 2022 18:51:32 +0100 Subject: [PATCH 7/7] Add backend --- pandas/io/html.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index a2635dbe879c6..7dcbd76b77b28 100644 
--- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1149,6 +1149,12 @@ def read_html( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 Returns