diff --git a/doc/source/io.rst b/doc/source/io.rst index da0444a8b8df9..113afa32d182e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1959,6 +1959,35 @@ Specify an HTML attribute dfs2 = read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True +Specify values that should be converted to NaN + +.. code-block:: python + + dfs = read_html(url, na_values=['No Acquirer']) + +.. versionadded:: 0.19.0 + +Specify whether to keep the default set of NaN values + +.. code-block:: python + + dfs = read_html(url, keep_default_na=False) + +.. versionadded:: 0.19.0 + +Specify converters for columns. This is useful for numerical text data that has +leading zeros. By default, columns that are numerical are cast to numeric +types and the leading zeros are lost. To avoid this, we can convert these +columns to strings. + +.. code-block:: python + + url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' + dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': + str}) + +.. versionadded:: 0.19.0 + Use some combination of the above .. 
code-block:: python diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 657de7ec26efc..351b0ba9b2906 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -207,6 +207,8 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) + - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 609642e248eda..79f0f326c4dd7 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -611,10 +611,10 @@ def _expand_elements(body): body[ind] += empty * (lens_max - length) -def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, - decimal): - head, body, foot = data +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop('data') + header = kwargs.pop('header') + kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: body = [head] + body @@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows, # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, index_col=index_col, - skiprows=_get_skiprows(skiprows), - parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands, decimal=decimal) + tp = TextParser(body, header=header, **kwargs) df = tp.read() return df @@ -716,9 +713,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, 
thousands, attrs, encoding, - decimal): +def _parse(flavor, io, match, attrs, encoding, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, - header=header, - index_col=index_col, - skiprows=skiprows, - parse_dates=parse_dates, - tupleize_cols=tupleize_cols, - thousands=thousands, - decimal=decimal - )) + ret.append(_data_to_frame(data=table, **kwargs)) except EmptyDataError: # empty table continue return ret @@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, - decimal='.'): + decimal='.', converters=None, na_values=None, + keep_default_na=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -839,6 +827,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + .. versionadded:: 0.19.0 + + na_values : iterable, default None + Custom NA values + + .. versionadded:: 0.19.0 + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to + + .. 
versionadded:: 0.19.0 + Returns ------- dfs : list of DataFrames @@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') _validate_header_arg(header) - return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal) + return _parse(flavor=flavor, io=io, match=match, header=header, + index_col=index_col, skiprows=skiprows, + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands, attrs=attrs, encoding=encoding, + decimal=decimal, converters=converters, na_values=na_values, + keep_default_na=keep_default_na) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 5a95fe7727df0..7b4e775db9476 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -694,6 +694,72 @@ def test_bool_header_arg(self): with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def test_converters(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + +
a
0.763
0.244
""" + + expected_df = DataFrame({'a': ['0.763', '0.244']}) + html_df = read_html(html_data, converters={'a': str})[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_na_values(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + +
a
0.763
0.244
""" + + expected_df = DataFrame({'a': [0.763, np.nan]}) + html_df = read_html(html_data, na_values=[0.244])[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_keep_default_na(self): + html_data = """ + + + + + + + + + + + + +
a
N/A
NA
""" + + expected_df = DataFrame({'a': ['N/A', 'NA']}) + html_df = read_html(html_data, keep_default_na=False)[0] + tm.assert_frame_equal(expected_df, html_df) + + expected_df = DataFrame({'a': [np.nan, np.nan]}) + html_df = read_html(html_data, keep_default_na=True)[0] + tm.assert_frame_equal(expected_df, html_df) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_')