-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Adding additional keywords to read_html for #13461 #13575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
afc7b2e
b7fa8a7
cfe786c
3f828e2
8a49bad
ef371d0
7e6b5fe
dac660a
2abb473
5cb8243
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -207,6 +207,12 @@ Other enhancements | |
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) | ||
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) | ||
|
||
- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`) | ||
- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`) | ||
- The ``pd.read_html()`` has gained support for the ``keep_default_na`` option (:issue:`13461`) | ||
- The ``pd.read_html()`` has gained support for the ``squeeze`` option (:issue:`13461`) | ||
- The ``pd.read_html()`` has gained support for the ``date_parser`` option (:issue:`13461`) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need an update in the docs themselves? E.g. an example. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done: added to io.rst |
||
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) | ||
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -611,10 +611,10 @@ def _expand_elements(body): | |
body[ind] += empty * (lens_max - length) | ||
|
||
|
||
def _data_to_frame(data, header, index_col, skiprows, | ||
parse_dates, tupleize_cols, thousands, | ||
decimal): | ||
head, body, foot = data | ||
def _data_to_frame(**kwargs): | ||
head, body, foot = kwargs.pop('data') | ||
header = kwargs.pop('header') | ||
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) | ||
|
||
if head: | ||
body = [head] + body | ||
|
@@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows, | |
# fill out elements of body that are "ragged" | ||
_expand_elements(body) | ||
|
||
tp = TextParser(body, header=header, index_col=index_col, | ||
skiprows=_get_skiprows(skiprows), | ||
parse_dates=parse_dates, tupleize_cols=tupleize_cols, | ||
thousands=thousands, decimal=decimal) | ||
tp = TextParser(body, header=header, **kwargs) | ||
df = tp.read() | ||
return df | ||
|
||
|
@@ -716,9 +713,9 @@ def _validate_flavor(flavor): | |
return flavor | ||
|
||
|
||
def _parse(flavor, io, match, header, index_col, skiprows, | ||
parse_dates, tupleize_cols, thousands, attrs, encoding, | ||
decimal): | ||
def _parse(flavor, io, match, | ||
attrs, encoding, | ||
**kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you put those on one line? (PEP8 does not require splitting them across multiple lines here.) |
||
flavor = _validate_flavor(flavor) | ||
compiled_match = re.compile(match) # you can pass a compiled regex here | ||
|
||
|
@@ -740,15 +737,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, | |
ret = [] | ||
for table in tables: | ||
try: | ||
ret.append(_data_to_frame(data=table, | ||
header=header, | ||
index_col=index_col, | ||
skiprows=skiprows, | ||
parse_dates=parse_dates, | ||
tupleize_cols=tupleize_cols, | ||
thousands=thousands, | ||
decimal=decimal | ||
)) | ||
ret.append(_data_to_frame(data=table, **kwargs)) | ||
except EmptyDataError: # empty table | ||
continue | ||
return ret | ||
|
@@ -757,7 +746,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, | |
def read_html(io, match='.+', flavor=None, header=None, index_col=None, | ||
skiprows=None, attrs=None, parse_dates=False, | ||
tupleize_cols=False, thousands=',', encoding=None, | ||
decimal='.'): | ||
decimal='.', converters=None, na_values=None, | ||
keep_default_na=True, squeeze=False, date_parser=None): | ||
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. | ||
|
||
Parameters | ||
|
@@ -839,6 +829,34 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
|
||
.. versionadded:: 0.19.0 | ||
|
||
converters : dict, default None | ||
Dict of functions for converting values in certain columns. Keys can | ||
either be integers or column labels, values are functions that take one | ||
input argument, the cell (not column) content, and return the | ||
transformed content. | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
na_values : iterable, default None | ||
Custom NA values | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
keep_default_na : bool, default True | ||
If na_values are specified and keep_default_na is False the default NaN | ||
values are overridden, otherwise they're appended to | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
squeeze : boolean, default False | ||
If the parsed data only contains one column then return a Series | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
date_parser : function, default None | ||
|
||
.. versionadded:: 0.19.0 | ||
|
||
Returns | ||
------- | ||
dfs : list of DataFrames | ||
|
@@ -881,6 +899,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
raise ValueError('cannot skip rows starting from the end of the ' | ||
'data (you passed a negative value)') | ||
_validate_header_arg(header) | ||
return _parse(flavor, io, match, header, index_col, skiprows, | ||
parse_dates, tupleize_cols, thousands, attrs, encoding, | ||
decimal) | ||
return _parse(flavor=flavor, io=io, match=match, header=header, | ||
index_col=index_col, skiprows=skiprows, | ||
parse_dates=parse_dates, tupleize_cols=tupleize_cols, | ||
thousands=thousands, attrs=attrs, encoding=encoding, | ||
decimal=decimal, converters=converters, na_values=na_values, | ||
keep_default_na=keep_default_na, squeeze=squeeze, | ||
date_parser=date_parser) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you combine this in one entry?