Skip to content

Commit 92aa277

Browse files
committed
ENH: add tupleize_cols, thousands args
tupleize_cols=False by default for backwards compatibility; thousands=',' by default, because we're not parsing CSV
1 parent d63ac05 commit 92aa277

File tree

4 files changed

+3905
-113
lines changed

4 files changed

+3905
-113
lines changed

pandas/io/html.py

+84-62
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
import numbers
99
import collections
1010
import warnings
11+
import itertools
1112

12-
from itertools import repeat
1313
from distutils.version import LooseVersion
1414

1515
import numpy as np
@@ -48,7 +48,7 @@
4848
#############
4949
# READ HTML #
5050
#############
51-
_RE_WHITESPACE = re.compile(r'([\r\n]+|\s{2,})')
51+
_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
5252

5353

5454
def _remove_whitespace(s, regex=_RE_WHITESPACE):
@@ -100,8 +100,8 @@ def _get_skiprows(skiprows):
100100
elif skiprows is None:
101101
return 0
102102
else:
103-
raise TypeError('{0} is not a valid type for skipping'
104-
' rows'.format(type(skiprows)))
103+
raise TypeError('{0!r} is not a valid type for skipping'
104+
' rows'.format(type(skiprows).__name__))
105105

106106

107107
def _read(io):
@@ -127,7 +127,7 @@ def _read(io):
127127
raw_text = io
128128
else:
129129
raise TypeError("Cannot read object of type "
130-
"'{0.__class__.__name__!r}'".format(io))
130+
"{0!r}".format(type(io).__name__))
131131
return raw_text
132132

133133

@@ -587,30 +587,38 @@ def _parse_raw_tfoot(self, table):
587587

588588

589589
def _nan_list(n):
590-
return list(repeat(np.nan, n))
590+
return list(itertools.repeat(np.nan, n))
591591

592592

593593
def _expand_elements(body):
594594
lens = Series(lmap(len, body))
595595
lens_max = lens.max()
596596
not_max = lens[lens != lens_max]
597597

598-
for ind, length in not_max.iteritems():
598+
for ind, length in compat.iteritems(not_max):
599599
body[ind] += _nan_list(lens_max - length)
600600

601601

602602
def _data_to_frame(data, header, index_col, skiprows, infer_types,
603-
parse_dates):
603+
parse_dates, tupleize_cols, thousands):
604604
head, body, _ = data # _ is footer which is rarely used: ignore for now
605+
606+
if head:
607+
body = [head] + body
608+
609+
if header is None: # special case when a table has <th> elements
610+
header = 0
611+
612+
# fill out elements of body that are "ragged"
605613
_expand_elements(body)
606-
body = [head] + body
607-
import ipdb; ipdb.set_trace()
614+
608615
tp = TextParser(body, header=header, index_col=index_col,
609616
skiprows=_get_skiprows(skiprows),
610-
parse_dates=parse_dates, tupleize_cols=False)
617+
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
618+
thousands=thousands)
611619
df = tp.read()
612620

613-
if infer_types: # remove in 0.14
621+
if infer_types: # TODO: remove in 0.14
614622
df = df.convert_objects(convert_dates='coerce')
615623
else:
616624
df = df.applymap(compat.text_type)
@@ -687,7 +695,7 @@ def _validate_parser_flavor(flavor):
687695

688696

689697
def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
690-
parse_dates, attrs):
698+
parse_dates, tupleize_cols, thousands, attrs):
691699
# bonus: re.compile is idempotent under function iteration so you can pass
692700
# a compiled regex to it and it will return itself
693701
flavor = _validate_parser_flavor(flavor)
@@ -709,65 +717,65 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
709717
raise retained
710718

711719
return [_data_to_frame(table, header, index_col, skiprows, infer_types,
712-
parse_dates) for table in tables]
720+
parse_dates, tupleize_cols, thousands)
721+
for table in tables]
713722

714723

715-
def read_html(io, match='.+', flavor=None, header=0, index_col=None,
716-
skiprows=None, infer_types=None, attrs=None, parse_dates=False):
717-
r"""Read an HTML table into a DataFrame.
724+
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
725+
skiprows=None, infer_types=None, attrs=None, parse_dates=False,
726+
tupleize_cols=False, thousands=','):
727+
r"""Read HTML tables into a ``list`` of DataFrames.
718728
719729
Parameters
720730
----------
721731
io : str or file-like
722-
A string or file like object that can be either a url, a file-like
723-
object, or a raw string containing HTML. Note that lxml only accepts
724-
the http, ftp and file url protocols. If you have a URI that starts
725-
with ``'https'`` you might try removing the ``'s'``.
732+
A URL, a file-like object, or a raw string containing HTML. Note that
733+
lxml only accepts the http, ftp and file url protocols. If you have a
734+
URL that starts with ``'https'`` you might try removing the ``'s'``.
726735
727-
match : str or regex, optional, default '.+'
736+
match : str or compiled regular expression, optional
728737
The set of tables containing text matching this regex or string will be
729738
returned. Unless the HTML is extremely simple you will probably need to
730739
pass a non-empty string here. Defaults to '.+' (match any non-empty
731740
string). The default value will return all tables contained on a page.
732741
This value is converted to a regular expression so that there is
733742
consistent behavior between Beautiful Soup and lxml.
734743
735-
flavor : str, container of strings, default ``None``
736-
The parsing engine to use under the hood. 'bs4' and 'html5lib' are
737-
synonymous with each other, they are both there for backwards
738-
compatibility. The default of ``None`` tries to use ``lxml`` to parse
739-
and if that fails it falls back on ``bs4`` + ``html5lib``.
744+
flavor : str or None, container of strings
745+
The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
746+
each other, they are both there for backwards compatibility. The
747+
default of ``None`` tries to use ``lxml`` to parse and if that fails it
748+
falls back on ``bs4`` + ``html5lib``.
740749
741-
header : int or array-like, optional, default ``0``
742-
The row (or rows for a MultiIndex) to use to make the columns headers.
743-
Note that this row will be removed from the data.
750+
header : int or list-like or None, optional
751+
The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
752+
make the columns headers.
744753
745-
index_col : int or array-like or None, optional, default ``None``
746-
The column to use to make the index. Note that this column will be
747-
removed from the data.
754+
index_col : int or list-like or None, optional
755+
The column (or list of columns) to use to create the index.
748756
749-
skiprows : int or collections.Container or slice or None, optional, default ``None``
757+
skiprows : int or list-like or slice or None, optional
750758
If an integer is given then skip this many rows after parsing the
751759
column header. If a sequence of integers is given skip those specific
752760
rows (0-based). Note that
753761
754762
.. code-block:: python
755763
756-
skiprows == 0
764+
pandas.read_html(..., skiprows=0)
757765
758766
yields the same result as
759767
760768
.. code-block:: python
761769
762-
skiprows is None
770+
pandas.read_html(..., skiprows=None)
763771
764772
If `skiprows` is a positive integer, say :math:`n`, then
765773
it is treated as "skip :math:`n` rows", *not* as "skip the
766774
:math:`n^\textrm{th}` row".
767775
768-
infer_types : bool or None, optional, default ``None``, deprecated since 0.13, removed in 0.14
776+
infer_types : bool, optional, deprecated since 0.13, removed in 0.14
769777
770-
attrs : dict or None, optional, default ``None``
778+
attrs : dict or None, optional
771779
This is a dictionary of attributes that you can pass to use to identify
772780
the table in the HTML. These are not checked for validity before being
773781
passed to lxml or Beautiful Soup. However, these attributes must be
@@ -793,51 +801,65 @@ def read_html(io, match='.+', flavor=None, header=0, index_col=None,
793801
<http://www.w3.org/TR/html-markup/table.html>`__. It contains the
794802
latest information on table attributes for the modern web.
795803
804+
parse_dates : bool, optional
805+
See :func:`~pandas.read_csv` for details.
806+
807+
tupleize_cols : bool, optional
808+
If ``False`` try to parse multiple header rows into a
809+
:class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
810+
details. Defaults to ``False`` for backwards compatibility. This is in
811+
contrast to other IO functions which default to ``True``.
812+
813+
thousands : str, optional
814+
Separator to use to parse thousands. Defaults to ``','``. Note that
815+
this is different from :func:`~pandas.read_csv` because
816+
:func:`~pandas.read_csv` must be able to parse different separators,
817+
and the default separator is ``','``. :func:`~pandas.read_html` does
818+
not need to do this, so it defaults to ``','``.
819+
796820
Returns
797821
-------
798822
dfs : list of DataFrames
799-
A list of DataFrames, each of which is the parsed data from each of the
800-
tables on the page.
801823
802824
Notes
803825
-----
804-
Before using this function you should probably read the :ref:`gotchas about
805-
the parser libraries that this function uses <html-gotchas>`.
806-
807-
There's as little cleaning of the data as possible due to the heterogeneity
808-
and general disorder of HTML on the web.
826+
Before using this function you should read the :ref:`gotchas about the
827+
HTML parsing libraries <html-gotchas>`.
809828
810-
Expect some cleanup after you call this function. For example,
811-
you might need to pass `infer_types=False` and perform manual conversion if
812-
the column names are converted to NaN when you pass the `header=0`
813-
argument. We try to assume as little as possible about the structure of the
814-
table and push the idiosyncrasies of the HTML contained in the table to
815-
you, the user.
829+
Expect to do some cleanup after you call this function. For example, you
830+
might need to manually assign column names if the column names are
831+
converted to NaN when you pass the `header=0` argument. We try to assume as
832+
little as possible about the structure of the table and push the
833+
idiosyncrasies of the HTML contained in the table to the user.
816834
817-
This function only searches for <table> elements and only for <tr> and <th>
818-
rows and <td> elements within those rows. This could be extended by
819-
subclassing one of the parser classes contained in :mod:`pandas.io.html`.
835+
This function searches for ``<table>`` elements and only for ``<tr>``
836+
and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
837+
element in the table. ``<td>`` stands for "table data".
820838
821-
Similar to :func:`read_csv` the `header` argument is applied **after**
822-
`skiprows` is applied.
839+
Similar to :func:`~pandas.read_csv` the `header` argument is applied
840+
**after** `skiprows` is applied.
823841
824842
This function will *always* return a list of :class:`DataFrame` *or*
825843
it will fail, e.g., it will *not* return an empty list.
826844
827845
Examples
828846
--------
829847
See the :ref:`read_html documentation in the IO section of the docs
830-
<io.read_html>` for many examples of reading HTML.
848+
<io.read_html>` for some examples of reading in HTML tables.
849+
850+
See Also
851+
--------
852+
pandas.read_csv
831853
"""
832-
# Type check here. We don't want to parse only to fail because of an
833-
# invalid value of an integer skiprows.
834854
if infer_types is not None:
835-
warnings.warn("infer_types will be removed in 0.14", UserWarning)
855+
warnings.warn("infer_types will be removed in 0.14")
836856
else:
837-
infer_types = True # remove in 0.14
857+
infer_types = True # TODO: remove in 0.14
838858

859+
# Type check here. We don't want to parse only to fail because of an
860+
# invalid value of an integer skiprows.
839861
if isinstance(skiprows, numbers.Integral) and skiprows < 0:
840862
raise AssertionError('cannot skip rows starting from the end of the '
841863
'data (you passed a negative value)')
842864
return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
843-
parse_dates, attrs)
865+
parse_dates, tupleize_cols, thousands, attrs)

pandas/io/parsers.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -1468,23 +1468,22 @@ def _convert_data(self, data):
14681468
col = self.orig_names[col]
14691469
clean_conv[col] = f
14701470

1471-
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
1472-
self.verbose, clean_conv)
1471+
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
1472+
clean_conv)
14731473

14741474
def _infer_columns(self):
1475-
#import ipdb; ipdb.set_trace()
14761475
names = self.names
14771476

14781477
if self.header is not None:
14791478
header = self.header
14801479

14811480
# we have MultiIndex columns, so read an extra line
1482-
if isinstance(header, (list, tuple, np.ndarray)):
1481+
if isinstance(header,(list,tuple,np.ndarray)):
14831482
have_mi_columns = True
1484-
header = list(header) + [header[-1] + 1]
1483+
header = list(header) + [header[-1]+1]
14851484
else:
14861485
have_mi_columns = False
1487-
header = [header]
1486+
header = [ header ]
14881487

14891488
columns = []
14901489
for level, hr in enumerate(header):
@@ -1499,7 +1498,7 @@ def _infer_columns(self):
14991498

15001499
this_columns = []
15011500
for i, c in enumerate(line):
1502-
if not c:
1501+
if c == '':
15031502
if have_mi_columns:
15041503
this_columns.append('Unnamed: %d_level_%d' % (i,level))
15051504
else:

0 commit comments

Comments
 (0)