8
8
import numbers
9
9
import collections
10
10
import warnings
11
+ import itertools
11
12
12
- from itertools import repeat
13
13
from distutils .version import LooseVersion
14
14
15
15
import numpy as np
48
48
#############
49
49
# READ HTML #
50
50
#############
51
- _RE_WHITESPACE = re .compile (r'( [\r\n]+|\s{2,}) ' )
51
+ _RE_WHITESPACE = re .compile (r'[\r\n]+|\s{2,}' )
52
52
53
53
54
54
def _remove_whitespace (s , regex = _RE_WHITESPACE ):
@@ -100,8 +100,8 @@ def _get_skiprows(skiprows):
100
100
elif skiprows is None :
101
101
return 0
102
102
else :
103
- raise TypeError ('{0} is not a valid type for skipping'
104
- ' rows' .format (type (skiprows )))
103
+ raise TypeError ('{0!r } is not a valid type for skipping'
104
+ ' rows' .format (type (skiprows ). __name__ ))
105
105
106
106
107
107
def _read (io ):
@@ -127,7 +127,7 @@ def _read(io):
127
127
raw_text = io
128
128
else :
129
129
raise TypeError ("Cannot read object of type "
130
- "'{0.__class__.__name__ !r}' " .format (io ))
130
+ "{0 !r}" .format (type ( io ). __name__ ))
131
131
return raw_text
132
132
133
133
@@ -587,30 +587,38 @@ def _parse_raw_tfoot(self, table):
587
587
588
588
589
589
def _nan_list (n ):
590
- return list (repeat (np .nan , n ))
590
+ return list (itertools . repeat (np .nan , n ))
591
591
592
592
593
593
def _expand_elements (body ):
594
594
lens = Series (lmap (len , body ))
595
595
lens_max = lens .max ()
596
596
not_max = lens [lens != lens_max ]
597
597
598
- for ind , length in not_max .iteritems ():
598
+ for ind , length in compat .iteritems (not_max ):
599
599
body [ind ] += _nan_list (lens_max - length )
600
600
601
601
602
602
def _data_to_frame (data , header , index_col , skiprows , infer_types ,
603
- parse_dates ):
603
+ parse_dates , tupleize_cols , thousands ):
604
604
head , body , _ = data # _ is footer which is rarely used: ignore for now
605
+
606
+ if head :
607
+ body = [head ] + body
608
+
609
+ if header is None : # special case when a table has <th> elements
610
+ header = 0
611
+
612
+ # fill out elements of body that are "ragged"
605
613
_expand_elements (body )
606
- body = [head ] + body
607
- import ipdb ; ipdb .set_trace ()
614
+
608
615
tp = TextParser (body , header = header , index_col = index_col ,
609
616
skiprows = _get_skiprows (skiprows ),
610
- parse_dates = parse_dates , tupleize_cols = False )
617
+ parse_dates = parse_dates , tupleize_cols = tupleize_cols ,
618
+ thousands = thousands )
611
619
df = tp .read ()
612
620
613
- if infer_types : # remove in 0.14
621
+ if infer_types : # TODO: remove in 0.14
614
622
df = df .convert_objects (convert_dates = 'coerce' )
615
623
else :
616
624
df = df .applymap (compat .text_type )
@@ -687,7 +695,7 @@ def _validate_parser_flavor(flavor):
687
695
688
696
689
697
def _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
690
- parse_dates , attrs ):
698
+ parse_dates , tupleize_cols , thousands , attrs ):
691
699
# bonus: re.compile is idempotent under function iteration so you can pass
692
700
# a compiled regex to it and it will return itself
693
701
flavor = _validate_parser_flavor (flavor )
@@ -709,65 +717,65 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
709
717
raise retained
710
718
711
719
return [_data_to_frame (table , header , index_col , skiprows , infer_types ,
712
- parse_dates ) for table in tables ]
720
+ parse_dates , tupleize_cols , thousands )
721
+ for table in tables ]
713
722
714
723
715
- def read_html (io , match = '.+' , flavor = None , header = 0 , index_col = None ,
716
- skiprows = None , infer_types = None , attrs = None , parse_dates = False ):
717
- r"""Read an HTML table into a DataFrame.
724
+ def read_html (io , match = '.+' , flavor = None , header = None , index_col = None ,
725
+ skiprows = None , infer_types = None , attrs = None , parse_dates = False ,
726
+ tupleize_cols = False , thousands = ',' ):
727
+ r"""Read HTML tables into a ``list`` of DataFrames.
718
728
719
729
Parameters
720
730
----------
721
731
io : str or file-like
722
- A string or file like object that can be either a url, a file-like
723
- object, or a raw string containing HTML. Note that lxml only accepts
724
- the http, ftp and file url protocols. If you have a URI that starts
725
- with ``'https'`` you might removing the ``'s'``.
732
+ A URL, a file-like object, or a raw string containing HTML. Note that
733
+ lxml only accepts the http, ftp and file url protocols. If you have a
734
+ URL that starts with ``'https'`` you might try removing the ``'s'``.
726
735
727
- match : str or regex, optional, default '.+'
736
+ match : str or compiled regular expression, optional
728
737
The set of tables containing text matching this regex or string will be
729
738
returned. Unless the HTML is extremely simple you will probably need to
730
739
pass a non-empty string here. Defaults to '.+' (match any non-empty
731
740
string). The default value will return all tables contained on a page.
732
741
This value is converted to a regular expression so that there is
733
742
consistent behavior between Beautiful Soup and lxml.
734
743
735
- flavor : str, container of strings, default ``None``
736
- The parsing engine to use under the hood . 'bs4' and 'html5lib' are
737
- synonymous with each other, they are both there for backwards
738
- compatibility. The default of ``None`` tries to use ``lxml`` to parse
739
- and if that fails it falls back on ``bs4`` + ``html5lib``.
744
+ flavor : str or None , container of strings
745
+ The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
746
+ each other, they are both there for backwards compatibility. The
747
+ default of ``None`` tries to use ``lxml`` to parse and if that fails it
748
+ falls back on ``bs4`` + ``html5lib``.
740
749
741
- header : int or array -like, optional, default ``0``
742
- The row (or rows for a MultiIndex) to use to make the columns headers.
743
- Note that this row will be removed from the data .
750
+ header : int or list -like or None, optional
751
+ The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
752
+ make the column headers.
744
753
745
- index_col : int or array-like or None, optional, default ``None``
746
- The column to use to make the index. Note that this column will be
747
- removed from the data.
754
+ index_col : int or list-like or None, optional
755
+ The column (or list of columns) to use to create the index.
748
756
749
- skiprows : int or collections.Container or slice or None, optional, default ``None``
757
+ skiprows : int or list-like or slice or None, optional
750
758
If an integer is given then skip this many rows after parsing the
751
759
column header. If a sequence of integers is given skip those specific
752
760
rows (0-based). Note that
753
761
754
762
.. code-block:: python
755
763
756
- skiprows == 0
764
+ pandas.read_html(..., skiprows=0)
757
765
758
766
yields the same result as
759
767
760
768
.. code-block:: python
761
769
762
- skiprows is None
770
+ pandas.read_html(..., skiprows= None)
763
771
764
772
If `skiprows` is a positive integer, say :math:`n`, then
765
773
it is treated as "skip :math:`n` rows", *not* as "skip the
766
774
:math:`n^\textrm{th}` row".
767
775
768
- infer_types : bool or None , optional, default ``None`` , deprecated since 0.13, removed in 0.14
776
+ infer_types : bool, optional, deprecated since 0.13, removed in 0.14
769
777
770
- attrs : dict or None, optional, default ``None``
778
+ attrs : dict or None, optional
771
779
This is a dictionary of attributes that you can pass to use to identify
772
780
the table in the HTML. These are not checked for validity before being
773
781
passed to lxml or Beautiful Soup. However, these attributes must be
@@ -793,51 +801,65 @@ def read_html(io, match='.+', flavor=None, header=0, index_col=None,
793
801
<http://www.w3.org/TR/html-markup/table.html>`__. It contains the
794
802
latest information on table attributes for the modern web.
795
803
804
+ parse_dates : bool, optional
805
+ See :func:`~pandas.read_csv` for details.
806
+
807
+ tupleize_cols : bool, optional
808
+ If ``False`` try to parse multiple header rows into a
809
+ :class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
810
+ details. Defaults to ``False`` for backwards compatibility. This is in
811
+ contrast to other IO functions which default to ``True``.
812
+
813
+ thousands : str, optional
814
+ Separator to use to parse thousands. Defaults to ``','``. Note that
815
+ this is different from :func:`~pandas.read_csv` because
816
+ :func:`~pandas.read_csv` must be able to parse different separators,
817
+ and the default separator is ``','``. :func:`~pandas.read_html` does
818
+ not need to do this, so it defaults to ``','``.
819
+
796
820
Returns
797
821
-------
798
822
dfs : list of DataFrames
799
- A list of DataFrames, each of which is the parsed data from each of the
800
- tables on the page.
801
823
802
824
Notes
803
825
-----
804
- Before using this function you should probably read the :ref:`gotchas about
805
- the parser libraries that this function uses <html-gotchas>`.
806
-
807
- There's as little cleaning of the data as possible due to the heterogeneity
808
- and general disorder of HTML on the web.
826
+ Before using this function you should read the :ref:`gotchas about the
827
+ HTML parsing libraries <html-gotchas>`.
809
828
810
- Expect some cleanup after you call this function. For example,
811
- you might need to pass `infer_types=False` and perform manual conversion if
812
- the column names are converted to NaN when you pass the `header=0`
813
- argument. We try to assume as little as possible about the structure of the
814
- table and push the idiosyncrasies of the HTML contained in the table to
815
- you, the user.
829
+ Expect to do some cleanup after you call this function. For example, you
830
+ might need to manually assign column names if the column names are
831
+ converted to NaN when you pass the `header=0` argument. We try to assume as
832
+ little as possible about the structure of the table and push the
833
+ idiosyncrasies of the HTML contained in the table to the user.
816
834
817
- This function only searches for <table> elements and only for <tr> and <th>
818
- rows and <td> elements within those rows. This could be extended by
819
- subclassing one of the parser classes contained in :mod:`pandas.io.html` .
835
+ This function searches for ``<table>`` elements and only for ``<tr>``
836
+ and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
837
+ element in the table. ``<td>`` stands for "table data".
820
838
821
- Similar to :func:`read_csv` the `header` argument is applied **after**
822
- `skiprows` is applied.
839
+ Similar to :func:`~pandas.read_csv` the `header` argument is applied
840
+ **after** `skiprows` is applied.
823
841
824
842
This function will *always* return a list of :class:`DataFrame` *or*
825
843
it will fail, i.e., it will *not* return an empty list.
826
844
827
845
Examples
828
846
--------
829
847
See the :ref:`read_html documentation in the IO section of the docs
830
- <io.read_html>` for many examples of reading HTML.
848
+ <io.read_html>` for some examples of reading in HTML tables.
849
+
850
+ See Also
851
+ --------
852
+ pandas.read_csv
831
853
"""
832
- # Type check here. We don't want to parse only to fail because of an
833
- # invalid value of an integer skiprows.
834
854
if infer_types is not None :
835
- warnings .warn ("infer_types will be removed in 0.14" , UserWarning )
855
+ warnings .warn ("infer_types will be removed in 0.14" )
836
856
else :
837
- infer_types = True # remove in 0.14
857
+ infer_types = True # TODO: remove in 0.14
838
858
859
+ # Type check here. We don't want to parse only to fail because of an
860
+ # invalid value of an integer skiprows.
839
861
if isinstance (skiprows , numbers .Integral ) and skiprows < 0 :
840
862
raise AssertionError ('cannot skip rows starting from the end of the '
841
863
'data (you passed a negative value)' )
842
864
return _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
843
- parse_dates , attrs )
865
+ parse_dates , tupleize_cols , thousands , attrs )
0 commit comments