7
7
import re
8
8
import numbers
9
9
import collections
10
+ import warnings
10
11
12
+ from itertools import repeat
11
13
from distutils .version import LooseVersion
12
14
13
15
import numpy as np
14
16
15
- from pandas import DataFrame , MultiIndex , isnull
16
17
from pandas .io .common import _is_url , urlopen , parse_url
17
- from pandas .compat import range , lrange , lmap , u , map
18
- from pandas import compat
18
+ from pandas .io .parsers import TextParser
19
+ from pandas .compat import lrange , lmap , u
20
+ from pandas .core import common as com
21
+ from pandas import compat , Series
19
22
20
23
21
24
try :
@@ -67,7 +70,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE):
67
70
return regex .sub (' ' , s .strip ())
68
71
69
72
70
- def _get_skiprows_iter (skiprows ):
73
+ def _get_skiprows (skiprows ):
71
74
"""Get an iterator given an integer, slice or container.
72
75
73
76
Parameters
@@ -92,10 +95,10 @@ def _get_skiprows_iter(skiprows):
92
95
"""
93
96
if isinstance (skiprows , slice ):
94
97
return lrange (skiprows .start or 0 , skiprows .stop , skiprows .step or 1 )
95
- elif isinstance (skiprows , numbers .Integral ):
96
- return lrange (skiprows )
97
- elif isinstance (skiprows , collections .Container ):
98
+ elif isinstance (skiprows , numbers .Integral ) or com .is_list_like (skiprows ):
98
99
return skiprows
100
+ elif skiprows is None :
101
+ return 0
99
102
else :
100
103
raise TypeError ('{0} is not a valid type for skipping'
101
104
' rows' .format (type (skiprows )))
@@ -583,101 +586,34 @@ def _parse_raw_tfoot(self, table):
583
586
table .xpath (expr )]
584
587
585
588
586
- def _data_to_frame ( data , header , index_col , infer_types , skiprows ):
587
- """Parse a BeautifulSoup table into a DataFrame.
589
+ def _nan_list ( n ):
590
+ return list ( repeat ( np . nan , n ))
588
591
589
- Parameters
590
- ----------
591
- data : tuple of lists
592
- The raw data to be placed into a DataFrame. This is a list of lists of
593
- strings or unicode. If it helps, it can be thought of as a matrix of
594
- strings instead.
595
-
596
- header : int or None
597
- An integer indicating the row to use for the column header or None
598
- indicating no header will be used.
599
-
600
- index_col : int or None
601
- An integer indicating the column to use for the index or None
602
- indicating no column will be used.
603
-
604
- infer_types : bool
605
- Whether to convert numbers and dates.
606
-
607
- skiprows : collections.Container or int or slice
608
- Iterable used to skip rows.
609
-
610
- Returns
611
- -------
612
- df : DataFrame
613
- A DataFrame containing the data from `data`
614
-
615
- Raises
616
- ------
617
- ValueError
618
- * If `skiprows` is not found in the rows of the parsed DataFrame.
619
592
620
- Raises
621
- ------
622
- ValueError
623
- * If `skiprows` is not found in the rows of the parsed DataFrame.
593
def _expand_elements(body):
    """Pad the ragged rows of ``body`` with NaN so all rows share one length.

    The rows are extended in place; nothing is returned.

    Parameters
    ----------
    body : list of lists
        Rows of cell values. Every row shorter than the longest row gets
        trailing ``np.nan`` entries appended until it matches that length.
    """
    if not body:
        # Nothing to pad; also avoids calling max() on an empty sequence.
        return

    # Plain Python is enough here: no need to build a Series just to find
    # the longest row and the rows that fall short of it.
    width = max(len(row) for row in body)

    for ind, row in enumerate(body):
        shortfall = width - len(row)
        if shortfall:
            body[ind] += _nan_list(shortfall)
628
600
629
- Notes
630
- -----
631
- The `data` parameter is guaranteed not to be a list of empty lists.
632
- """
633
- thead , tbody , tfoot = data
634
- columns = thead or None
635
- df = DataFrame (tbody , columns = columns )
636
601
637
- if skiprows is not None :
638
- it = _get_skiprows_iter (skiprows )
602
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates):
    """Build a DataFrame from the raw rows of one parsed HTML table.

    Parameters
    ----------
    data : sequence of length 3
        ``(head, body, foot)`` for one table. ``head`` is placed in front of
        the ``body`` rows as the first row; ``foot`` is ignored.
    header, index_col, parse_dates
        Forwarded unchanged to ``TextParser``.
    skiprows
        Normalized with ``_get_skiprows`` and then forwarded to
        ``TextParser``.
    infer_types : bool
        If true, the parsed frame is passed through
        ``convert_objects(convert_dates='coerce')``; otherwise every cell is
        converted with ``compat.text_type``. Slated for removal in 0.14.

    Returns
    -------
    df : DataFrame
        The frame read by ``TextParser``, post-processed per ``infer_types``.
    """
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    # Pads the ragged body rows in place, so the caller's lists are mutated.
    # NOTE(review): ``head`` is prepended after padding, so it is not padded
    # to the body width, and an empty ``head`` still becomes the first row --
    # confirm TextParser copes with both cases.
    _expand_elements(body)
    body = [head] + body

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=False)
    df = tp.read()

    if infer_types:  # remove in 0.14
        df = df.convert_objects(convert_dates='coerce')
    else:
        df = df.applymap(compat.text_type)
    return df
682
618
683
619
@@ -750,7 +686,8 @@ def _validate_parser_flavor(flavor):
750
686
return flavor
751
687
752
688
753
- def _parse (flavor , io , match , header , index_col , skiprows , infer_types , attrs ):
689
+ def _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
690
+ parse_dates , attrs ):
754
691
# bonus: re.compile is idempotent under function iteration so you can pass
755
692
# a compiled regex to it and it will return itself
756
693
flavor = _validate_parser_flavor (flavor )
@@ -771,12 +708,12 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
771
708
else :
772
709
raise retained
773
710
774
- return [_data_to_frame (table , header , index_col , infer_types , skiprows )
775
- for table in tables ]
711
+ return [_data_to_frame (table , header , index_col , skiprows , infer_types ,
712
+ parse_dates ) for table in tables ]
776
713
777
714
778
- def read_html (io , match = '.+' , flavor = None , header = None , index_col = None ,
779
- skiprows = None , infer_types = True , attrs = None ):
715
+ def read_html (io , match = '.+' , flavor = None , header = 0 , index_col = None ,
716
+ skiprows = None , infer_types = None , attrs = None , parse_dates = False ):
780
717
r"""Read an HTML table into a DataFrame.
781
718
782
719
Parameters
@@ -801,7 +738,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
801
738
compatibility. The default of ``None`` tries to use ``lxml`` to parse
802
739
and if that fails it falls back on ``bs4`` + ``html5lib``.
803
740
804
- header : int or array-like or None , optional, default ``None ``
741
+ header : int or array-like, optional, default ``0 ``
805
742
The row (or rows for a MultiIndex) to use to make the columns headers.
806
743
Note that this row will be removed from the data.
807
744
@@ -828,9 +765,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
828
765
it is treated as "skip :math:`n` rows", *not* as "skip the
829
766
:math:`n^\textrm{th}` row".
830
767
831
- infer_types : bool, optional, default ``True``
832
- Whether to convert numeric types and date-appearing strings to numbers
833
- and dates, respectively.
768
+ infer_types : bool or None, optional, default ``None``, deprecated since 0.13, removed in 0.14
834
769
835
770
attrs : dict or None, optional, default ``None``
836
771
This is a dictionary of attributes that you can pass to use to identify
@@ -896,8 +831,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
896
831
"""
897
832
# Type check here. We don't want to parse only to fail because of an
898
833
# invalid value of an integer skiprows.
834
+ if infer_types is not None :
835
+ warnings .warn ("infer_types will be removed in 0.14" , UserWarning )
836
+ else :
837
+ infer_types = True # remove in 0.14
838
+
899
839
if isinstance (skiprows , numbers .Integral ) and skiprows < 0 :
900
840
raise AssertionError ('cannot skip rows starting from the end of the '
901
841
'data (you passed a negative value)' )
902
842
return _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
903
- attrs )
843
+ parse_dates , attrs )
0 commit comments