Commit d63ac05

CLN/ENH: add parse_dates arg and use TextReader
1 parent ac0ce3c commit d63ac05

3 files changed: +102 -155 lines changed


pandas/io/html.py

+45 -105
@@ -7,15 +7,18 @@
 import re
 import numbers
 import collections
+import warnings

+from itertools import repeat
 from distutils.version import LooseVersion

 import numpy as np

-from pandas import DataFrame, MultiIndex, isnull
 from pandas.io.common import _is_url, urlopen, parse_url
-from pandas.compat import range, lrange, lmap, u, map
-from pandas import compat
+from pandas.io.parsers import TextParser
+from pandas.compat import lrange, lmap, u
+from pandas.core import common as com
+from pandas import compat, Series


 try:
@@ -67,7 +70,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE):
     return regex.sub(' ', s.strip())


-def _get_skiprows_iter(skiprows):
+def _get_skiprows(skiprows):
     """Get an iterator given an integer, slice or container.

     Parameters
@@ -92,10 +95,10 @@ def _get_skiprows_iter(skiprows):
     """
     if isinstance(skiprows, slice):
         return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1)
-    elif isinstance(skiprows, numbers.Integral):
-        return lrange(skiprows)
-    elif isinstance(skiprows, collections.Container):
+    elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows):
         return skiprows
+    elif skiprows is None:
+        return 0
     else:
         raise TypeError('{0} is not a valid type for skipping'
                         ' rows'.format(type(skiprows)))
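For reference, a minimal sketch of how the rewritten helper normalizes its argument before it reaches TextParser (the inputs below are illustrative, not from the commit, and the private helper is assumed to be importable):

    # Assumed import of the private helper shown in the hunk above.
    from pandas.io.html import _get_skiprows

    _get_skiprows(slice(0, 6, 2))  # slice expanded to labels -> [0, 2, 4]
    _get_skiprows(3)               # integers pass straight through -> 3
    _get_skiprows([1, 4])          # list-likes pass straight through -> [1, 4]
    _get_skiprows(None)            # None now means "skip nothing" -> 0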
@@ -583,101 +586,34 @@ def _parse_raw_tfoot(self, table):
                 table.xpath(expr)]


-def _data_to_frame(data, header, index_col, infer_types, skiprows):
-    """Parse a BeautifulSoup table into a DataFrame.
+def _nan_list(n):
+    return list(repeat(np.nan, n))

-    Parameters
-    ----------
-    data : tuple of lists
-        The raw data to be placed into a DataFrame. This is a list of lists of
-        strings or unicode. If it helps, it can be thought of as a matrix of
-        strings instead.
-
-    header : int or None
-        An integer indicating the row to use for the column header or None
-        indicating no header will be used.
-
-    index_col : int or None
-        An integer indicating the column to use for the index or None
-        indicating no column will be used.
-
-    infer_types : bool
-        Whether to convert numbers and dates.
-
-    skiprows : collections.Container or int or slice
-        Iterable used to skip rows.
-
-    Returns
-    -------
-    df : DataFrame
-        A DataFrame containing the data from `data`
-
-    Raises
-    ------
-    ValueError
-        * If `skiprows` is not found in the rows of the parsed DataFrame.

-    Raises
-    ------
-    ValueError
-        * If `skiprows` is not found in the rows of the parsed DataFrame.
+def _expand_elements(body):
+    lens = Series(lmap(len, body))
+    lens_max = lens.max()
+    not_max = lens[lens != lens_max]

-    See Also
-    --------
-    read_html
+    for ind, length in not_max.iteritems():
+        body[ind] += _nan_list(lens_max - length)

-    Notes
-    -----
-    The `data` parameter is guaranteed not to be a list of empty lists.
-    """
-    thead, tbody, tfoot = data
-    columns = thead or None
-    df = DataFrame(tbody, columns=columns)

-    if skiprows is not None:
-        it = _get_skiprows_iter(skiprows)
+def _data_to_frame(data, header, index_col, skiprows, infer_types,
+                   parse_dates):
+    head, body, _ = data # _ is footer which is rarely used: ignore for now
+    _expand_elements(body)
+    body = [head] + body
+    import ipdb; ipdb.set_trace()
+    tp = TextParser(body, header=header, index_col=index_col,
+                    skiprows=_get_skiprows(skiprows),
+                    parse_dates=parse_dates, tupleize_cols=False)
+    df = tp.read()

-    try:
-        df = df.drop(it)
-    except ValueError:
-        raise ValueError('Labels {0} not found when trying to skip'
-                         ' rows'.format(it))
-
-    # convert to numbers/dates where possible
-    # must be sequential since dates trump numbers if both args are given
-    if infer_types:
-        df = df.convert_objects(convert_numeric=True)
+    if infer_types: # remove in 0.14
         df = df.convert_objects(convert_dates='coerce')
-
-    if header is not None:
-        header_rows = df.iloc[header]
-
-        if header_rows.ndim == 2:
-            names = header_rows.index
-            df.columns = MultiIndex.from_arrays(header_rows.values,
-                                                names=names)
-        else:
-            df.columns = header_rows
-
-        df = df.drop(df.index[header])
-
-    if index_col is not None:
-        cols = df.columns[index_col]
-
-        try:
-            cols = cols.tolist()
-        except AttributeError:
-            pass
-
-        # drop by default
-        df.set_index(cols, inplace=True)
-        if df.index.nlevels == 1:
-            if isnull(df.index.name) or not df.index.name:
-                df.index.name = None
-        else:
-            names = [name or None for name in df.index.names]
-            df.index = MultiIndex.from_tuples(df.index.values, names=names)
-
+    else:
+        df = df.applymap(compat.text_type)
     return df


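An illustration (not part of the commit; the cell values are invented) of what `_expand_elements` does to ragged rows before `_data_to_frame` hands them to `TextParser`:

    # Rows scraped from an HTML table may have unequal lengths; the helper
    # pads every short row with NaN in place so the table is rectangular.
    body = [['a', 'b', 'c'],
            ['d'],
            ['e', 'f']]
    _expand_elements(body)
    # body is now:
    # [['a', 'b', 'c'],
    #  ['d', nan, nan],
    #  ['e', 'f', nan]]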
@@ -750,7 +686,8 @@ def _validate_parser_flavor(flavor):
     return flavor


-def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
+def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
+           parse_dates, attrs):
     # bonus: re.compile is idempotent under function iteration so you can pass
     # a compiled regex to it and it will return itself
     flavor = _validate_parser_flavor(flavor)
@@ -771,12 +708,12 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
     else:
         raise retained

-    return [_data_to_frame(table, header, index_col, infer_types, skiprows)
-            for table in tables]
+    return [_data_to_frame(table, header, index_col, skiprows, infer_types,
+                           parse_dates) for table in tables]


-def read_html(io, match='.+', flavor=None, header=None, index_col=None,
-              skiprows=None, infer_types=True, attrs=None):
+def read_html(io, match='.+', flavor=None, header=0, index_col=None,
+              skiprows=None, infer_types=None, attrs=None, parse_dates=False):
     r"""Read an HTML table into a DataFrame.

     Parameters
@@ -801,7 +738,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         compatibility. The default of ``None`` tries to use ``lxml`` to parse
         and if that fails it falls back on ``bs4`` + ``html5lib``.

-    header : int or array-like or None, optional, default ``None``
+    header : int or array-like, optional, default ``0``
         The row (or rows for a MultiIndex) to use to make the columns headers.
         Note that this row will be removed from the data.

@@ -828,9 +765,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         it is treated as "skip :math:`n` rows", *not* as "skip the
         :math:`n^\textrm{th}` row".

-    infer_types : bool, optional, default ``True``
-        Whether to convert numeric types and date-appearing strings to numbers
-        and dates, respectively.
+    infer_types : bool or None, optional, default ``None``, deprecated since 0.13, removed in 0.14

     attrs : dict or None, optional, default ``None``
         This is a dictionary of attributes that you can pass to use to identify
@@ -896,8 +831,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     """
     # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
+    if infer_types is not None:
+        warnings.warn("infer_types will be removed in 0.14", UserWarning)
+    else:
+        infer_types = True # remove in 0.14
+
     if isinstance(skiprows, numbers.Integral) and skiprows < 0:
         raise AssertionError('cannot skip rows starting from the end of the '
                              'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  attrs)
+                  parse_dates, attrs)
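Putting the API change together, a hedged usage sketch of the new keyword (the URL, match pattern, and column name below are purely illustrative, not from the commit):

    import pandas as pd

    # header now defaults to 0, date conversion is opt-in via parse_dates
    # (forwarded to TextParser), and passing infer_types only triggers a
    # UserWarning ahead of its removal in 0.14.
    tables = pd.read_html('http://example.com/prices.html', match='Price',
                          parse_dates=['Date'])
    df = tables[0]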

pandas/io/parsers.py

+7 -6
@@ -1468,22 +1468,23 @@ def _convert_data(self, data):
                 col = self.orig_names[col]
             clean_conv[col] = f

-        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
-                                         clean_conv)
+        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
+                                         self.verbose, clean_conv)

     def _infer_columns(self):
+        #import ipdb; ipdb.set_trace()
         names = self.names

         if self.header is not None:
             header = self.header

             # we have a mi columns, so read and extra line
-            if isinstance(header,(list,tuple,np.ndarray)):
+            if isinstance(header, (list, tuple, np.ndarray)):
                 have_mi_columns = True
-                header = list(header) + [header[-1]+1]
+                header = list(header) + [header[-1] + 1]
             else:
                 have_mi_columns = False
-                header = [ header ]
+                header = [header]

             columns = []
             for level, hr in enumerate(header):
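For orientation, a tiny sketch of the list-header arithmetic reformatted above: with a list such as header=[0, 1] the parser schedules one extra row index, so the line after the last header row is also examined during column inference (values illustrative):

    # Mirrors the expression in _infer_columns when a list header is given.
    header = [0, 1]
    have_mi_columns = True
    header = list(header) + [header[-1] + 1]  # -> [0, 1, 2]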
@@ -1498,7 +1499,7 @@ def _infer_columns(self):

                 this_columns = []
                 for i, c in enumerate(line):
-                    if c == '':
+                    if not c:
                         if have_mi_columns:
                             this_columns.append('Unnamed: %d_level_%d' % (i,level))
                         else:
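The predicate change above widens the "unnamed column" check from the literal empty string to any falsy header cell, so a cell that arrives as None is handled the same way; a minimal illustration (labels invented):

    # '' and None are both falsy, so both now fall into the unnamed branch;
    # previously only the empty string did.
    cells = ['Date', '', None]
    [('unnamed' if not c else c) for c in cells]  # -> ['Date', 'unnamed', 'unnamed']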
