Skip to content

Commit ab923e7

Browse files
committed
starting unit tests for parsing functions, way overdue
1 parent 4001816 commit ab923e7

File tree

3 files changed

+55
-22
lines changed

3 files changed

+55
-22
lines changed

pandas/core/generic.py

+8
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ def load(cls, fileName):
2424
class PandasError(Exception):
2525
pass
2626

27+
class NDFrame(object):
28+
"""
29+
N-dimensional labeled array data structure with potentially heterogeneous
30+
dtypes along one axis
31+
"""
32+
def __init__(self, data):
33+
pass
34+
2735
class PandasGeneric(Picklable):
2836

2937
_AXIS_NUMBERS = {

pandas/io/parsers.py

+42-21
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
from pandas.core.index import Index
1313
from pandas.core.frame import DataFrame
1414

15-
def read_csv(filepath, header=0, skiprows=None, index_col=0,
15+
def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0,
1616
na_values=None, date_parser=None):
1717
"""
1818
Read CSV file into DataFrame
1919
2020
Parameters
2121
----------
22-
filepath : string or file handle
22+
filepath_or_buffer : string or file handle / StringIO
2323
2424
header : int, default 0
2525
Row to use for the column labels of the parsed DataFrame
@@ -29,13 +29,21 @@ def read_csv(filepath, header=0, skiprows=None, index_col=0,
2929
Column to use as the row labels of the DataFrame. Pass None if there is
3030
no such column
3131
na_values : list-like, default None
32-
List of strings to recognize as NA/NaN
32+
List of additional strings to recognize as NA/NaN
33+
date_parser : function
34+
Function to use for converting strings to dates. Defaults to
35+
dateutil.parser
3336
"""
3437
import csv
35-
try:
36-
f = open(filepath, 'U')
37-
except Exception:
38-
f = open(filepath, 'r')
38+
39+
if hasattr(filepath_or_buffer, 'read'):
40+
f = filepath_or_buffer
41+
else:
42+
try:
43+
# universal newline mode
44+
f = open(filepath_or_buffer, 'U')
45+
except Exception:
46+
f = open(filepath_or_buffer, 'r')
3947

4048
reader = csv.reader(f, dialect='excel')
4149

@@ -48,14 +56,14 @@ def read_csv(filepath, header=0, skiprows=None, index_col=0,
4856
return _simple_parser(lines, header=header, indexCol=index_col,
4957
na_values=na_values, date_parser=date_parser)
5058

51-
def read_table(filepath, sep='\t', header=0, skiprows=None, index_col=0,
59+
def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, index_col=0,
5260
na_values=None, names=None, date_parser=None):
5361
"""
5462
Read delimited file into DataFrame
5563
5664
Parameters
5765
----------
58-
filepath : string or file handle
66+
filepath_or_buffer : string or file handle
5967
sep : string, default '\t'
6068
Delimiter to use
6169
header : int, default 0
@@ -66,9 +74,12 @@ def read_table(filepath, sep='\t', header=0, skiprows=None, index_col=0,
6674
Column to use as the row labels of the DataFrame. Pass None if there is
6775
no such column
6876
na_values : list-like, default None
69-
List of strings to recognize as NA/NaN
77+
List of additional strings to recognize as NA/NaN
78+
date_parser : function
79+
Function to use for converting strings to dates. Defaults to
80+
dateutil.parser
7081
"""
71-
reader = open(filepath,'rb')
82+
reader = open(filepath_or_buffer,'rb')
7283

7384
if skiprows is not None:
7485
skiprows = set(skiprows)
@@ -88,7 +99,6 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
8899
89100
Should be replaced by np.genfromtxt eventually?
90101
"""
91-
data = {}
92102
if header is not None:
93103
columns = []
94104
for i, c in enumerate(lines[header]):
@@ -106,22 +116,33 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
106116
colCounts[col] += 1
107117
else:
108118
if not colNames:
109-
columns = list(string.ascii_uppercase[:len(lines[0])])
110-
# columns = ['X.%d' % (i + 1) for i in range(len(lines[0]))]
119+
# columns = list(string.ascii_uppercase[:len(lines[0])])
120+
columns = ['X.%d' % (i + 1) for i in range(len(lines[0]))]
111121
else:
112122
columns = colNames
113123
content = lines
114124

115-
data = dict(izip(columns, izip(*content)))
125+
zipped_content = zip(*content)
126+
127+
if len(content) == 0:
128+
raise Exception('No content to parse')
129+
130+
# no index column specified, so infer that's what is wanted
116131
if indexCol is not None:
117-
index_name = columns[indexCol]
118-
# try to parse dates
119-
index = data.pop(index_name)
132+
if indexCol == 0 and len(content[0]) == len(columns) + 1:
133+
index = zipped_content[0]
134+
zipped_content = zipped_content[1:]
135+
else:
136+
index = zipped_content.pop(indexCol)
137+
columns.pop(indexCol)
138+
120139
if parse_dates:
121140
index = _try_parse_dates(index, parser=date_parser)
141+
122142
else:
123-
index = np.arange(len(data.values()[0]))
143+
index = np.arange(len(content))
124144

145+
data = dict(izip(columns, zipped_content))
125146
data = _floatify(data, na_values=na_values)
126147
data = _convert_to_ndarrays(data)
127148
return DataFrame(data=data, columns=columns, index=Index(index))
@@ -134,7 +155,7 @@ def _floatify(data_dict, na_values=None):
134155
if na_values is None:
135156
na_values = NA_VALUES
136157
else:
137-
na_values = set(list(na_values))
158+
na_values = set(list(na_values)) | NA_VALUES
138159

139160
def _convert_float(val):
140161
if val in na_values:
@@ -234,7 +255,7 @@ def parse(self, sheetname, header=None, skiprows=None, index_col=0,
234255
Column to use as the row labels of the DataFrame. Pass None if there
235256
is no such column
236257
na_values : list-like, default None
237-
List of strings to recognize as NA/NaN
258+
List of additional strings to recognize as NA/NaN
238259
"""
239260
from datetime import MINYEAR, time, datetime
240261
from xlrd import xldate_as_tuple, XL_CELL_DATE

pandas/io/tests/test_pytables.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -297,14 +297,18 @@ def _check_roundtrip_table(self, obj, comparator):
297297
os.remove(self.scratchpath)
298298

299299
def test_legacy_read(self):
300-
pth, _ = os.path.split(os.path.abspath(__file__))
300+
pth = curpath()
301301
store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
302302
store['a']
303303
store['b']
304304
store['c']
305305
store['d']
306306
store.close()
307307

308+
def curpath():
309+
pth, _ = os.path.split(os.path.abspath(__file__))
310+
return pth
311+
308312
def _test_sort(obj):
309313
if isinstance(obj, DataFrame):
310314
return obj.reindex(sorted(obj.index))

0 commit comments

Comments
 (0)