12
12
from pandas .core .index import Index
13
13
from pandas .core .frame import DataFrame
14
14
15
- def read_csv (filepath , header = 0 , skiprows = None , index_col = 0 ,
15
+ def read_csv (filepath_or_buffer , header = 0 , skiprows = None , index_col = 0 ,
16
16
na_values = None , date_parser = None ):
17
17
"""
18
18
Read CSV file into DataFrame
19
19
20
20
Parameters
21
21
----------
22
- filepath : string or file handle
22
+ filepath_or_buffer : string or file handle / StringIO
23
23
24
24
header : int, default 0
25
25
Row to use for the column labels of the parsed DataFrame
@@ -29,13 +29,21 @@ def read_csv(filepath, header=0, skiprows=None, index_col=0,
29
29
Column to use as the row labels of the DataFrame. Pass None if there is
30
30
no such column
31
31
na_values : list-like, default None
32
- List of strings to recognize as NA/NaN
32
+ List of additional strings to recognize as NA/NaN
33
+ date_parser : function
34
+ Function to use for converting strings to dates. Defaults to
35
+ dateutil.parser
33
36
"""
34
37
import csv
35
- try :
36
- f = open (filepath , 'U' )
37
- except Exception :
38
- f = open (filepath , 'r' )
38
+
39
+ if hasattr (filepath_or_buffer , 'read' ):
40
+ f = filepath_or_buffer
41
+ else :
42
+ try :
43
+ # universal newline mode
44
+ f = open (filepath_or_buffer , 'U' )
45
+ except Exception :
46
+ f = open (filepath_or_buffer , 'r' )
39
47
40
48
reader = csv .reader (f , dialect = 'excel' )
41
49
@@ -48,14 +56,14 @@ def read_csv(filepath, header=0, skiprows=None, index_col=0,
48
56
return _simple_parser (lines , header = header , indexCol = index_col ,
49
57
na_values = na_values , date_parser = date_parser )
50
58
51
- def read_table (filepath , sep = '\t ' , header = 0 , skiprows = None , index_col = 0 ,
59
+ def read_table (filepath_or_buffer , sep = '\t ' , header = 0 , skiprows = None , index_col = 0 ,
52
60
na_values = None , names = None , date_parser = None ):
53
61
"""
54
62
Read delimited file into DataFrame
55
63
56
64
Parameters
57
65
----------
58
- filepath : string or file handle
66
+ filepath_or_buffer : string or file handle
59
67
sep : string, default '\t '
60
68
Delimiter to use
61
69
header : int, default 0
@@ -66,9 +74,12 @@ def read_table(filepath, sep='\t', header=0, skiprows=None, index_col=0,
66
74
Column to use as the row labels of the DataFrame. Pass None if there is
67
75
no such column
68
76
na_values : list-like, default None
69
- List of strings to recognize as NA/NaN
77
+ List of additional strings to recognize as NA/NaN
78
+ date_parser : function
79
+ Function to use for converting strings to dates. Defaults to
80
+ dateutil.parser
70
81
"""
71
- reader = open (filepath ,'rb' )
82
+ reader = open (filepath_or_buffer ,'rb' )
72
83
73
84
if skiprows is not None :
74
85
skiprows = set (skiprows )
@@ -88,7 +99,6 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
88
99
89
100
Should be replaced by np.genfromtxt eventually?
90
101
"""
91
- data = {}
92
102
if header is not None :
93
103
columns = []
94
104
for i , c in enumerate (lines [header ]):
@@ -106,22 +116,33 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
106
116
colCounts [col ] += 1
107
117
else :
108
118
if not colNames :
109
- columns = list (string .ascii_uppercase [:len (lines [0 ])])
110
- # columns = ['X.%d' % (i + 1) for i in range(len(lines[0]))]
119
+ # columns = list(string.ascii_uppercase[:len(lines[0])])
120
+ columns = ['X.%d' % (i + 1 ) for i in range (len (lines [0 ]))]
111
121
else :
112
122
columns = colNames
113
123
content = lines
114
124
115
- data = dict (izip (columns , izip (* content )))
125
+ zipped_content = zip (* content )
126
+
127
+ if len (content ) == 0 :
128
+ raise Exception ('No content to parse' )
129
+
130
+ # no index column specified, so infer that's what is wanted
116
131
if indexCol is not None :
117
- index_name = columns [indexCol ]
118
- # try to parse dates
119
- index = data .pop (index_name )
132
+ if indexCol == 0 and len (content [0 ]) == len (columns ) + 1 :
133
+ index = zipped_content [0 ]
134
+ zipped_content = zipped_content [1 :]
135
+ else :
136
+ index = zipped_content .pop (indexCol )
137
+ columns .pop (indexCol )
138
+
120
139
if parse_dates :
121
140
index = _try_parse_dates (index , parser = date_parser )
141
+
122
142
else :
123
- index = np .arange (len (data . values ()[ 0 ] ))
143
+ index = np .arange (len (content ))
124
144
145
+ data = dict (izip (columns , zipped_content ))
125
146
data = _floatify (data , na_values = na_values )
126
147
data = _convert_to_ndarrays (data )
127
148
return DataFrame (data = data , columns = columns , index = Index (index ))
@@ -134,7 +155,7 @@ def _floatify(data_dict, na_values=None):
134
155
if na_values is None :
135
156
na_values = NA_VALUES
136
157
else :
137
- na_values = set (list (na_values ))
158
+ na_values = set (list (na_values )) | NA_VALUES
138
159
139
160
def _convert_float (val ):
140
161
if val in na_values :
@@ -234,7 +255,7 @@ def parse(self, sheetname, header=None, skiprows=None, index_col=0,
234
255
Column to use as the row labels of the DataFrame. Pass None if there
235
256
is no such column
236
257
na_values : list-like, default None
237
- List of strings to recognize as NA/NaN
258
+ List of additional strings to recognize as NA/NaN
238
259
"""
239
260
from datetime import MINYEAR , time , datetime
240
261
from xlrd import xldate_as_tuple , XL_CELL_DATE
0 commit comments