Skip to content

Commit d90a829

Browse files
Chang She authored and wesm committed
ENH: option to disable empty string conversion to NaN in file readers
1 parent 8961404 commit d90a829

File tree

3 files changed

+38
-7
lines changed

3 files changed

+38
-7
lines changed

pandas/io/parsers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,9 +1029,9 @@ def _convert_types(values, na_values):
10291029
return values, na_count
10301030

10311031
try:
1032-
result = lib.maybe_convert_numeric(values, na_values)
1032+
result = lib.maybe_convert_numeric(values, na_values, False)
10331033
except Exception:
1034-
na_count = lib.sanitize_objects(values, na_values)
1034+
na_count = lib.sanitize_objects(values, na_values, False)
10351035
result = values
10361036

10371037
if result.dtype == np.object_:

pandas/io/tests/test_parsers.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,32 @@ def setUp(self):
5252
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
5353
self.xls1 = os.path.join(self.dirpath, 'test.xls')
5454

55+
def test_empty_string(self):
56+
data = """\
57+
One,Two,Three
58+
a,1,one
59+
b,2,two
60+
,3,three
61+
d,4,nan
62+
e,5,five
63+
nan,6,
64+
g,7,seven
65+
"""
66+
df = read_csv(StringIO(data))
67+
xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
68+
'Two' : [1,2,3,4,5,6,7],
69+
'Three' : ['one', 'two', 'three', np.nan, 'five',
70+
np.nan, 'seven']})
71+
assert_frame_equal(xp.reindex(columns=df.columns), df)
72+
73+
df = read_csv(StringIO(data), na_values={'One': [], 'Three': []})
74+
xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'],
75+
'Two' : [1,2,3,4,5,6,7],
76+
'Three' : ['one', 'two', 'three', 'nan', 'five',
77+
'', 'seven']})
78+
assert_frame_equal(xp.reindex(columns=df.columns), df)
79+
80+
5581
def test_read_csv(self):
5682
pass
5783

pandas/src/inference.pyx

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,8 @@ def is_date_array(ndarray[object] values):
241241
return True
242242

243243

244-
def maybe_convert_numeric(ndarray[object] values, set na_values):
244+
def maybe_convert_numeric(ndarray[object] values, set na_values,
245+
convert_empty=True):
245246
'''
246247
Type inference function-- convert strings to numeric (potentially) and
247248
convert to proper dtype array
@@ -275,8 +276,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values):
275276
floats[i] = complexes[i] = nan
276277
seen_float = 1
277278
elif len(val) == 0:
278-
floats[i] = complexes[i] = nan
279-
seen_float = 1
279+
if convert_empty:
280+
floats[i] = complexes[i] = nan
281+
seen_float = 1
282+
else:
283+
raise ValueError('Empty string encountered')
280284
elif util.is_complex_object(val):
281285
complexes[i] = val
282286
seen_complex = 1
@@ -573,7 +577,8 @@ def try_parse_datetime_components(ndarray[object] years, ndarray[object] months,
573577

574578
return result
575579

576-
def sanitize_objects(ndarray[object] values, set na_values):
580+
def sanitize_objects(ndarray[object] values, set na_values,
581+
convert_empty=True):
577582
cdef:
578583
Py_ssize_t i, n
579584
object val, onan
@@ -585,7 +590,7 @@ def sanitize_objects(ndarray[object] values, set na_values):
585590

586591
for i from 0 <= i < n:
587592
val = values[i]
588-
if val == '' or val in na_values:
593+
if (convert_empty and val == '') or (val in na_values):
589594
values[i] = onan
590595
na_count += 1
591596
elif val in memo:

0 commit comments

Comments
 (0)