Skip to content

Commit bfc81ea

Browse files
committed
BUG: use map_infer instead of np.vectorize; handle NA sentinels if the converter yields a numeric array (GH #753)
1 parent 9b07b42 commit bfc81ea

File tree

2 files changed: 63 additions and 2 deletions

pandas/io/parsers.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -475,7 +475,7 @@ def get_chunk(self, rows=None):
475475
for col, f in self.converters.iteritems():
476476
if isinstance(col, int) and col not in self.columns:
477477
col = self.columns[col]
478-
result = np.vectorize(f)(data[col])
478+
result = lib.map_infer(data[col], f)
479479
if issubclass(result.dtype.type, (basestring, unicode)):
480480
result = result.astype('O')
481481
data[col] = result
@@ -533,6 +533,12 @@ def _convert_to_ndarrays(dct, na_values, verbose=False):
533533
def _convert_types(values, na_values):
534534
na_count = 0
535535
if issubclass(values.dtype.type, (np.number, np.bool_)):
536+
mask = lib.ismember(values, na_values)
537+
na_count = mask.sum()
538+
if na_count > 0:
539+
if com.is_integer_dtype(values):
540+
values = values.astype(np.float64)
541+
np.putmask(values, mask, np.nan)
536542
return values, na_count
537543

538544
try:

pandas/io/tests/test_parsers.py

Lines changed: 56 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from numpy import nan
1212
import numpy as np
1313

14-
from pandas import DataFrame, Index
14+
from pandas import DataFrame, Index, isnull
1515
from pandas.io.parsers import read_csv, read_table, ExcelFile, TextParser
1616
from pandas.util.testing import assert_almost_equal, assert_frame_equal
1717
import pandas._tseries as lib
@@ -584,6 +584,61 @@ def test_read_csv_parse_simple_list(self):
584584
'foo', 'bar']})
585585
assert_frame_equal(df, expected)
586586

587+
def test_converters_corner_with_nas(self):
588+
import StringIO
589+
import numpy as np
590+
import pandas
591+
csv = """id,score,days
592+
1,2,12
593+
2,2-5,
594+
3,,14+
595+
4,6-12,2"""
596+
597+
def convert_days(x):
598+
x = x.strip()
599+
if not x: return np.nan
600+
601+
is_plus = x.endswith('+')
602+
if is_plus:
603+
x = int(x[:-1]) + 1
604+
else:
605+
x = int(x)
606+
return x
607+
608+
def convert_days_sentinel(x):
609+
x = x.strip()
610+
if not x: return -1
611+
612+
is_plus = x.endswith('+')
613+
if is_plus:
614+
x = int(x[:-1]) + 1
615+
else:
616+
x = int(x)
617+
return x
618+
619+
def convert_score(x):
620+
x = x.strip()
621+
if not x: return np.nan
622+
if x.find('-')>0:
623+
valmin, valmax = map(int, x.split('-'))
624+
val = 0.5*(valmin + valmax)
625+
else:
626+
val = float(x)
627+
628+
return val
629+
630+
fh = StringIO.StringIO(csv)
631+
result = pandas.read_csv(fh, converters={'score':convert_score,
632+
'days':convert_days},
633+
na_values=[-1,'',None])
634+
self.assert_(isnull(result['days'][1]))
635+
636+
fh = StringIO.StringIO(csv)
637+
result2 = pandas.read_csv(fh, converters={'score':convert_score,
638+
'days':convert_days_sentinel},
639+
na_values=[-1,'',None])
640+
assert_frame_equal(result, result2)
641+
587642
class TestParseSQL(unittest.TestCase):
588643

589644
def test_convert_sql_column_floats(self):

0 commit comments

Comments
 (0)