Skip to content

Commit d00f109

Browse files
committed
Merge pull request #3841 from jreback/GH3611_2
BUG: GH3611 fix again, float na_values were not stringified correctly
2 parents 4d06037 + c840591 commit d00f109

File tree

3 files changed

+127
-48
lines changed

3 files changed

+127
-48
lines changed

pandas/io/parsers.py

+64-30
Original file line number | Diff line number | Diff line change
@@ -297,6 +297,7 @@ def parser_f(filepath_or_buffer,
297297
skipfooter=None,
298298
skip_footer=0,
299299
na_values=None,
300+
na_fvalues=None,
300301
true_values=None,
301302
false_values=None,
302303
delimiter=None,
@@ -359,6 +360,7 @@ def parser_f(filepath_or_buffer,
359360
prefix=prefix,
360361
skiprows=skiprows,
361362
na_values=na_values,
363+
na_fvalues=na_fvalues,
362364
true_values=true_values,
363365
false_values=false_values,
364366
keep_default_na=keep_default_na,
@@ -554,7 +556,7 @@ def _clean_options(self, options, engine):
554556
converters = {}
555557

556558
# Converting values to NA
557-
na_values = _clean_na_values(na_values, keep_default_na)
559+
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
558560

559561
if com.is_integer(skiprows):
560562
skiprows = range(skiprows)
@@ -565,6 +567,7 @@ def _clean_options(self, options, engine):
565567
result['names'] = names
566568
result['converters'] = converters
567569
result['na_values'] = na_values
570+
result['na_fvalues'] = na_fvalues
568571
result['skiprows'] = skiprows
569572

570573
return result, engine
@@ -644,6 +647,7 @@ def __init__(self, kwds):
644647
self.keep_date_col = kwds.pop('keep_date_col', False)
645648

646649
self.na_values = kwds.get('na_values')
650+
self.na_fvalues = kwds.get('na_fvalues')
647651
self.true_values = kwds.get('true_values')
648652
self.false_values = kwds.get('false_values')
649653
self.tupleize_cols = kwds.get('tupleize_cols',True)
@@ -837,31 +841,34 @@ def _agg_index(self, index, try_parse_dates=True):
837841
arr = self._date_conv(arr)
838842

839843
col_na_values = self.na_values
844+
col_na_fvalues = self.na_fvalues
840845

841846
if isinstance(self.na_values, dict):
842847
col_name = self.index_names[i]
843848
if col_name is not None:
844-
col_na_values = _get_na_values(col_name,
845-
self.na_values)
846-
847-
arr, _ = self._convert_types(arr, col_na_values)
849+
col_na_values, col_na_fvalues = _get_na_values(col_name,
850+
self.na_values,
851+
self.na_fvalues)
852+
853+
arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
848854
arrays.append(arr)
849855

850856
index = MultiIndex.from_arrays(arrays, names=self.index_names)
851857

852858
return index
853859

854-
def _convert_to_ndarrays(self, dct, na_values, verbose=False,
860+
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
855861
converters=None):
856862
result = {}
857863
for c, values in dct.iteritems():
858864
conv_f = None if converters is None else converters.get(c, None)
859-
col_na_values = _get_na_values(c, na_values)
865+
col_na_values, col_na_fvalues = _get_na_values(c, na_values, na_fvalues)
860866
coerce_type = True
861867
if conv_f is not None:
862868
values = lib.map_infer(values, conv_f)
863869
coerce_type = False
864-
cvals, na_count = self._convert_types(values, col_na_values,
870+
cvals, na_count = self._convert_types(values,
871+
set(col_na_values) | col_na_fvalues,
865872
coerce_type)
866873
result[c] = cvals
867874
if verbose and na_count:
@@ -1370,7 +1377,7 @@ def _convert_data(self, data):
13701377
col = self.orig_names[col]
13711378
clean_conv[col] = f
13721379

1373-
return self._convert_to_ndarrays(data, self.na_values, self.verbose,
1380+
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
13741381
clean_conv)
13751382

13761383
def _infer_columns(self):
@@ -1754,37 +1761,26 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
17541761

17551762

17561763
def _clean_na_values(na_values, keep_default_na=True):
    """Normalise the ``na_values`` option into string and float sentinels.

    Parameters
    ----------
    na_values : None, scalar, list-like or dict
        User-supplied missing-value markers.  A dict maps a column key to
        an iterable of markers for that column.
    keep_default_na : bool, default True
        When true, the module-level ``_NA_VALUES`` are unioned into the
        result (per column for a dict).

    Returns
    -------
    tuple
        ``(na_values, na_fvalues)``.  For dict input both elements are
        dicts keyed like the input; otherwise ``na_fvalues`` is a set of
        floats built by ``_floatify_na_values``.
    """
    if na_values is None and keep_default_na:
        na_values = _NA_VALUES
        na_fvalues = set()
    elif isinstance(na_values, dict):
        # Build a fresh mapping rather than rewriting the caller's dict in
        # place, so the object passed in is left untouched.
        cleaned = {}
        for k, v in na_values.items():
            if keep_default_na:
                v = set(v) | _NA_VALUES
            cleaned[k] = v
        na_values = cleaned
        # NOTE(review): unlike the list-like branch below, per-column values
        # are not passed through _stringify_na_values here -- confirm that
        # this is intended.
        na_fvalues = dict((k, _floatify_na_values(v))
                          for k, v in na_values.items())
    else:
        # Also reached for ``na_values is None`` with keep_default_na false;
        # None is then wrapped like any other scalar.
        if not com.is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | _NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
17881784

17891785
def _clean_index_names(columns, index_col):
17901786
if not _is_index_col(index_col):
@@ -1832,14 +1828,52 @@ def _get_empty_meta(columns, index_col, index_names):
18321828
return index, columns, {}
18331829

18341830

1835-
def _get_na_values(col, na_values):
1831+
def _floatify_na_values(na_values):
1832+
# create float versions of the na_values
1833+
result = set()
1834+
for v in na_values:
1835+
try:
1836+
v = float(v)
1837+
if not np.isnan(v):
1838+
result.add(v)
1839+
except:
1840+
pass
1841+
return result
1842+
1843+
def _stringify_na_values(na_values):
1844+
""" return a stringified and numeric for these values """
1845+
result = []
1846+
for x in na_values:
1847+
result.append(str(x))
1848+
result.append(x)
1849+
try:
1850+
v = float(x)
1851+
1852+
# we are like 999 here
1853+
if v == int(v):
1854+
v = int(v)
1855+
result.append("%s.0" % v)
1856+
result.append(str(v))
1857+
1858+
result.append(v)
1859+
except:
1860+
pass
1861+
try:
1862+
result.append(int(x))
1863+
except:
1864+
pass
1865+
return set(result)
1866+
1867+
def _get_na_values(col, na_values, na_fvalues):
18361868
if isinstance(na_values, dict):
18371869
if col in na_values:
1838-
return set(_stringify_na_values(list(na_values[col])))
1870+
values = na_values[col]
1871+
fvalues = na_fvalues[col]
1872+
return na_values[col], na_fvalues[col]
18391873
else:
1840-
return _NA_VALUES
1874+
return _NA_VALUES, set()
18411875
else:
1842-
return na_values
1876+
return na_values, na_fvalues
18431877

18441878

18451879
def _get_col_names(colspec, columns):

pandas/io/tests/test_parsers.py

+30
Original file line number | Diff line number | Diff line change
@@ -540,6 +540,36 @@ def test_non_string_na_values(self):
540540
tm.assert_frame_equal(result1,result2)
541541
tm.assert_frame_equal(result2,result3)
542542

543+
result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0'])
544+
result5 = read_csv(path, sep= ' ', header=0, na_values=['-999'])
545+
result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0])
546+
result7 = read_csv(path, sep= ' ', header=0, na_values=[-999])
547+
tm.assert_frame_equal(result4,result3)
548+
tm.assert_frame_equal(result5,result3)
549+
tm.assert_frame_equal(result6,result3)
550+
tm.assert_frame_equal(result7,result3)
551+
552+
good_compare = result3
553+
554+
# with an odd float format, so we can't match the string 999.0 exactly,
555+
# but need float matching
556+
df.to_csv(path, sep=' ', index=False, float_format = '%.3f')
557+
result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999'])
558+
result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0])
559+
result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999])
560+
tm.assert_frame_equal(result1,good_compare)
561+
tm.assert_frame_equal(result2,good_compare)
562+
tm.assert_frame_equal(result3,good_compare)
563+
564+
result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0'])
565+
result5 = read_csv(path, sep= ' ', header=0, na_values=['-999'])
566+
result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0])
567+
result7 = read_csv(path, sep= ' ', header=0, na_values=[-999])
568+
tm.assert_frame_equal(result4,good_compare)
569+
tm.assert_frame_equal(result5,good_compare)
570+
tm.assert_frame_equal(result6,good_compare)
571+
tm.assert_frame_equal(result7,good_compare)
572+
543573
def test_custom_na_values(self):
544574
data = """A,B,C
545575
ignore,this,row

0 commit comments

Comments
 (0)