BUG, DOC: Fix inconsistencies with scalar na_values in read_csv (pandas-dev#14056)

gfyoung · jorisvandenbossche · commit 447df80ac69e · 2016-08-21T21:34:41.000+02:00
Update documentation to state that scalars are accepted for na_values. In addition, accept scalars for the values when a dictionary is passed in for na_values. Closes gh-12224.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -208,7 +208,7 @@ memory_map : boolean, default False
 NA and Missing Data Handling
 ++++++++++++++++++++++++++++
 
-na_values : str, list-like or dict, default ``None``
+na_values : scalar, str, list-like, or dict, default ``None``
   Additional strings to recognize as NA/NaN. If dict passed, specific per-column
   NA values. By default the following values are interpreted as NaN:
   ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA',
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -957,6 +957,7 @@ Bug Fixes
 - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
 - Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. (:issue:`13994`)
 - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
+- Bug in ``pd.read_csv()``, which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`)
 - Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`)
 - Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`)
 - Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`)
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -94,7 +94,7 @@
       column ranges (e.g. "A:E" or "A,C,E:F")
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
-na_values : str or list-like or dict, default None
+na_values : scalar, str, list-like, or dict, default None
     Additional strings to recognize as NA/NaN. If dict passed, specific
     per-column NA values. By default the following values are interpreted
     as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'.
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -129,7 +129,7 @@
     DEPRECATED: use the `skipfooter` parameter instead, as they are identical
 nrows : int, default None
     Number of rows of file to read. Useful for reading pieces of large files
-na_values : str or list-like or dict, default None
+na_values : scalar, str, list-like, or dict, default None
     Additional strings to recognize as NA/NaN. If dict passed, specific
     per-column NA values.  By default the following values are interpreted as
     NaN: `'""" + "'`, `'".join(sorted(_NA_VALUES)) + """'`.
@@ -1604,8 +1604,8 @@ def TextParser(*args, **kwds):
     has_index_names: boolean, default False
         True if the cols defined in index_col have an index name and are
         not in the header
-    na_values : iterable, default None
-        Custom NA values
+    na_values : scalar, str, list-like, or dict, default None
+        Additional strings to recognize as NA/NaN.
     keep_default_na : bool, default True
     thousands : str, default None
         Thousands separator
@@ -2687,7 +2687,9 @@ def _clean_na_values(na_values, keep_default_na=True):
     elif isinstance(na_values, dict):
         if keep_default_na:
             for k, v in compat.iteritems(na_values):
-                v = set(list(v)) | _NA_VALUES
+                if not is_list_like(v):
+                    v = [v]
+                v = set(v) | _NA_VALUES
                 na_values[k] = v
         na_fvalues = dict([
             (k, _floatify_na_values(v)) for k, v in na_values.items()  # noqa
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
@@ -250,3 +250,19 @@ def test_na_trailing_columns(self):
         result = self.read_csv(StringIO(data))
         self.assertEqual(result['Date'][1], '2012-05-12')
         self.assertTrue(result['UnitPrice'].isnull().all())
+
+    def test_na_values_scalar(self):
+        # see gh-12224
+        names = ['a', 'b']
+        data = '1,2\n2,1'
+
+        expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]],
+                             columns=names)
+        out = self.read_csv(StringIO(data), names=names, na_values=1)
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
+                             columns=names)
+        out = self.read_csv(StringIO(data), names=names,
+                            na_values={'a': 2, 'b': 1})
+        tm.assert_frame_equal(out, expected)