BUG: Prevent aliasing of dict na_values

gfyoung · gfyoung · commit 1439c2780b41 · 2016-12-15T11:09:18.000-05:00
diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
@@ -39,6 +39,7 @@ Bug Fixes
 
 - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
 - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
+- Bug in ``pd.read_csv`` in which a dictionary passed in as ``na_values`` was aliased and modified in place instead of being copied (:issue:`14203`)
 - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
 - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
 - Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2767,6 +2767,7 @@ def _clean_na_values(na_values, keep_default_na=True):
             na_values = []
         na_fvalues = set()
     elif isinstance(na_values, dict):
+        na_values = na_values.copy()  # Prevent aliasing.
         if keep_default_na:
             for k, v in compat.iteritems(na_values):
                 if not is_list_like(v):
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
@@ -266,3 +266,16 @@ def test_na_values_scalar(self):
         out = self.read_csv(StringIO(data), names=names,
                             na_values={'a': 2, 'b': 1})
         tm.assert_frame_equal(out, expected)
+
+    def test_na_values_dict_aliasing(self):
+        na_values = {'a': 2, 'b': 1}
+        na_values_copy = na_values.copy()
+
+        names = ['a', 'b']
+        data = '1,2\n2,1'
+
+        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
+        out = self.read_csv(StringIO(data), names=names, na_values=na_values)
+
+        tm.assert_frame_equal(out, expected)
+        tm.assert_dict_equal(na_values, na_values_copy)