Merge pull request #7941 from ahlmss/set_item_to_none

jreback · jreback · commit 1d5cb4a150ce · 2014-08-19T12:42:50.000-04:00
API: Coerce None according to the dtype of the container
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
@@ -105,6 +105,34 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``.
    df2
    df2.get_dtype_counts()
 
+.. _missing.inserting:
+
+Inserting missing data
+----------------------
+
+You can insert missing values by simply assigning to containers. The
+actual missing value used will be chosen based on the dtype.
+
+For example, numeric containers will always use ``NaN`` regardless of
+the missing value type chosen:
+
+.. ipython:: python
+
+   s = Series([1, 2, 3])
+   s.loc[0] = None
+   s
+
+Likewise, datetime containers will always use ``NaT``.
+
+For object containers, pandas will use the value given:
+
+.. ipython:: python
+
+   s = Series(["a", "b", "c"])
+   s.loc[0] = None
+   s.loc[1] = np.nan
+   s
+
 
 Calculations with missing data
 ------------------------------
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -270,6 +270,31 @@ API changes
      idx.duplicated()
      idx.drop_duplicates()
 
+- Assigning ``None`` to values in a container now considers the dtype when choosing an 'empty' value (:issue:`7941`).
+
+  Previously, assigning ``None`` in numeric containers changed the
+  dtype to object (or raised an error, depending on the call). It now
+  inserts ``NaN``:
+
+  .. ipython:: python
+
+     s = Series([1, 2, 3])
+     s.loc[0] = None
+     s
+
+  ``NaT`` is now used similarly for datetime containers.
+
+  For object containers, we now preserve None values (previously these
+  were converted to NaN values).
+
+  .. ipython:: python
+
+     s = Series(["a", "b", "c"])
+     s.loc[0] = None
+     s
+
+  To insert a ``NaN`` into an object container, you must explicitly assign ``np.nan``. See the :ref:`docs <missing.inserting>`.
+
 .. _whatsnew_0150.dt:
 
 .dt accessor
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -374,7 +374,7 @@ def _is_null_datelike_scalar(other):
         return isnull(other)
     return False
 
-def array_equivalent(left, right):
+def array_equivalent(left, right, strict_nan=False):
     """
     True if two arrays, left and right, have equal non-NaN elements, and NaNs in
     corresponding locations.  False otherwise. It is assumed that left and right
@@ -385,6 +385,8 @@ def array_equivalent(left, right):
     Parameters
     ----------
     left, right : ndarrays
+    strict_nan : bool, default False
+        If True, consider NaN and None to be different.
 
     Returns
     -------
@@ -400,11 +402,32 @@ def array_equivalent(left, right):
     """
     left, right = np.asarray(left), np.asarray(right)
     if left.shape != right.shape: return False
-    # NaNs occur only in object arrays, float or complex arrays.
+
+    # Object arrays can contain None, NaN and NaT.
     if issubclass(left.dtype.type, np.object_):
-        return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
+
+        if not strict_nan:
+            # pd.isnull considers NaN and None to be equivalent.
+            return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
+
+        for left_value, right_value in zip(left, right):
+            if left_value is tslib.NaT and right_value is not tslib.NaT:
+                return False
+
+            elif isinstance(left_value, float) and np.isnan(left_value):
+                if not isinstance(right_value, float) or not np.isnan(right_value):
+                    return False
+            else:
+                if left_value != right_value:
+                    return False
+
+        return True
+
+    # NaNs can occur in float and complex arrays.
     if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
         return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+
+    # NaNs cannot occur otherwise.
     return np.array_equal(left, right)
 
 def _iterable_not_string(x):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -494,6 +494,11 @@ def setitem(self, indexer, value):
         compatible shape
         """
 
+        # coerce None values, if appropriate
+        if value is None:
+            if self.is_numeric:
+                value = np.nan
+
         # coerce args
         values, value = self._try_coerce_args(self.values, value)
         arr_value = np.array(value)
@@ -587,7 +592,7 @@ def putmask(self, mask, new, align=True, inplace=False):
             mask = mask.values.T
 
         # if we are passed a scalar None, convert it here
-        if not is_list_like(new) and isnull(new):
+        if not is_list_like(new) and isnull(new) and not self.is_object:
             new = self.fill_value
 
         if self._can_hold_element(new):
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -2,8 +2,10 @@
 import nose
 import itertools
 import warnings
+from datetime import datetime
 
 from pandas.compat import range, lrange, lzip, StringIO, lmap, map
+from pandas.tslib import NaT
 from numpy import nan
 from numpy.random import randn
 import numpy as np
@@ -14,7 +16,8 @@
 from pandas.core.api import (DataFrame, Index, Series, Panel, isnull,
                              MultiIndex, Float64Index, Timestamp)
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
-                                 assert_frame_equal, assert_panel_equal)
+                                 assert_frame_equal, assert_panel_equal,
+                                 assert_attr_equal)
 from pandas import concat
 
 import pandas.util.testing as tm
@@ -3816,6 +3819,139 @@ def test_float_index_non_scalar_assignment(self):
         tm.assert_frame_equal(df,df2)
 
 
+class TestSeriesNoneCoercion(tm.TestCase):
+    EXPECTED_RESULTS = [
+        # For numeric series, we should coerce to NaN.
+        ([1, 2, 3], [np.nan, 2, 3]),
+        ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+        
+        # For datetime series, we should coerce to NaT.
+        ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+         [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+        
+        # For objects, we should preserve the None value.
+        (["foo", "bar", "baz"], [None, "bar", "baz"]),
+    ]
+
+    def test_coercion_with_setitem(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series[0] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+    
+    def test_coercion_with_loc_setitem(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series.loc[0] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+    
+    def test_coercion_with_setitem_and_series(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series[start_series == start_series[0]] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+    
+    def test_coercion_with_loc_and_series(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series.loc[start_series == start_series[0]] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+    
+
+class TestDataframeNoneCoercion(tm.TestCase):
+    EXPECTED_SINGLE_ROW_RESULTS = [
+        # For numeric series, we should coerce to NaN.
+        ([1, 2, 3], [np.nan, 2, 3]),
+        ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+        
+        # For datetime series, we should coerce to NaT.
+        ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+         [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+        
+        # For objects, we should preserve the None value.
+        (["foo", "bar", "baz"], [None, "bar", "baz"]),
+    ]
+
+    def test_coercion_with_loc(self):
+        for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe.loc[0, ['foo']] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_coercion_with_setitem_and_dataframe(self):
+        for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_none_coercion_loc_and_dataframe(self):
+        for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe.loc[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_none_coercion_mixed_dtypes(self):
+        start_dataframe = DataFrame({
+            'a': [1, 2, 3],
+            'b': [1.0, 2.0, 3.0],
+            'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+            'd': ['a', 'b', 'c']})
+        start_dataframe.iloc[0] = None
+
+        expected_dataframe = DataFrame({
+            'a': [np.nan, 2, 3],
+            'b': [np.nan, 2.0, 3.0],
+            'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
+            'd': [None, 'b', 'c']})
+
+        for column in expected_dataframe.columns:
+            assert_attr_equal('dtype', start_dataframe[column], expected_dataframe[column])
+            self.assert_numpy_array_equivalent(
+                start_dataframe[column].values,
+                expected_dataframe[column].values, strict_nan=True)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -105,7 +105,7 @@ def round_trip_pickle(self, obj, path=None):
             pd.to_pickle(obj, path)
             return pd.read_pickle(path)
 
-    def assert_numpy_array_equivalent(self, np_array, assert_equal):
+    def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False):
         """Checks that 'np_array' is equivalent to 'assert_equal'
 
         Two numpy arrays are equivalent if the arrays have equal non-NaN elements, and
@@ -115,7 +115,7 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal):
         similar to `assert_numpy_array_equal()`. If the expected array includes `np.nan` use this
         function.
         """
-        if array_equivalent(np_array, assert_equal):
+        if array_equivalent(np_array, assert_equal, strict_nan=strict_nan):
             return
         raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))