DOC/TST: clean up docs & tests, xref pandas-dev#15594

jreback · jreback · commit c7c74ad7b2fc · 2017-03-14T09:33:01.000-04:00
BUG: default_fill_value for get_dummies will be 0
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -516,6 +516,39 @@ New Behavior:
   In [5]: df['a']['2011-12-31 23:59:59']
   Out[5]: 1
 
+.. _whatsnew_0200.api_breaking.concat_dtypes:
+
+Concat of different float dtypes will not automatically upcast
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, ``concat`` of multiple objects with different ``float`` dtypes would automatically upcast results to a dtype of ``float64``.
+Now the smallest acceptable dtype will be used (:issue:`13247`).
+
+.. ipython:: python
+
+   df1 = pd.DataFrame(np.array([1.0], dtype=np.float32, ndmin=2))
+   df1.dtypes
+
+.. ipython:: python
+
+   df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2))
+   df2.dtypes
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [7]: pd.concat([df1,df2]).dtypes
+   Out[7]:
+   0    float64
+   dtype: object
+
+New Behavior:
+
+.. ipython:: python
+
+   pd.concat([df1,df2]).dtypes
+
 .. _whatsnew_0200.api_breaking.gbq:
 
 Pandas Google BigQuery support has moved
@@ -693,6 +726,7 @@ Other API Changes
 - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`)
 - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
 - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
+- ``pd.get_dummies(..., sparse=True)`` now returns a ``SparseDataFrame`` with a ``default_fill_value`` of ``0``; previously it was ``nan`` (:issue:`15594`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -784,7 +818,6 @@ Bug Fixes
 - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
 
 
-
 - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
 
 - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
@@ -886,5 +919,3 @@ Bug Fixes
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
-
-- Concating multiple objects will no longer result in automatically upcast to `float64`, and instead try to find the smallest `dtype` that would suffice (:issue:`13247`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -4936,9 +4936,9 @@ def get_empty_dtype_and_na(join_units):
                 return np.float64, np.nan
             else:
                 return g, None
-        else:
-            msg = "invalid dtype determination in get_concat_dtype"
-            raise AssertionError(msg)
+
+    msg = "invalid dtype determination in get_concat_dtype"
+    raise AssertionError(msg)
 
 
 def concatenate_join_units(join_units, concat_axis, copy):
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -1308,7 +1308,7 @@ def get_empty_Frame(data, sparse):
         if not sparse:
             return DataFrame(index=index)
         else:
-            return SparseDataFrame(index=index)
+            return SparseDataFrame(index=index, default_fill_value=0)
 
     # if all NaN
     if not dummy_na and len(levels) == 0:
@@ -1357,6 +1357,7 @@ def get_empty_Frame(data, sparse):
             sparse_series[col] = SparseSeries(data=sarr, index=index)
 
         out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
+                              default_fill_value=0,
                               dtype=np.uint8)
         return out
 
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
@@ -205,7 +205,7 @@ def f():
 
         self.assertRaises(ValueError, f)
 
-        # these are coerced to float unavoidably (as its a list-like to begin)
+        # TODO: #15657, these are left as object and not coerced
         df = DataFrame(columns=['A', 'B'])
         df.loc[3] = [6, 7]
 
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
@@ -2,7 +2,6 @@
 # pylint: disable-msg=W0612,E1101
 
 from pandas import DataFrame, Series
-from pandas.core.sparse import SparseDataFrame
 import pandas as pd
 
 from numpy import nan
@@ -234,26 +233,31 @@ def test_basic_types(self):
                           'b': ['A', 'A', 'B', 'C', 'C'],
                           'c': [2, 3, 3, 3, 2]})
 
+        expected = DataFrame({'a': [1, 0, 0],
+                              'b': [0, 1, 0],
+                              'c': [0, 0, 1]},
+                             dtype='uint8',
+                             columns=list('abc'))
         if not self.sparse:
-            exp_df_type = DataFrame
-            exp_blk_type = pd.core.internals.IntBlock
+            compare = tm.assert_frame_equal
         else:
-            exp_df_type = SparseDataFrame
-            exp_blk_type = pd.core.internals.SparseBlock
-
-        self.assertEqual(
-            type(get_dummies(s_list, sparse=self.sparse)), exp_df_type)
-        self.assertEqual(
-            type(get_dummies(s_series, sparse=self.sparse)), exp_df_type)
-
-        r = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
-        self.assertEqual(type(r), exp_df_type)
-
-        r = get_dummies(s_df, sparse=self.sparse, columns=['a'])
-        exp_blk_type = pd.core.internals.IntBlock
-        self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type)
-        self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type)
-        self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type)
+            expected = expected.to_sparse(fill_value=0, kind='integer')
+            compare = tm.assert_sp_frame_equal
+
+        result = get_dummies(s_list, sparse=self.sparse)
+        compare(result, expected)
+
+        result = get_dummies(s_series, sparse=self.sparse)
+        compare(result, expected)
+
+        result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
+        tm.assert_series_equal(result.get_dtype_counts(),
+                               Series({'uint8': 8}))
+
+        result = get_dummies(s_df, sparse=self.sparse, columns=['a'])
+        expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values()
+        tm.assert_series_equal(result.get_dtype_counts().sort_values(),
+                               expected)
 
     def test_just_na(self):
         just_na_list = [np.nan]
diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py
@@ -1913,3 +1913,14 @@ def test_concat_no_unnecessary_upcast(dt, pdt):
            pdt(np.array([5], dtype=dt, ndmin=dims))]
     x = pd.concat(dfs)
     assert x.values.dtype == dt
+
+
+@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
+@pytest.mark.parametrize('dt', np.sctypes['int'])
+def test_concat_will_upcast(dt, pdt):
+    dims = pdt().ndim
+    dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
+           pdt(np.array([np.nan], ndmin=dims)),
+           pdt(np.array([5], dtype=dt, ndmin=dims))]
+    x = pd.concat(dfs)
+    assert x.values.dtype == 'float64'