Skip to content

Commit c7c74ad

Browse files
committed
DOC/TST: clean up docs & tests, xref pandas-dev#15594
BUG: default_fill_value for get_dummies will be 0
1 parent 7d34d4d commit c7c74ad

File tree

6 files changed

+74
-27
lines changed

6 files changed

+74
-27
lines changed

doc/source/whatsnew/v0.20.0.txt

+34-3
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,39 @@ New Behavior:
516516
In [5]: df['a']['2011-12-31 23:59:59']
517517
Out[5]: 1
518518

519+
.. _whatsnew_0200.api_breaking.concat_dtypes:
520+
521+
Concat of different float dtypes will not automatically upcast
522+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
523+
524+
Previously, ``concat`` of multiple objects with different ``float`` dtypes would automatically upcast results to a dtype of ``float64``.
525+
Now the smallest acceptable dtype will be used (:issue:`13247`).
526+
527+
.. ipython:: python
528+
529+
df1 = pd.DataFrame(np.array([1.0], dtype=np.float32, ndmin=2))
530+
df1.dtypes
531+
532+
.. ipython:: python
533+
534+
df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2))
535+
df2.dtypes
536+
537+
Previous Behavior:
538+
539+
.. code-block:: ipython
540+
541+
In [7]: pd.concat([df1,df2]).dtypes
542+
Out[7]:
543+
0 float64
544+
dtype: object
545+
546+
New Behavior:
547+
548+
.. ipython:: python
549+
550+
pd.concat([df1,df2]).dtypes
551+
519552
.. _whatsnew_0200.api_breaking.gbq:
520553

521554
Pandas Google BigQuery support has moved
@@ -693,6 +726,7 @@ Other API Changes
693726
- Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`)
694727
- ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
695728
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
729+
- ``pd.get_dummies(..., sparse=True)`` now returns a ``SparseDataFrame`` whose ``default_fill_value`` is 0; previously it was ``nan`` (:issue:`15594`)
696730

697731
.. _whatsnew_0200.deprecations:
698732

@@ -784,7 +818,6 @@ Bug Fixes
784818
- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
785819

786820

787-
788821
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
789822

790823
- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
@@ -886,5 +919,3 @@ Bug Fixes
886919
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
887920
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
888921
- Bug in ``pd.read_msgpack`` which did not allow loading a dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
889-
890-
- Concating multiple objects will no longer result in automatically upcast to `float64`, and instead try to find the smallest `dtype` that would suffice (:issue:`13247`)

pandas/core/internals.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -4936,9 +4936,9 @@ def get_empty_dtype_and_na(join_units):
49364936
return np.float64, np.nan
49374937
else:
49384938
return g, None
4939-
else:
4940-
msg = "invalid dtype determination in get_concat_dtype"
4941-
raise AssertionError(msg)
4939+
4940+
msg = "invalid dtype determination in get_concat_dtype"
4941+
raise AssertionError(msg)
49424942

49434943

49444944
def concatenate_join_units(join_units, concat_axis, copy):

pandas/core/reshape.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1308,7 +1308,7 @@ def get_empty_Frame(data, sparse):
13081308
if not sparse:
13091309
return DataFrame(index=index)
13101310
else:
1311-
return SparseDataFrame(index=index)
1311+
return SparseDataFrame(index=index, default_fill_value=0)
13121312

13131313
# if all NaN
13141314
if not dummy_na and len(levels) == 0:
@@ -1357,6 +1357,7 @@ def get_empty_Frame(data, sparse):
13571357
sparse_series[col] = SparseSeries(data=sarr, index=index)
13581358

13591359
out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
1360+
default_fill_value=0,
13601361
dtype=np.uint8)
13611362
return out
13621363

pandas/tests/indexing/test_partial.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def f():
205205

206206
self.assertRaises(ValueError, f)
207207

208-
# these are coerced to float unavoidably (as its a list-like to begin)
208+
# TODO: #15657, these are left as object and not coerced
209209
df = DataFrame(columns=['A', 'B'])
210210
df.loc[3] = [6, 7]
211211

pandas/tests/test_reshape.py

+23-19
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# pylint: disable-msg=W0612,E1101
33

44
from pandas import DataFrame, Series
5-
from pandas.core.sparse import SparseDataFrame
65
import pandas as pd
76

87
from numpy import nan
@@ -234,26 +233,31 @@ def test_basic_types(self):
234233
'b': ['A', 'A', 'B', 'C', 'C'],
235234
'c': [2, 3, 3, 3, 2]})
236235

236+
expected = DataFrame({'a': [1, 0, 0],
237+
'b': [0, 1, 0],
238+
'c': [0, 0, 1]},
239+
dtype='uint8',
240+
columns=list('abc'))
237241
if not self.sparse:
238-
exp_df_type = DataFrame
239-
exp_blk_type = pd.core.internals.IntBlock
242+
compare = tm.assert_frame_equal
240243
else:
241-
exp_df_type = SparseDataFrame
242-
exp_blk_type = pd.core.internals.SparseBlock
243-
244-
self.assertEqual(
245-
type(get_dummies(s_list, sparse=self.sparse)), exp_df_type)
246-
self.assertEqual(
247-
type(get_dummies(s_series, sparse=self.sparse)), exp_df_type)
248-
249-
r = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
250-
self.assertEqual(type(r), exp_df_type)
251-
252-
r = get_dummies(s_df, sparse=self.sparse, columns=['a'])
253-
exp_blk_type = pd.core.internals.IntBlock
254-
self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type)
255-
self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type)
256-
self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type)
244+
expected = expected.to_sparse(fill_value=0, kind='integer')
245+
compare = tm.assert_sp_frame_equal
246+
247+
result = get_dummies(s_list, sparse=self.sparse)
248+
compare(result, expected)
249+
250+
result = get_dummies(s_series, sparse=self.sparse)
251+
compare(result, expected)
252+
253+
result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
254+
tm.assert_series_equal(result.get_dtype_counts(),
255+
Series({'uint8': 8}))
256+
257+
result = get_dummies(s_df, sparse=self.sparse, columns=['a'])
258+
expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values()
259+
tm.assert_series_equal(result.get_dtype_counts().sort_values(),
260+
expected)
257261

258262
def test_just_na(self):
259263
just_na_list = [np.nan]

pandas/tests/tools/test_concat.py

+11
Original file line numberDiff line numberDiff line change
@@ -1913,3 +1913,14 @@ def test_concat_no_unnecessary_upcast(dt, pdt):
19131913
pdt(np.array([5], dtype=dt, ndmin=dims))]
19141914
x = pd.concat(dfs)
19151915
assert x.values.dtype == dt
1916+
1917+
1918+
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
1919+
@pytest.mark.parametrize('dt', np.sctypes['int'])
1920+
def test_concat_will_upcast(dt, pdt):
1921+
dims = pdt().ndim
1922+
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
1923+
pdt(np.array([np.nan], ndmin=dims)),
1924+
pdt(np.array([5], dtype=dt, ndmin=dims))]
1925+
x = pd.concat(dfs)
1926+
assert x.values.dtype == 'float64'

0 commit comments

Comments
 (0)