Skip to content

ENH: astype() can now take col label -> dtype mapping as arg; GH7271 #13375

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ API changes
- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`)
- ``PeriodIndex`` can now accept ``list`` and ``array`` which contain ``pd.NaT`` (:issue:`13430`)
- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`)
- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`)


.. _whatsnew_0190.api.tolist:
Expand Down
41 changes: 36 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# pylint: disable=W0231,E1101
import collections
import warnings
import operator
import weakref
Expand Down Expand Up @@ -161,7 +162,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):

@property
def _constructor(self):
"""Used when a manipulation result has the same dimesions as the
"""Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
Expand Down Expand Up @@ -3001,18 +3002,48 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs):

Parameters
----------
dtype : numpy.dtype or Python type
dtype : data type, or dict of column name -> data type
Use a numpy.dtype or Python type to cast entire pandas object to
the same type. Alternatively, use {col: dtype, ...}, where col is a
column label and dtype is a numpy.dtype or Python type to cast one
or more of the DataFrame's columns to column-specific types.
raise_on_error : raise on invalid input
kwargs : keyword arguments to pass on to the constructor

Returns
-------
casted : type of caller
"""
if isinstance(dtype, collections.Mapping):
if self.ndim == 1: # i.e. Series
if len(dtype) > 1 or list(dtype.keys())[0] != self.name:
raise KeyError('Only the Series name can be used for '
'the key in Series dtype mappings.')
new_type = list(dtype.values())[0]
return self.astype(new_type, copy, raise_on_error, **kwargs)
elif self.ndim > 2:
raise NotImplementedError(
'astype() only accepts a dtype arg of type dict when '
'invoked on Series and DataFrames. A single dtype must be '
'specified when invoked on a Panel.'
)
for col_name in dtype.keys():
if col_name not in self:
raise KeyError('Only a column name can be used for the '
'key in a dtype mappings argument.')
from pandas import concat
results = []
for col_name, col in self.iteritems():
if col_name in dtype:
results.append(col.astype(dtype[col_name], copy=copy))
else:
results.append(results.append(col.copy() if copy else col))
return concat(results, axis=1, copy=False)

mgr = self._data.astype(dtype=dtype, copy=copy,
raise_on_error=raise_on_error, **kwargs)
return self._constructor(mgr).__finalize__(self)
# else, only a single dtype is given
new_data = self._data.astype(dtype=dtype, copy=copy,
raise_on_error=raise_on_error, **kwargs)
return self._constructor(new_data).__finalize__(self)

def copy(self, deep=True):
"""
Expand Down
65 changes: 64 additions & 1 deletion pandas/tests/frame/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import numpy as np
from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp,
compat, option_context)
compat, concat, option_context)
from pandas.compat import u
from pandas.types.dtypes import DatetimeTZDtype
from pandas.tests.frame.common import TestData
Expand Down Expand Up @@ -396,6 +396,69 @@ def test_astype_str(self):
expected = DataFrame(['1.12345678901'])
assert_frame_equal(result, expected)

def test_astype_dict(self):
# GH7271
a = Series(date_range('2010-01-04', periods=5))
b = Series(range(5))
c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
d = Series(['1.0', '2', '3.14', '4', '5.4'])
df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
original = df.copy(deep=True)

# change type of a subset of columns
result = df.astype({'b': 'str', 'd': 'float32'})
expected = DataFrame({
'a': a,
'b': Series(['0', '1', '2', '3', '4']),
'c': c,
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)

result = df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64})
expected = DataFrame({
'a': a,
'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
assert_frame_equal(result, expected)
assert_frame_equal(df, original)

# change all columns
assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}),
df.astype(str))
assert_frame_equal(df, original)

# error should be raised when using something other than column labels
# in the keys of the dtype dict
self.assertRaises(KeyError, df.astype, {'b': str, 2: str})
self.assertRaises(KeyError, df.astype, {'e': str})
assert_frame_equal(df, original)

# if the dtypes provided are the same as the original dtypes, the
# resulting DataFrame should be the same as the original DataFrame
equiv = df.astype({col: df[col].dtype for col in df.columns})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a test that uses copy=False and only astype a couple of columns (this might not be completely conclusive, but just make sure it works). IOW if it is changing the dtype of say an int and you have 2 int columns, both will end up being copied regardless of the flag, but if you have ONLY a single int column and say a string column, the string column will not end up being copied.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. It seems that new_df = concat(casted_cols + other_cols, axis=1) leads to every column being copied because it uses the DataFrame constructor with dict data. Therefore, the copy param seems meaningless in the case when dtypes are provided as a mapping.

Copy link
Contributor

@jreback jreback Jul 4, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, there is really no way ATM to avoid copying everything :<. Well, there is, but it's tricky: you have to jump through hoops internally, and it's not worth it.

assert_frame_equal(df, equiv)
assert_frame_equal(df, original)

def test_astype_duplicate_col(self):
# astype() on a frame whose column label 'a' appears twice.
a1 = Series([1, 2, 3, 4, 5], name='a')
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
a2 = Series([0, 1, 2, 3, 4], name='a')
df = concat([a1, b, a2], axis=1)

# a single dtype is expected to cast every column, both 'a' columns included
result = df.astype(str)
a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
name='b')
a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
expected = concat([a1_str, b_str, a2_str], axis=1)
assert_frame_equal(result, expected)

# a mapping keyed by the duplicated label is expected to cast both 'a'
# columns while leaving 'b' with its original values and dtype
result = df.astype({'a': 'str'})
expected = concat([a1_str, b, a2_str], axis=1)
assert_frame_equal(result, expected)

def test_timedeltas(self):
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
freq='D')),
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,22 @@ def test_astype_unicode(self):
reload(sys) # noqa
sys.setdefaultencoding(former_encoding)

def test_astype_dict(self):
# GH7271
# Series.astype() with a {name: dtype} mapping.
s = Series(range(0, 10, 2), name='abc')

# a mapping whose only key is the Series name should give the same result
# as a Series built directly with the target values/dtype
result = s.astype({'abc': str})
expected = Series(['0', '2', '4', '6', '8'], name='abc')
assert_series_equal(result, expected)

result = s.astype({'abc': 'float64'})
expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64',
name='abc')
assert_series_equal(result, expected)

# more than one key, or a key that is not the Series name, must raise
# KeyError
self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str})
self.assertRaises(KeyError, s.astype, {0: str})

def test_complexx(self):
# GH4819
# complex access for ndarray compat
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/test_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,6 +1231,18 @@ def test_dtypes(self):
expected = Series(np.dtype('float64'), index=self.panel.items)
assert_series_equal(result, expected)

def test_astype(self):
# GH7271
# Panel.astype(): a single dtype works, a dict dtype is rejected.
data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f'])

# casting the whole Panel to str should match a Panel built from the
# equivalent string array with the same axis labels
str_data = np.array([[['1', '2'], ['3', '4']],
[['5', '6'], ['7', '8']]])
expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f'])
assert_panel_equal(panel.astype(str), expected)

# a dict dtype must raise NotImplementedError when called on a Panel
self.assertRaises(NotImplementedError, panel.astype, {0: str})

def test_apply(self):
# GH1148

Expand Down