Skip to content

Commit 44fa303

Browse files
author
Kevin Sheppard
committed
BUG: Ensure 'coerce' actually coerces datatypes
Changes the behavior of convert_objects so that passing 'coerce' ensures that data of the correct type is returned, even if all values are null-types (NaN or NaT). Closes #9589.
1 parent efc4a08 commit 44fa303

File tree

3 files changed

+84
-60
lines changed

3 files changed

+84
-60
lines changed

pandas/core/common.py

+41-41
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class SettingWithCopyError(ValueError):
3434
class SettingWithCopyWarning(Warning):
3535
pass
3636

37+
class RedundantSettingWarning(Warning):
38+
pass
3739

3840
class AmbiguousIndexError(PandasError, KeyError):
3941
pass
@@ -1888,60 +1890,58 @@ def _possibly_convert_objects(values, convert_dates=True,
18881890
convert_timedeltas=True):
18891891
""" if we have an object dtype, try to coerce dates and/or numbers """
18901892

1893+
# If 1 flag is coerce, ensure 2 others are False
1894+
conversions = (convert_dates, convert_numeric, convert_timedeltas)
1895+
if 'coerce' in conversions:
1896+
coerce_count = sum([c == 'coerce' for c in conversions])
1897+
if coerce_count > 1:
1898+
raise ValueError("'coerce' can be used at most once.")
1899+
1900+
false_count = sum([not c for c in conversions])
1901+
if false_count != 2:
1902+
import warnings
1903+
warnings.warn("Soft conversion flags ignored when using 'coerce'",
1904+
RedundantSettingWarning)
1905+
18911906
# if we have passed in a list or scalar
18921907
if isinstance(values, (list, tuple)):
18931908
values = np.array(values, dtype=np.object_)
18941909
if not hasattr(values, 'dtype'):
18951910
values = np.array([values], dtype=np.object_)
18961911

1897-
# convert dates
1898-
if convert_dates and values.dtype == np.object_:
1899-
1900-
# we take an aggressive stance and convert to datetime64[ns]
1901-
if convert_dates == 'coerce':
1902-
new_values = _possibly_cast_to_datetime(
1903-
values, 'M8[ns]', coerce=True)
1912+
# If not object, do not convert
1913+
if values.dtype != np.object_:
1914+
return values
19041915

1905-
# if we are all nans then leave me alone
1906-
if not isnull(new_values).all():
1907-
values = new_values
1908-
1909-
else:
1910-
values = lib.maybe_convert_objects(
1911-
values, convert_datetime=convert_dates)
1916+
# Immediate return if coerce
1917+
if convert_dates == 'coerce':
1918+
return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True)
1919+
if convert_timedeltas == 'coerce':
1920+
from pandas.tseries.timedeltas import to_timedelta
1921+
return np.asanyarray(to_timedelta(values, coerce=True))
1922+
if convert_numeric == 'coerce':
1923+
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
19121924

1925+
# convert dates
1926+
if convert_dates:
1927+
values = lib.maybe_convert_objects(values,
1928+
convert_datetime=convert_dates)
19131929
# convert timedeltas
19141930
if convert_timedeltas and values.dtype == np.object_:
1915-
1916-
if convert_timedeltas == 'coerce':
1917-
from pandas.tseries.timedeltas import to_timedelta
1918-
values = to_timedelta(values, coerce=True)
1919-
1931+
# Only if previous failed
1932+
values = lib.maybe_convert_objects(values,
1933+
convert_timedelta=convert_timedeltas)
1934+
# convert to numeric
1935+
if convert_numeric and values.dtype == np.object_:
1936+
# Only if previous failed
1937+
try:
1938+
new_values = lib.maybe_convert_numeric(values, set(),
1939+
coerce_numeric=True)
19201940
# if we are all nans then leave me alone
19211941
if not isnull(new_values).all():
19221942
values = new_values
1923-
1924-
else:
1925-
values = lib.maybe_convert_objects(
1926-
values, convert_timedelta=convert_timedeltas)
1927-
1928-
# convert to numeric
1929-
if values.dtype == np.object_:
1930-
if convert_numeric:
1931-
try:
1932-
new_values = lib.maybe_convert_numeric(
1933-
values, set(), coerce_numeric=True)
1934-
1935-
# if we are all nans then leave me alone
1936-
if not isnull(new_values).all():
1937-
values = new_values
1938-
1939-
except:
1940-
pass
1941-
else:
1942-
1943-
# soft-conversion
1944-
values = lib.maybe_convert_objects(values)
1943+
except:
1944+
pass
19451945

19461946
return values
19471947

pandas/core/internals.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
14841484
else:
14851485

14861486
values = com._possibly_convert_objects(
1487-
self.values.ravel(), convert_dates=convert_dates,
1488-
convert_numeric=convert_numeric
1487+
self.values.ravel(),
1488+
convert_dates=convert_dates,
1489+
convert_numeric=convert_numeric,
1490+
convert_timedeltas=convert_timedeltas
14891491
).reshape(self.values.shape)
14901492
blocks.append(make_block(values,
14911493
ndim=self.ndim, placement=self.mgr_locs))

pandas/tests/test_series.py

+39-17
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@
88
from inspect import getargspec
99
from itertools import product, starmap
1010
from distutils.version import LooseVersion
11+
import warnings
1112

1213
import nose
13-
1414
from numpy import nan, inf
1515
import numpy as np
1616
import numpy.ma as ma
17-
import pandas as pd
17+
import pandas.lib as lib
1818

19+
import pandas as pd
1920
from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
2021
date_range, period_range, timedelta_range)
2122
from pandas.core.index import MultiIndex
@@ -25,11 +26,8 @@
2526
from pandas.tseries.tdi import Timedelta, TimedeltaIndex
2627
import pandas.core.common as com
2728
import pandas.core.config as cf
28-
import pandas.lib as lib
29-
3029
import pandas.core.datetools as datetools
3130
import pandas.core.nanops as nanops
32-
3331
from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long
3432
from pandas import compat
3533
from pandas.util.testing import (assert_series_equal,
@@ -39,6 +37,7 @@
3937
import pandas.util.testing as tm
4038

4139

40+
4241
#------------------------------------------------------------------------------
4342
# Series test cases
4443

@@ -3432,7 +3431,6 @@ def test_ops_datetimelike_align(self):
34323431

34333432
def test_timedelta64_functions(self):
34343433

3435-
from datetime import timedelta
34363434
from pandas import date_range
34373435

34383436
# index min/max
@@ -5820,6 +5818,24 @@ def test_apply_dont_convert_dtype(self):
58205818
self.assertEqual(result.dtype, object)
58215819

58225820
def test_convert_objects(self):
5821+
# Tests: All to nans, coerce, true
5822+
# Test coercion returns correct type
5823+
s = Series(['a', 'b', 'c'])
5824+
results = s.convert_objects('coerce', False, False)
5825+
expected = Series([lib.NaT] * 3)
5826+
assert_series_equal(results, expected)
5827+
5828+
results = s.convert_objects(False, 'coerce', False)
5829+
expected = Series([np.nan] * 3)
5830+
assert_series_equal(results, expected)
5831+
5832+
results = s.convert_objects(False, False, 'coerce')
5833+
expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
5834+
assert_series_equal(results, expected)
5835+
5836+
with warnings.catch_warnings(record=True) as w:
5837+
tm.assert_produces_warning(s.convert_objects(True, True, 'coerce'),
5838+
com.RedundantSettingWarning)
58235839

58245840
s = Series([1., 2, 3], index=['a', 'b', 'c'])
58255841
result = s.convert_objects(convert_dates=False, convert_numeric=True)
@@ -5875,23 +5891,28 @@ def test_convert_objects(self):
58755891
[Timestamp(
58765892
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
58775893
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
5878-
result = s2.convert_objects(
5879-
convert_dates='coerce', convert_numeric=False)
5894+
result = s2.convert_objects(convert_dates='coerce',
5895+
convert_numeric=False,
5896+
convert_timedeltas=False)
58805897
assert_series_equal(result, expected)
5881-
result = s2.convert_objects(
5882-
convert_dates='coerce', convert_numeric=True)
5898+
result = s2.convert_objects(convert_dates='coerce',
5899+
convert_numeric=False,
5900+
convert_timedeltas=False)
58835901
assert_series_equal(result, expected)
58845902

58855903
# preserve all-nans (if convert_dates='coerce')
58865904
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
5887-
result = s.convert_objects(
5888-
convert_dates='coerce', convert_numeric=False)
5889-
assert_series_equal(result, s)
5905+
result = s.convert_objects(convert_dates='coerce',
5906+
convert_numeric=False,
5907+
convert_timedeltas=False)
5908+
expected = Series([lib.NaT]*4)
5909+
assert_series_equal(result, expected)
58905910

58915911
# preserve if non-object
58925912
s = Series([1], dtype='float32')
5893-
result = s.convert_objects(
5894-
convert_dates='coerce', convert_numeric=False)
5913+
result = s.convert_objects(convert_dates='coerce',
5914+
convert_numeric=False,
5915+
convert_timedeltas=False)
58955916
assert_series_equal(result, s)
58965917

58975918
#r = s.copy()
@@ -5900,13 +5921,14 @@ def test_convert_objects(self):
59005921
#self.assertEqual(result.dtype, 'M8[ns]')
59015922

59025923
# dateutil parses some single letters into today's value as a date
5924+
expected = Series([lib.NaT])
59035925
for x in 'abcdefghijklmnopqrstuvwxyz':
59045926
s = Series([x])
59055927
result = s.convert_objects(convert_dates='coerce')
5906-
assert_series_equal(result, s)
5928+
assert_series_equal(result, expected)
59075929
s = Series([x.upper()])
59085930
result = s.convert_objects(convert_dates='coerce')
5909-
assert_series_equal(result, s)
5931+
assert_series_equal(result, expected)
59105932

59115933
def test_convert_objects_preserve_bool(self):
59125934
s = Series([1, True, 3, 5], dtype=object)

0 commit comments

Comments
 (0)