Skip to content

Commit 417c3a9

Browse files
author
Kevin Sheppard
committed
BUG: Ensure 'coerce' actually coerces datatypes
Changes the behavior of convert_objects so that passing 'coerce' ensures data of the requested type is returned, even if all values are null (NaN or NaT). Closes #9589
1 parent e34e4be commit 417c3a9

File tree

6 files changed

+142
-70
lines changed

6 files changed

+142
-70
lines changed

pandas/core/common.py

+38-45
Original file line numberDiff line numberDiff line change
@@ -1894,54 +1894,47 @@ def _possibly_convert_objects(values, convert_dates=True,
18941894
if not hasattr(values, 'dtype'):
18951895
values = np.array([values], dtype=np.object_)
18961896

1897-
# convert dates
1898-
if convert_dates and values.dtype == np.object_:
1899-
1900-
# we take an aggressive stance and convert to datetime64[ns]
1897+
# If not object, do not attempt conversion
1898+
if not is_object_dtype(values.dtype):
1899+
return values
1900+
1901+
# If 1 flag is coerce, ensure 2 others are False
1902+
conversions = (convert_dates, convert_numeric, convert_timedeltas)
1903+
if 'coerce' in conversions:
1904+
coerce_count = sum([c == 'coerce' for c in conversions])
1905+
if coerce_count > 1:
1906+
raise ValueError("'coerce' can be used at most once.")
1907+
1908+
# Immediate return if coerce
19011909
if convert_dates == 'coerce':
1902-
new_values = _possibly_cast_to_datetime(
1903-
values, 'M8[ns]', coerce=True)
1904-
1905-
# if we are all nans then leave me alone
1906-
if not isnull(new_values).all():
1907-
values = new_values
1908-
1909-
else:
1910-
values = lib.maybe_convert_objects(
1911-
values, convert_datetime=convert_dates)
1912-
1913-
# convert timedeltas
1914-
if convert_timedeltas and values.dtype == np.object_:
1915-
1916-
if convert_timedeltas == 'coerce':
1917-
from pandas.tseries.timedeltas import to_timedelta
1918-
values = to_timedelta(values, coerce=True)
1919-
1920-
# if we are all nans then leave me alone
1921-
if not isnull(new_values).all():
1922-
values = new_values
1923-
1924-
else:
1925-
values = lib.maybe_convert_objects(
1926-
values, convert_timedelta=convert_timedeltas)
1927-
1910+
return pd.to_datetime(values, coerce=True, box=False)
1911+
elif convert_timedeltas == 'coerce':
1912+
return pd.to_timedelta(values, coerce=True, box=False)
1913+
elif convert_numeric == 'coerce':
1914+
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
1915+
1916+
# Soft conversions
1917+
if convert_dates:
1918+
values = lib.maybe_convert_objects(values,
1919+
convert_datetime=convert_dates)
1920+
1921+
if convert_timedeltas and is_object_dtype(values.dtype):
1922+
# Object check to ensure only run if previous did not completely
1923+
# convert
1924+
values = lib.maybe_convert_objects(values,
1925+
convert_timedelta=convert_timedeltas)
19281926
# convert to numeric
1929-
if values.dtype == np.object_:
1930-
if convert_numeric:
1931-
try:
1932-
new_values = lib.maybe_convert_numeric(
1933-
values, set(), coerce_numeric=True)
1934-
1935-
# if we are all nans then leave me alone
1936-
if not isnull(new_values).all():
1937-
values = new_values
1938-
1939-
except:
1940-
pass
1941-
else:
1927+
if convert_numeric and is_object_dtype(values.dtype):
1928+
# Only if previous failed
1929+
try:
1930+
converted = lib.maybe_convert_numeric(values,
1931+
set(),
1932+
coerce_numeric=True)
1933+
# If all NaNs, then do not alter
1934+
values = converted if not isnull(converted).all() else values
19421935

1943-
# soft-conversion
1944-
values = lib.maybe_convert_objects(values)
1936+
except:
1937+
pass
19451938

19461939
return values
19471940

pandas/core/groupby.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -2939,12 +2939,17 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
29392939

29402940
# if we have date/time like in the original, then coerce dates
29412941
# as we are stacking can easily have object dtypes here
2942-
if (self._selected_obj.ndim == 2
2943-
and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
2944-
cd = 'coerce'
2942+
if (self._selected_obj.ndim == 2 and
2943+
self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
2944+
result = result.convert_objects(convert_dates=False,
2945+
convert_numeric=True)
2946+
date_cols = self._selected_obj.select_dtypes(
2947+
include=list(_DATELIKE_DTYPES)).columns
2948+
result[date_cols] = result[date_cols].convert_objects(
2949+
convert_dates='coerce')
29452950
else:
2946-
cd = True
2947-
result = result.convert_objects(convert_dates=cd)
2951+
result = result.convert_objects(convert_dates=True)
2952+
29482953
return self._reindex_output(result)
29492954

29502955
else:

pandas/core/internals.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
14841484
else:
14851485

14861486
values = com._possibly_convert_objects(
1487-
self.values.ravel(), convert_dates=convert_dates,
1488-
convert_numeric=convert_numeric
1487+
self.values.ravel(),
1488+
convert_dates=convert_dates,
1489+
convert_numeric=convert_numeric,
1490+
convert_timedeltas=convert_timedeltas
14891491
).reshape(self.values.shape)
14901492
blocks.append(make_block(values,
14911493
ndim=self.ndim, placement=self.mgr_locs))

pandas/io/tests/test_html.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -541,8 +541,9 @@ def try_remove_ws(x):
541541
dfnew = df.applymap(try_remove_ws).replace(old, new)
542542
gtnew = ground_truth.applymap(try_remove_ws)
543543
converted = dfnew.convert_objects(convert_numeric=True)
544-
tm.assert_frame_equal(converted.convert_objects(convert_dates='coerce'),
545-
gtnew)
544+
date_cols = ['Closing Date','Updated Date']
545+
converted[date_cols] = converted[date_cols].convert_objects(convert_dates='coerce')
546+
tm.assert_frame_equal(converted,gtnew)
546547

547548
@slow
548549
def test_gold_canyon(self):

pandas/tests/test_groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -599,7 +599,7 @@ def f(grp):
599599
return grp.iloc[0]
600600
result = df.groupby('A').apply(f)[['C']]
601601
e = df.groupby('A').first()[['C']]
602-
e.loc['Pony'] = np.nan
602+
e.loc['Pony'] = pd.NaT
603603
assert_frame_equal(result,e)
604604

605605
# scalar outputs

pandas/tests/test_series.py

+86-15
Original file line numberDiff line numberDiff line change
@@ -5830,6 +5830,72 @@ def test_apply_dont_convert_dtype(self):
58305830
self.assertEqual(result.dtype, object)
58315831

58325832
def test_convert_objects(self):
5833+
# Tests: All to nans, coerce, true
5834+
# Test coercion returns correct type
5835+
s = Series(['a', 'b', 'c'])
5836+
results = s.convert_objects('coerce', False, False)
5837+
expected = Series([lib.NaT] * 3)
5838+
assert_series_equal(results, expected)
5839+
5840+
results = s.convert_objects(False, 'coerce', False)
5841+
expected = Series([np.nan] * 3)
5842+
assert_series_equal(results, expected)
5843+
5844+
expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
5845+
results = s.convert_objects(False, False, 'coerce')
5846+
assert_series_equal(results, expected)
5847+
5848+
dt = datetime(2001, 1, 1, 0, 0)
5849+
td = dt - datetime(2000, 1, 1, 0, 0)
5850+
5851+
# Test coercion with mixed types
5852+
s = Series(['a', '3.1415', dt, td])
5853+
results = s.convert_objects('coerce',False,False)
5854+
expected = Series([lib.NaT, lib.NaT, dt, lib.NaT])
5855+
assert_series_equal(results, expected)
5856+
5857+
results = s.convert_objects(False, 'coerce',False)
5858+
expected = Series([nan, 3.1415, nan, nan])
5859+
assert_series_equal(results, expected)
5860+
5861+
results = s.convert_objects(False, False, 'coerce')
5862+
expected = Series([lib.NaT, lib.NaT, lib.NaT, td],
5863+
dtype=np.dtype('m8[ns]'))
5864+
assert_series_equal(results, expected)
5865+
5866+
# Test standard conversion returns original
5867+
results = s.convert_objects(True, False, False)
5868+
assert_series_equal(results, s)
5869+
results = s.convert_objects(False, True, False)
5870+
expected = Series([nan, 3.1415, nan, nan])
5871+
assert_series_equal(results, expected)
5872+
results = s.convert_objects(False, False, True)
5873+
assert_series_equal(results, s)
5874+
5875+
# test pass-through and non-conversion when other types selected
5876+
s = Series(['1.0','2.0','3.0'])
5877+
results = s.convert_objects(True,True,True)
5878+
expected = Series([1.0,2.0,3.0])
5879+
assert_series_equal(results, expected)
5880+
results = s.convert_objects(True,False,True)
5881+
assert_series_equal(results, s)
5882+
5883+
s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)],
5884+
dtype='O')
5885+
results = s.convert_objects(True,True,True)
5886+
expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)])
5887+
assert_series_equal(results, expected)
5888+
results = s.convert_objects(False,True,True)
5889+
assert_series_equal(results, s)
5890+
5891+
td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
5892+
s = Series([td, td], dtype='O')
5893+
results = s.convert_objects(True,True,True)
5894+
expected = Series([td, td])
5895+
assert_series_equal(results, expected)
5896+
results = s.convert_objects(True,True,False)
5897+
assert_series_equal(results, s)
5898+
58335899

58345900
s = Series([1., 2, 3], index=['a', 'b', 'c'])
58355901
result = s.convert_objects(convert_dates=False, convert_numeric=True)
@@ -5848,20 +5914,19 @@ def test_convert_objects(self):
58485914

58495915
r = s.copy().astype('O')
58505916
r['a'] = 'garbled'
5851-
expected = s.copy()
5852-
expected['a'] = np.nan
58535917
result = r.convert_objects(convert_dates=False, convert_numeric=True)
5918+
expected = s.copy()
5919+
expected['a'] = nan
58545920
assert_series_equal(result, expected)
58555921

58565922
# GH 4119, not converting a mixed type (e.g.floats and object)
58575923
s = Series([1, 'na', 3, 4])
58585924
result = s.convert_objects(convert_numeric=True)
5859-
expected = Series([1, np.nan, 3, 4])
5925+
expected = Series([1, nan, 3, 4])
58605926
assert_series_equal(result, expected)
58615927

58625928
s = Series([1, '', 3, 4])
58635929
result = s.convert_objects(convert_numeric=True)
5864-
expected = Series([1, np.nan, 3, 4])
58655930
assert_series_equal(result, expected)
58665931

58675932
# dates
@@ -5885,23 +5950,28 @@ def test_convert_objects(self):
58855950
[Timestamp(
58865951
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
58875952
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
5888-
result = s2.convert_objects(
5889-
convert_dates='coerce', convert_numeric=False)
5953+
result = s2.convert_objects(convert_dates='coerce',
5954+
convert_numeric=False,
5955+
convert_timedeltas=False)
58905956
assert_series_equal(result, expected)
5891-
result = s2.convert_objects(
5892-
convert_dates='coerce', convert_numeric=True)
5957+
result = s2.convert_objects(convert_dates='coerce',
5958+
convert_numeric=False,
5959+
convert_timedeltas=False)
58935960
assert_series_equal(result, expected)
58945961

58955962
# preserve all-nans (if convert_dates='coerce')
58965963
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
5897-
result = s.convert_objects(
5898-
convert_dates='coerce', convert_numeric=False)
5899-
assert_series_equal(result, s)
5964+
result = s.convert_objects(convert_dates='coerce',
5965+
convert_numeric=False,
5966+
convert_timedeltas=False)
5967+
expected = Series([lib.NaT]*4)
5968+
assert_series_equal(result, expected)
59005969

59015970
# preserve if non-object
59025971
s = Series([1], dtype='float32')
5903-
result = s.convert_objects(
5904-
convert_dates='coerce', convert_numeric=False)
5972+
result = s.convert_objects(convert_dates='coerce',
5973+
convert_numeric=False,
5974+
convert_timedeltas=False)
59055975
assert_series_equal(result, s)
59065976

59075977
#r = s.copy()
@@ -5910,13 +5980,14 @@ def test_convert_objects(self):
59105980
#self.assertEqual(result.dtype, 'M8[ns]')
59115981

59125982
# dateutil parses some single letters into today's value as a date
5983+
expected = Series([lib.NaT])
59135984
for x in 'abcdefghijklmnopqrstuvwxyz':
59145985
s = Series([x])
59155986
result = s.convert_objects(convert_dates='coerce')
5916-
assert_series_equal(result, s)
5987+
assert_series_equal(result, expected)
59175988
s = Series([x.upper()])
59185989
result = s.convert_objects(convert_dates='coerce')
5919-
assert_series_equal(result, s)
5990+
assert_series_equal(result, expected)
59205991

59215992
def test_convert_objects_preserve_bool(self):
59225993
s = Series([1, True, 3, 5], dtype=object)

0 commit comments

Comments
 (0)