Skip to content

Commit 4392ae3

Browse files
committed
BUG: rework object inference with NaN as the first element in an array
BUG: convert datetime-like better from list-of-lists; BUG: make rank of M8 work via object algos
1 parent 71d4f32 commit 4392ae3

File tree

6 files changed

+70
-28
lines changed

6 files changed

+70
-28
lines changed

pandas/core/algorithms.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
154154
uniques = uniques.take(sorter)
155155

156156
if is_datetime:
157-
uniques = uniques.view('M8[ns]')
157+
uniques = uniques.astype('M8[ns]')
158158
if isinstance(values, PeriodIndex):
159159
uniques = PeriodIndex(ordinal=uniques, freq=values.freq)
160160

@@ -279,6 +279,7 @@ def rank(values, axis=0, method='average', na_option='keep',
279279
f, values = _get_data_algo(values, _rank2d_functions)
280280
ranks = f(values, axis=axis, ties_method=method,
281281
ascending=ascending, na_option=na_option)
282+
282283
return ranks
283284

284285

@@ -364,12 +365,22 @@ def _interpolate(a, b, fraction):
364365

365366

366367
def _get_data_algo(values, func_map):
368+
mask = None
367369
if com.is_float_dtype(values):
368370
f = func_map['float64']
369371
values = com._ensure_float64(values)
370372
elif com.is_datetime64_dtype(values):
371-
f = func_map['int64']
372-
values = values.view('i8')
373+
374+
# if we have NaT, punt to object dtype
375+
mask = com.isnull(values)
376+
if mask.ravel().any():
377+
f = func_map['generic']
378+
values = com._ensure_object(values)
379+
values[mask] = np.nan
380+
else:
381+
f = func_map['int64']
382+
values = values.view('i8')
383+
373384
elif com.is_integer_dtype(values):
374385
f = func_map['int64']
375386
values = com._ensure_int64(values)

pandas/core/common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class AmbiguousIndexError(PandasError, KeyError):
4040
pass
4141

4242

43-
_POSSIBLY_CAST_DTYPES = set([np.dtype(t)
43+
_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name
4444
for t in ['M8[ns]', '>M8[ns]', '<M8[ns]',
4545
'm8[ns]', '>m8[ns]', '<m8[ns]',
4646
'O', 'int8',
@@ -1612,7 +1612,7 @@ def _possibly_convert_objects(values, convert_dates=True,
16121612

16131613

16141614
def _possibly_castable(arr):
1615-
return arr.dtype not in _POSSIBLY_CAST_DTYPES
1615+
return arr.dtype.name not in _POSSIBLY_CAST_DTYPES
16161616

16171617

16181618
def _possibly_convert_platform(values):

pandas/core/frame.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -4696,9 +4696,14 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
46964696
raise AssertionError('%d columns passed, passed data had %s '
46974697
'columns' % (len(columns), len(content)))
46984698

4699-
arrays = [lib.maybe_convert_objects(arr, try_float=coerce_float)
4700-
if dtype != object and dtype != np.object else arr
4701-
for arr in content]
4699+
# provide soft conversion of object dtypes
4700+
def convert(arr):
4701+
if dtype != object and dtype != np.object:
4702+
arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
4703+
arr = com._possibly_cast_to_datetime(arr, dtype)
4704+
return arr
4705+
4706+
arrays = [ convert(arr) for arr in content ]
47024707

47034708
return arrays, columns
47044709

pandas/src/inference.pyx

+8-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,14 @@ def infer_dtype(object _values):
5656
if n == 0:
5757
return 'empty'
5858

59-
val = util.get_value_1d(values, 0)
59+
# make contiguous
60+
values = values.ravel()
61+
62+
# try to use a valid value
63+
for i in range(n):
64+
val = util.get_value_1d(values, i)
65+
if not is_null_datetimelike(val):
66+
break
6067

6168
if util.is_datetime64_object(val) or val is NaT:
6269
if is_datetime64_array(values):

pandas/tests/test_frame.py

+27-8
Original file line numberDiff line numberDiff line change
@@ -10489,6 +10489,8 @@ def test_rank2(self):
1048910489
[datetime(2000, 1, 2), datetime(2000, 1, 3),
1049010490
datetime(2000, 1, 1)]]
1049110491
df = DataFrame(data)
10492+
10493+
# check the rank
1049210494
expected = DataFrame([[2., nan, 1.],
1049310495
[2., 3., 1.]])
1049410496
result = df.rank(1, numeric_only=False)
@@ -10497,14 +10499,6 @@ def test_rank2(self):
1049710499
# mixed-type frames
1049810500
self.mixed_frame['datetime'] = datetime.now()
1049910501
self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)
10500-
self.assert_(self.mixed_frame['datetime'].dtype == 'M8[ns]')
10501-
self.assert_(self.mixed_frame['timedelta'].dtype == 'm8[ns]')
10502-
result = self.mixed_frame.get_dtype_counts().order()
10503-
expected = Series({ 'float64' : 4,
10504-
'object' : 1,
10505-
'datetime64[ns]' : 1,
10506-
'timedelta64[ns]' : 1}).order()
10507-
assert_series_equal(result,expected)
1050810502

1050910503
result = self.mixed_frame.rank(1)
1051010504
expected = self.mixed_frame.rank(1, numeric_only=True)
@@ -11097,6 +11091,31 @@ def test_constructor_with_convert(self):
1109711091
None], np.object_))
1109811092
assert_series_equal(result, expected)
1109911093

11094+
def test_construction_with_mixed(self):
11095+
# test construction edge cases with mixed types
11096+
11097+
# f7u12, this does not work without extensive workaround
11098+
data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
11099+
[datetime(2000, 1, 2), datetime(2000, 1, 3),
11100+
datetime(2000, 1, 1)]]
11101+
df = DataFrame(data)
11102+
11103+
# check dtypes
11104+
result = df.get_dtype_counts().order()
11105+
expected = Series({ 'datetime64[ns]' : 3 })
11106+
11107+
# mixed-type frames
11108+
self.mixed_frame['datetime'] = datetime.now()
11109+
self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)
11110+
self.assert_(self.mixed_frame['datetime'].dtype == 'M8[ns]')
11111+
self.assert_(self.mixed_frame['timedelta'].dtype == 'm8[ns]')
11112+
result = self.mixed_frame.get_dtype_counts().order()
11113+
expected = Series({ 'float64' : 4,
11114+
'object' : 1,
11115+
'datetime64[ns]' : 1,
11116+
'timedelta64[ns]' : 1}).order()
11117+
assert_series_equal(result,expected)
11118+
1110011119
def test_constructor_frame_copy(self):
1110111120
cop = DataFrame(self.frame, copy=True)
1110211121
cop['A'] = 5

pandas/tests/test_index.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex,
1616
InvalidIndexError)
17+
from pandas.tseries.index import DatetimeIndex
1718
from pandas.core.frame import DataFrame
1819
from pandas.core.series import Series
1920
from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp,
@@ -32,6 +33,9 @@
3233

3334
from pandas import _np_version_under1p7
3435

36+
def _skip_if_need_numpy_1_7():
37+
if _np_version_under1p7:
38+
raise nose.SkipTest('numpy >= 1.7 required')
3539

3640
class TestIndex(tm.TestCase):
3741
_multiprocess_can_split_ = True
@@ -236,12 +240,7 @@ def test_asof(self):
236240
tm.assert_isinstance(self.dateIndex.asof(d), Timestamp)
237241

238242
def test_nanosecond_index_access(self):
239-
if _np_version_under1p7:
240-
import nose
241-
242-
raise nose.SkipTest('numpy >= 1.7 required')
243-
244-
from pandas import Series, Timestamp, DatetimeIndex
243+
_skip_if_need_numpy_1_7()
245244

246245
s = Series([Timestamp('20130101')]).values.view('i8')[0]
247246
r = DatetimeIndex([s + 50 + i for i in range(100)])
@@ -1607,11 +1606,12 @@ def test_get_level_values_na(self):
16071606
expected = ['a', np.nan, 1]
16081607
assert_array_equal(values.values, expected)
16091608

1610-
arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
1611-
index = pd.MultiIndex.from_arrays(arrays)
1612-
values = index.get_level_values(1)
1613-
expected = pd.DatetimeIndex([0, 1, pd.NaT])
1614-
assert_array_equal(values.values, expected.values)
1609+
if not _np_version_under1p7:
1610+
arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
1611+
index = pd.MultiIndex.from_arrays(arrays)
1612+
values = index.get_level_values(1)
1613+
expected = pd.DatetimeIndex([0, 1, pd.NaT])
1614+
assert_array_equal(values.values, expected.values)
16151615

16161616
arrays = [[], []]
16171617
index = pd.MultiIndex.from_arrays(arrays)

0 commit comments

Comments
 (0)