Skip to content

BUG: not converting scalars properly to M8/m8 on assignment (GH6079) #6083

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
uniques = uniques.take(sorter)

if is_datetime:
uniques = uniques.view('M8[ns]')
uniques = uniques.astype('M8[ns]')
if isinstance(values, PeriodIndex):
uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

Expand Down Expand Up @@ -279,6 +279,7 @@ def rank(values, axis=0, method='average', na_option='keep',
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(values, axis=axis, ties_method=method,
ascending=ascending, na_option=na_option)

return ranks


Expand Down Expand Up @@ -364,12 +365,22 @@ def _interpolate(a, b, fraction):


def _get_data_algo(values, func_map):
mask = None
if com.is_float_dtype(values):
f = func_map['float64']
values = com._ensure_float64(values)
elif com.is_datetime64_dtype(values):
f = func_map['int64']
values = values.view('i8')

# if we have NaT, punt to object dtype
mask = com.isnull(values)
if mask.ravel().any():
f = func_map['generic']
values = com._ensure_object(values)
values[mask] = np.nan
else:
f = func_map['int64']
values = values.view('i8')

elif com.is_integer_dtype(values):
f = func_map['int64']
values = com._ensure_int64(values)
Expand Down
15 changes: 0 additions & 15 deletions pandas/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,3 @@

NA = np.nan

# a series-like ndarray ####


class SNDArray(Array):
    """A lightweight Series-like ndarray.

    Creates a view of the input data as an ``SNDArray`` and attaches
    ``index`` and ``name`` attributes, so the object can stand in for a
    Series in code paths that only need those two attributes plus raw
    ndarray behavior.

    NOTE(review): ``Array`` is a project-local base class (presumably an
    ndarray subclass, since ``.view()`` is used) — confirm against
    pandas/core/array.py.
    """

    def __new__(cls, data, index=None, name=None):
        # Reinterpret the input buffer as an SNDArray (ndarray.view —
        # no copy), then attach the Series-like metadata directly.
        data = data.view(SNDArray)
        data.index = index
        data.name = name

        return data

    @property
    def values(self):
        # The underlying data re-viewed as a plain Array (no copy),
        # mirroring the Series.values accessor.
        return self.view(Array)
12 changes: 8 additions & 4 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import codecs
import csv
import types
from datetime import datetime, timedelta

from numpy.lib.format import read_array, write_array
import numpy as np
Expand Down Expand Up @@ -39,7 +40,7 @@ class AmbiguousIndexError(PandasError, KeyError):
pass


_POSSIBLY_CAST_DTYPES = set([np.dtype(t)
_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name
for t in ['M8[ns]', '>M8[ns]', '<M8[ns]',
'm8[ns]', '>m8[ns]', '<m8[ns]',
'O', 'int8',
Expand Down Expand Up @@ -867,11 +868,14 @@ def _infer_dtype_from_scalar(val):

dtype = np.object_

elif isinstance(val, np.datetime64):
# ugly hacklet
elif isinstance(val, (np.datetime64, datetime)) and getattr(val,'tz',None) is None:
val = lib.Timestamp(val).value
dtype = np.dtype('M8[ns]')

elif isinstance(val, (np.timedelta64, timedelta)):
val = tslib.convert_to_timedelta(val,'ns')
dtype = np.dtype('m8[ns]')

elif is_bool(val):
dtype = np.bool_

Expand Down Expand Up @@ -1608,7 +1612,7 @@ def _possibly_convert_objects(values, convert_dates=True,


def _possibly_castable(arr):
return arr.dtype not in _POSSIBLY_CAST_DTYPES
return arr.dtype.name not in _POSSIBLY_CAST_DTYPES


def _possibly_convert_platform(values):
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4696,9 +4696,14 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
raise AssertionError('%d columns passed, passed data had %s '
'columns' % (len(columns), len(content)))

arrays = [lib.maybe_convert_objects(arr, try_float=coerce_float)
if dtype != object and dtype != np.object else arr
for arr in content]
# provide soft conversion of object dtypes
def convert(arr):
if dtype != object and dtype != np.object:
arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
arr = com._possibly_cast_to_datetime(arr, dtype)
return arr

arrays = [ convert(arr) for arr in content ]

return arrays, columns

Expand Down
1 change: 1 addition & 0 deletions pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def test_read_dta1(self):
def test_read_dta2(self):
if LooseVersion(sys.version) < '2.7':
raise nose.SkipTest('datetime interp under 2.6 is faulty')
skip_if_not_little_endian()

expected = DataFrame.from_records(
[
Expand Down
9 changes: 8 additions & 1 deletion pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,14 @@ def infer_dtype(object _values):
if n == 0:
return 'empty'

val = util.get_value_1d(values, 0)
# make contiguous
values = values.ravel()

# try to use a valid value
for i in range(n):
val = util.get_value_1d(values, i)
if not is_null_datetimelike(val):
break

if util.is_datetime64_object(val) or val is NaT:
if is_datetime64_array(values):
Expand Down
84 changes: 24 additions & 60 deletions pandas/src/reduce.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from numpy cimport *
import numpy as np

from pandas.core.array import SNDArray
from distutils.version import LooseVersion

is_numpy_prior_1_6_2 = LooseVersion(np.__version__) < '1.6.2'
Expand Down Expand Up @@ -114,8 +113,8 @@ cdef class Reducer:

# use the cached_typ if possible
if cached_typ is not None:
cached_typ._data._block.values = chunk
cached_typ.name = name
object.__setattr__(cached_typ._data._block, 'values', chunk)
object.__setattr__(cached_typ, 'name', name)
res = self.f(cached_typ)
else:
res = self.f(chunk)
Expand Down Expand Up @@ -164,7 +163,7 @@ cdef class SeriesBinGrouper:
bint passed_dummy

cdef public:
object arr, index, dummy_arr, dummy_index, values, f, bins, typ, ityp, name
object arr, index, dummy_arr, dummy_index, values, f, bins, typ, name

def __init__(self, object series, object f, object bins, object dummy):
n = len(series)
Expand All @@ -178,7 +177,6 @@ cdef class SeriesBinGrouper:
self.arr = values
self.index = series.index
self.typ = type(series)
self.ityp = type(series.index)
self.name = getattr(series,'name',None)

self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
Expand Down Expand Up @@ -210,9 +208,10 @@ cdef class SeriesBinGrouper:
ndarray[int64_t] counts
Py_ssize_t i, n, group_size
object res
bint initialized = 0, needs_typ = 1, try_typ = 0
bint initialized = 0
Slider vslider, islider
object gin, typ, ityp, name
object gin, typ, name
object cached_typ = None

counts = np.zeros(self.ngroups, dtype=np.int64)

Expand All @@ -226,45 +225,29 @@ cdef class SeriesBinGrouper:

group_size = 0
n = len(self.arr)
typ = self.typ
ityp = self.ityp
name = self.name

vslider = Slider(self.arr, self.dummy_arr)
islider = Slider(self.index, self.dummy_index)

gin = self.dummy_index._engine

# old numpy issue, need to always create and pass the Series
if is_numpy_prior_1_6_2:
try_typ = 1
needs_typ = 1

try:
for i in range(self.ngroups):
group_size = counts[i]

islider.set_length(group_size)
vslider.set_length(group_size)

# see if we need to create the object proper
if try_typ:
if needs_typ:
res = self.f(typ(vslider.buf, index=islider.buf,
name=name, fastpath=True))
else:
res = self.f(SNDArray(vslider.buf,islider.buf,name=name))
if cached_typ is None:
cached_typ = self.typ(vslider.buf, index=islider.buf,
name=name)
else:
try:
res = self.f(SNDArray(vslider.buf,islider.buf,name=name))
needs_typ = 0
except:
res = self.f(typ(vslider.buf, index=islider.buf,
name=name, fastpath=True))
needs_typ = 1

try_typ = 1
object.__setattr__(cached_typ._data._block, 'values', vslider.buf)
object.__setattr__(cached_typ, '_index', islider.buf)
object.__setattr__(cached_typ, 'name', name)

res = self.f(cached_typ)
res = _extract_result(res)
if not initialized:
result = self._get_result_array(res)
Expand Down Expand Up @@ -309,7 +292,7 @@ cdef class SeriesGrouper:
bint passed_dummy

cdef public:
object arr, index, dummy_arr, dummy_index, f, labels, values, typ, ityp, name
object arr, index, dummy_arr, dummy_index, f, labels, values, typ, name

def __init__(self, object series, object f, object labels,
Py_ssize_t ngroups, object dummy):
Expand All @@ -324,7 +307,6 @@ cdef class SeriesGrouper:
self.arr = values
self.index = series.index
self.typ = type(series)
self.ityp = type(series.index)
self.name = getattr(series,'name',None)

self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
Expand All @@ -351,28 +333,22 @@ cdef class SeriesGrouper:
ndarray[int64_t] labels, counts
Py_ssize_t i, n, group_size, lab
object res
bint initialized = 0, needs_typ = 1, try_typ = 0
bint initialized = 0
Slider vslider, islider
object gin, typ, ityp, name
object gin, typ, name
object cached_typ = None

labels = self.labels
counts = np.zeros(self.ngroups, dtype=np.int64)
group_size = 0
n = len(self.arr)
typ = self.typ
ityp = self.ityp
name = self.name

vslider = Slider(self.arr, self.dummy_arr)
islider = Slider(self.index, self.dummy_index)

gin = self.dummy_index._engine

# old numpy issue, need to always create and pass the Series
if is_numpy_prior_1_6_2:
try_typ = 1
needs_typ = 1

try:
for i in range(n):
group_size += 1
Expand All @@ -389,27 +365,15 @@ cdef class SeriesGrouper:
islider.set_length(group_size)
vslider.set_length(group_size)

# see if we need to create the object proper
# try on the first go around
if try_typ:
if needs_typ:
res = self.f(typ(vslider.buf, index=islider.buf,
name=name, fastpath=True))
else:
res = self.f(SNDArray(vslider.buf,islider.buf,name=name))
if cached_typ is None:
cached_typ = self.typ(vslider.buf, index=islider.buf,
name=name)
else:
object.__setattr__(cached_typ._data._block, 'values', vslider.buf)
object.__setattr__(cached_typ, '_index', islider.buf)
object.__setattr__(cached_typ, 'name', name)

# try with a numpy array directly
try:
res = self.f(SNDArray(vslider.buf,islider.buf,name=name))
needs_typ = 0
except (Exception), detail:
res = self.f(typ(vslider.buf, index=islider.buf,
name=name, fastpath=True))
needs_typ = 1

try_typ = 1

res = self.f(cached_typ)
res = _extract_result(res)
if not initialized:
result = self._get_result_array(res)
Expand Down
31 changes: 30 additions & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10489,13 +10489,17 @@ def test_rank2(self):
[datetime(2000, 1, 2), datetime(2000, 1, 3),
datetime(2000, 1, 1)]]
df = DataFrame(data)

# check the rank
expected = DataFrame([[2., nan, 1.],
[2., 3., 1.]])
result = df.rank(1, numeric_only=False)
assert_frame_equal(result, expected)

# mixed-type frames
self.mixed_frame['foo'] = datetime.now()
self.mixed_frame['datetime'] = datetime.now()
self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)

result = self.mixed_frame.rank(1)
expected = self.mixed_frame.rank(1, numeric_only=True)
assert_frame_equal(result, expected)
Expand Down Expand Up @@ -11087,6 +11091,31 @@ def test_constructor_with_convert(self):
None], np.object_))
assert_series_equal(result, expected)

def test_construction_with_mixed(self):
    """Construction edge cases with mixed datetime/timedelta columns.

    Checks that scalar ``datetime``/``timedelta`` assignment produces
    proper ``M8[ns]``/``m8[ns]`` columns (GH6079).
    """
    # test construction edge cases with mixed types

    # f7u12, this does not work without extensive workaround
    # (rows mixing datetimes and nan must still coerce to datetime64[ns])
    data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3),
             datetime(2000, 1, 1)]]
    df = DataFrame(data)

    # check dtypes
    # NOTE(review): `expected` is built but never compared to `result`
    # here — looks like a missing assert_series_equal(result, expected);
    # confirm intent.
    result = df.get_dtype_counts().order()
    expected = Series({ 'datetime64[ns]' : 3 })

    # mixed-type frames: scalar assignment must coerce to M8[ns]/m8[ns]
    self.mixed_frame['datetime'] = datetime.now()
    self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1)
    self.assert_(self.mixed_frame['datetime'].dtype == 'M8[ns]')
    self.assert_(self.mixed_frame['timedelta'].dtype == 'm8[ns]')
    result = self.mixed_frame.get_dtype_counts().order()
    expected = Series({ 'float64' : 4,
                        'object' : 1,
                        'datetime64[ns]' : 1,
                        'timedelta64[ns]' : 1}).order()
    assert_series_equal(result,expected)

def test_constructor_frame_copy(self):
cop = DataFrame(self.frame, copy=True)
cop['A'] = 5
Expand Down
Loading