
BUG: don't always coerce reductions in a groupby to datetimes #5790


Merged: 4 commits, Dec 29, 2013
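What this fixes, in a condensed form of the test this PR adds (test_apply_issues; the frame below mirrors a subset of the test's data): a groupby/apply reduction whose per-group result is a plain time string should come back as strings, not get auto-coerced to datetimes. The full test also covers the GH 5788 idxmax round-trip case.

    import pandas as pd

    df = pd.DataFrame({'date': ['2011.05.16', '2011.05.16', '2011.05.17', '2011.05.18'],
                       'time': ['00:00', '01:00', '02:00', '02:00'],
                       'value': [1.40893, 1.40760, 1.40893, 1.40893]})

    # GH 5789: the per-group result is a time *string*; after this fix it is
    # returned untouched instead of being coerced to a datetime
    result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
    print(result)
    # 2011.05.16    00:00
    # 2011.05.17    02:00
    # 2011.05.18    02:00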
23 changes: 14 additions & 9 deletions pandas/core/common.py
@@ -1527,17 +1527,22 @@ def _possibly_convert_objects(values, convert_dates=True,
                 values, convert_datetime=convert_dates)
 
     # convert to numeric
-    if convert_numeric and values.dtype == np.object_:
-        try:
-            new_values = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=True)
+    if values.dtype == np.object_:
+        if convert_numeric:
+            try:
+                new_values = lib.maybe_convert_numeric(
+                    values, set(), coerce_numeric=True)
 
-            # if we are all nans then leave me alone
-            if not isnull(new_values).all():
-                values = new_values
+                # if we are all nans then leave me alone
+                if not isnull(new_values).all():
+                    values = new_values
 
-        except:
-            pass
+            except:
+                pass
+        else:
+
+            # soft-conversion
+            values = lib.maybe_convert_objects(values)
 
     return values

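The practical difference between the two convert_dates modes that the groupby change below passes into convert_objects (True soft-converts, 'coerce' forces date parsing) is easiest to see on plain time-of-day strings. A small illustration of what forced parsing amounts to, using pd.to_datetime rather than the internal path:

    import pandas as pd

    times = pd.Series(['00:00', '02:00', '02:00'], dtype=object)

    # forced parsing ('coerce'-style): time-of-day strings become full
    # timestamps anchored to an arbitrary date -- the mangling GH 5789 reports
    print(pd.to_datetime(times, errors='coerce'))

    # soft conversion (convert_dates=True) only picks up values that are
    # already datetime-like, so an all-string column like this is left alone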
35 changes: 19 additions & 16 deletions pandas/core/groupby.py
@@ -22,6 +22,7 @@
                         notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                         is_timedelta64_dtype, is_datetime64_dtype)
 
+from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.algos as _algos
@@ -2243,16 +2244,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 try:
                     if self.axis == 0:
 
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values])
-                        columns = v.index
-                        index = key_index
+                        # normally use vstack as its faster than concat
+                        # and if we have mi-columns
+                        if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                            stacked_values = np.vstack([np.asarray(x) for x in values])
+                            result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                        else:
+                            # GH5788 instead of stacking; concat gets the dtypes correct
+                            from pandas.tools.merge import concat
+                            result = concat(values,keys=key_index,names=key_index.names,
+                                            axis=self.axis).unstack()
                     else:
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values]).T
-
-                        index = v.index
-                        columns = key_index
+                        stacked_values = np.vstack([np.asarray(x) for x in values])
+                        result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
 
                 except (ValueError, AttributeError):
                     # GH1738: values is list of arrays of unequal lengths fall
@@ -2261,15 +2265,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
-                cd = True
-                if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any():
-                    cd = 'coerce'
-                return DataFrame(stacked_values, index=index,
-                                 columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
+                cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
+                return result.convert_objects(convert_dates=cd)
 
             else:
-                return Series(values, index=key_index).convert_objects(
-                    convert_dates='coerce',convert_numeric=True)
+                # only coerce dates if we find at least 1 datetime
+                cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False
+                return Series(values, index=key_index).convert_objects(convert_dates=cd)
 
         else:
             # Handle cases like BinGrouper
             return self._concat_objects(keys, values,
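The GH5788 comment in the hunk above is the crux of the frame path: stacking the per-group results through np.asarray/np.vstack squeezes everything into one homogeneous block, so mixed dtypes degrade to object and have to be repaired afterwards by convert_objects, which is where the unwanted datetime coercion crept in. A small illustration of the vstack half of that story (illustrative data only):

    import numpy as np
    import pandas as pd

    # two per-group results whose values mix a datetime and a float
    v1 = pd.Series([pd.Timestamp('2011-05-16 00:00'), 1.40893], index=['date_time', 'value'])
    v2 = pd.Series([pd.Timestamp('2011-05-17 02:00'), 1.40760], index=['date_time', 'value'])

    # vstack flattens the rows into a single homogeneous ndarray
    stacked = np.vstack([np.asarray(x) for x in (v1, v2)])
    print(stacked.dtype)        # object -- the per-column dtypes are gone

    # a frame built from that block starts life all-object and needs a
    # convert_objects pass to get its dtypes back
    frame = pd.DataFrame(stacked, columns=v1.index)
    print(frame.dtypes)         # date_time: object, value: object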
20 changes: 11 additions & 9 deletions pandas/core/internals.py
@@ -3556,12 +3556,14 @@ def _consolidate_inplace(self):
         pass
 
 
-def construction_error(tot_items, block_shape, axes):
+def construction_error(tot_items, block_shape, axes, e=None):
     """ raise a helpful message about our construction """
-    raise ValueError("Shape of passed values is %s, indices imply %s" % (
-        tuple(map(int, [tot_items] + list(block_shape))),
-        tuple(map(int, [len(ax) for ax in axes]))))
+    passed = tuple(map(int, [tot_items] + list(block_shape)))
+    implied = tuple(map(int, [len(ax) for ax in axes]))
+    if passed == implied and e is not None:
+        raise e
+    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+        passed,implied))
 
-
 def create_block_manager_from_blocks(blocks, axes):
     try:
@@ -3576,10 +3578,10 @@ def create_block_manager_from_blocks(blocks, axes):
         mgr._consolidate_inplace()
         return mgr
 
-    except (ValueError):
+    except (ValueError) as e:
         blocks = [getattr(b, 'values', b) for b in blocks]
         tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes)
+        construction_error(tot_items, blocks[0].shape[1:], axes, e)
 
 
 def create_block_manager_from_arrays(arrays, names, axes):
@@ -3588,8 +3590,8 @@ def create_block_manager_from_arrays(arrays, names, axes):
         mgr = BlockManager(blocks, axes)
         mgr._consolidate_inplace()
         return mgr
-    except (ValueError):
-        construction_error(len(arrays), arrays[0].shape[1:], axes)
+    except (ValueError) as e:
+        construction_error(len(arrays), arrays[0].shape[1:], axes, e)
 
 
 def maybe_create_block_in_items_map(im, block):
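The intent of threading the original exception through construction_error (a standalone sketch mirroring the change above, not the pandas internals themselves): when the passed and implied shapes actually agree, the generic shape message would be misleading, so the underlying error is re-raised instead.

    # standalone sketch of the re-raise logic added above
    def construction_error(tot_items, block_shape, axes, e=None):
        passed = tuple(map(int, [tot_items] + list(block_shape)))
        implied = tuple(map(int, [len(ax) for ax in axes]))
        if passed == implied and e is not None:
            raise e                  # shapes agree: surface the real failure
        raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
            passed, implied))

    # shapes disagree: the helpful shape message is raised
    try:
        construction_error(2, (3,), [range(2), range(4)])
    except ValueError as err:
        print(err)                   # Shape of passed values is (2, 3), indices imply (2, 4)

    # shapes agree but construction still failed: the original error comes back
    try:
        construction_error(2, (3,), [range(2), range(3)], e=TypeError("original failure"))
    except TypeError as err:
        print(err)                   # original failure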
32 changes: 31 additions & 1 deletion pandas/tests/test_groupby.py
@@ -28,7 +28,7 @@
 import pandas.core.nanops as nanops
 
 import pandas.util.testing as tm
-
+import pandas as pd
 
 def commonSetUp(self):
     self.dateRange = bdate_range('1/1/2005', periods=250)
@@ -481,6 +481,36 @@ def test_apply_describe_bug(self):
         grouped = self.mframe.groupby(level='first')
         result = grouped.describe() # it works!
 
+    def test_apply_issues(self):
+        # GH 5788
+
+        s="""2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
+        df = df.set_index('date_time')
+
+        expected = df.groupby(df.index.date).idxmax()
+        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+        assert_frame_equal(result,expected)
+
+        # GH 5789
+        # don't auto coerce dates
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
+        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
+        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
+        assert_series_equal(result,expected)
+
     def test_len(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,