
Commit 14bc445

Merge pull request #5790 from jreback/apply_bugs
BUG: don't always coerce reductions in a groupby to datetimes
2 parents: b6ec4e2 + e375550
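
For context, a minimal sketch of the bug being fixed (the data and expected
values come from the new test added below; everything else here is
illustrative, not part of the diff):

    # Before this commit, a groupby/apply reduction was post-processed with
    # convert_objects(convert_dates='coerce', convert_numeric=True), so string
    # results could be mangled into datetimes/NaT.
    import pandas as pd

    df = pd.DataFrame({'date': ['2011.05.16', '2011.05.16',
                                '2011.05.17', '2011.05.17'],
                       'time': ['00:00', '01:00', '02:00', '03:00'],
                       'value': [1.40893, 1.40760, 1.40750, 1.40649]})

    # pick the 'time' string at each group's max 'value'; after this fix the
    # strings survive because no Timestamp appears in the applied results
    result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
    print(result)  # '00:00' for 2011.05.16, '02:00' for 2011.05.17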

File tree: 4 files changed, +75 -35 lines

pandas/core/common.py (+14 -9)

@@ -1527,17 +1527,22 @@ def _possibly_convert_objects(values, convert_dates=True,
                 values, convert_datetime=convert_dates)
 
     # convert to numeric
-    if convert_numeric and values.dtype == np.object_:
-        try:
-            new_values = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=True)
+    if values.dtype == np.object_:
+        if convert_numeric:
+            try:
+                new_values = lib.maybe_convert_numeric(
+                    values, set(), coerce_numeric=True)
 
-            # if we are all nans then leave me alone
-            if not isnull(new_values).all():
-                values = new_values
+                # if we are all nans then leave me alone
+                if not isnull(new_values).all():
+                    values = new_values
 
-        except:
-            pass
+            except:
+                pass
+        else:
+
+            # soft-conversion
+            values = lib.maybe_convert_objects(values)
 
     return values
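
A rough sketch of what the new soft-conversion branch changes (assuming
Series.convert_objects routes object blocks through _possibly_convert_objects,
as it did at the time):

    import pandas as pd

    s = pd.Series([1, 2, 3], dtype=object)

    # convert_numeric=True still hard-coerces via maybe_convert_numeric;
    # convert_numeric=False now soft-converts instead of doing nothing, so
    # clean object data is upgraded while mixed data like ['1', 'a'] is left
    # as object rather than coerced to NaN.
    print(s.convert_objects(convert_numeric=True).dtype)   # int64
    print(s.convert_objects(convert_numeric=False).dtype)  # int64 after this patch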

pandas/core/groupby.py (+19 -16)

@@ -22,6 +22,7 @@
                                 notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                 is_timedelta64_dtype, is_datetime64_dtype)
 
+from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.algos as _algos

@@ -2243,16 +2244,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
             try:
                 if self.axis == 0:
 
-                    stacked_values = np.vstack([np.asarray(x)
-                                                for x in values])
-                    columns = v.index
-                    index = key_index
+                    # normally use vstack as its faster than concat
+                    # and if we have mi-columns
+                    if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                        stacked_values = np.vstack([np.asarray(x) for x in values])
+                        result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                    else:
+                        # GH5788 instead of stacking; concat gets the dtypes correct
+                        from pandas.tools.merge import concat
+                        result = concat(values,keys=key_index,names=key_index.names,
+                                        axis=self.axis).unstack()
                 else:
-                    stacked_values = np.vstack([np.asarray(x)
-                                                for x in values]).T
-
-                    index = v.index
-                    columns = key_index
+                    stacked_values = np.vstack([np.asarray(x) for x in values])
+                    result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
 
             except (ValueError, AttributeError):
                 # GH1738: values is list of arrays of unequal lengths fall

@@ -2261,15 +2265,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
-                cd = True
-                if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any():
-                    cd = 'coerce'
-                return DataFrame(stacked_values, index=index,
-                                 columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
+                cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
+                return result.convert_objects(convert_dates=cd)
 
             else:
-                return Series(values, index=key_index).convert_objects(
-                    convert_dates='coerce',convert_numeric=True)
+                # only coerce dates if we find at least 1 datetime
+                cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False
+                return Series(values, index=key_index).convert_objects(convert_dates=cd)
+
         else:
            # Handle cases like BinGrouper
            return self._concat_objects(keys, values,
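
The Series branch above now decides coercion per result: dates are coerced
only when at least one applied value is an actual Timestamp. A standalone
sketch of that rule (a re-creation for illustration, not the pandas internals
themselves; the function name is made up, and pd.Timestamp stands in for the
diff's pandas.lib import):

    import pandas as pd

    def wrap_scalar_results(values, index):
        # mirror of the check above: coerce to datetimes only if a real
        # Timestamp shows up among the applied results
        cd = 'coerce' if any(isinstance(v, pd.Timestamp) for v in values) else False
        return pd.Series(values, index=index).convert_objects(convert_dates=cd)

    print(wrap_scalar_results(['00:00', '02:00'], ['a', 'b']).dtype)  # object
    print(wrap_scalar_results([pd.Timestamp('2011-05-16'),
                               pd.Timestamp('2011-05-17')], ['a', 'b']).dtype)
    # datetime64[ns]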

pandas/core/internals.py (+11 -9)

@@ -3556,12 +3556,14 @@ def _consolidate_inplace(self):
         pass
 
 
-def construction_error(tot_items, block_shape, axes):
+def construction_error(tot_items, block_shape, axes, e=None):
     """ raise a helpful message about our construction """
-    raise ValueError("Shape of passed values is %s, indices imply %s" % (
-        tuple(map(int, [tot_items] + list(block_shape))),
-        tuple(map(int, [len(ax) for ax in axes]))))
-
+    passed = tuple(map(int, [tot_items] + list(block_shape)))
+    implied = tuple(map(int, [len(ax) for ax in axes]))
+    if passed == implied and e is not None:
+        raise e
+    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+        passed,implied))
 
 def create_block_manager_from_blocks(blocks, axes):
     try:

@@ -3576,10 +3578,10 @@ def create_block_manager_from_blocks(blocks, axes):
         mgr._consolidate_inplace()
         return mgr
 
-    except (ValueError):
+    except (ValueError) as e:
         blocks = [getattr(b, 'values', b) for b in blocks]
         tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes)
+        construction_error(tot_items, blocks[0].shape[1:], axes, e)
 
 
 def create_block_manager_from_arrays(arrays, names, axes):

@@ -3588,8 +3590,8 @@ def create_block_manager_from_arrays(arrays, names, axes):
         mgr = BlockManager(blocks, axes)
         mgr._consolidate_inplace()
         return mgr
-    except (ValueError):
-        construction_error(len(arrays), arrays[0].shape[1:], axes)
+    except (ValueError) as e:
+        construction_error(len(arrays), arrays[0].shape[1:], axes, e)
 
 
 def maybe_create_block_in_items_map(im, block):
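
A quick demonstration of the construction_error change in isolation (a
standalone re-creation for illustration, not an import of the pandas
function): when the passed and implied shapes actually agree, the original
exception is re-raised instead of a misleading shape message.

    def construction_error(tot_items, block_shape, axes, e=None):
        passed = tuple(map(int, [tot_items] + list(block_shape)))
        implied = tuple(map(int, [len(ax) for ax in axes]))
        if passed == implied and e is not None:
            raise e  # shapes agree, so the ValueError was about something else
        raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
            passed, implied))

    try:
        construction_error(2, (3,), [range(3), range(2)])
    except ValueError as err:
        print(err)  # Shape of passed values is (2, 3), indices imply (3, 2)

    try:
        construction_error(2, (3,), [range(2), range(3)], ValueError("bad block"))
    except ValueError as err:
        print(err)  # bad block: the real error is no longer masked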

pandas/tests/test_groupby.py (+31 -1)

@@ -28,7 +28,7 @@
 import pandas.core.nanops as nanops
 
 import pandas.util.testing as tm
-
+import pandas as pd
 
 def commonSetUp(self):
     self.dateRange = bdate_range('1/1/2005', periods=250)

@@ -481,6 +481,36 @@ def test_apply_describe_bug(self):
         grouped = self.mframe.groupby(level='first')
         result = grouped.describe()  # it works!
 
+    def test_apply_issues(self):
+        # GH 5788
+
+        s="""2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
+        df = df.set_index('date_time')
+
+        expected = df.groupby(df.index.date).idxmax()
+        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+        assert_frame_equal(result,expected)
+
+        # GH 5789
+        # don't auto coerce dates
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
+        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
+        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
+        assert_series_equal(result,expected)
+
     def test_len(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
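
To run just the new test from a source checkout (assuming nose, which the
suite used at the time, and that the test lands on the existing TestGroupBy
class in this file):

    nosetests pandas.tests.test_groupby:TestGroupBy.test_apply_issues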
