Skip to content

Commit 009d1df

Browse files
chris-b1 and jreback
authored and committed
PERF: DataFrame transform
closes #12737 closes #13191 Author: Chris <[email protected]> Closes #13192 from chris-b1/transform-perf and squashes the following commits: 0af1e55 [Chris] revert casting logic d61d4e0 [Chris] handle duplicate column case 9d78f65 [Chris] other categorical test name fix 045d0c7 [Chris] add back some casting b66a1c8 [Chris] PERF: DataFrame transform
1 parent 2429ec5 commit 009d1df

File tree

5 files changed

+82
-31
lines changed

5 files changed

+82
-31
lines changed

asv_bench/benchmarks/groupby.py

+15
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,21 @@ def setup(self):
773773
def time_groupby_transform_series2(self):
774774
self.df.groupby('id')['val'].transform(np.mean)
775775

776+
777+
class groupby_transform_dataframe(object):
778+
# GH 12737
779+
goal_time = 0.2
780+
781+
def setup(self):
782+
self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10),
783+
'B': np.nan,
784+
'C': np.nan})
785+
self.df.ix[4::10, 'B':'C'] = 5
786+
787+
def time_groupby_transform_dataframe(self):
788+
self.df.groupby('group').transform('first')
789+
790+
776791
class groupby_transform_cythonized(object):
777792
goal_time = 0.2
778793

doc/source/whatsnew/v0.18.2.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ Performance Improvements
105105
- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
106106

107107

108-
108+
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
109109

110110

111111
.. _whatsnew_0182.bug_fixes:
@@ -125,7 +125,7 @@ Bug Fixes
125125

126126
- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`)
127127

128-
128+
- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`)
129129

130130
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
131131

pandas/core/groupby.py

+25-23
Original file line numberDiff line numberDiff line change
@@ -2776,18 +2776,11 @@ def _transform_fast(self, func):
27762776
func = getattr(self, func)
27772777

27782778
ids, _, ngroup = self.grouper.group_info
2779-
mask = ids != -1
2780-
2781-
out = func().values[ids]
2782-
if not mask.all():
2783-
out = np.where(mask, out, np.nan)
2784-
2785-
obs = np.zeros(ngroup, dtype='bool')
2786-
obs[ids[mask]] = True
2787-
if not obs.all():
2788-
out = self._try_cast(out, self._selected_obj)
2789-
2790-
return Series(out, index=self.obj.index)
2779+
cast = (self.size().fillna(0) > 0).any()
2780+
out = algos.take_1d(func().values, ids)
2781+
if cast:
2782+
out = self._try_cast(out, self.obj)
2783+
return Series(out, index=self.obj.index, name=self.obj.name)
27912784

27922785
def filter(self, func, dropna=True, *args, **kwargs): # noqa
27932786
"""
@@ -3465,19 +3458,28 @@ def transform(self, func, *args, **kwargs):
34653458
if not result.columns.equals(obj.columns):
34663459
return self._transform_general(func, *args, **kwargs)
34673460

3468-
results = np.empty_like(obj.values, result.values.dtype)
3469-
for (name, group), (i, row) in zip(self, result.iterrows()):
3470-
indexer = self._get_index(name)
3471-
if len(indexer) > 0:
3472-
results[indexer] = np.tile(row.values, len(
3473-
indexer)).reshape(len(indexer), -1)
3461+
return self._transform_fast(result, obj)
34743462

3475-
counts = self.size().fillna(0).values
3476-
if any(counts == 0):
3477-
results = self._try_cast(results, obj[result.columns])
3463+
def _transform_fast(self, result, obj):
3464+
"""
3465+
Fast transform path for aggregations
3466+
"""
3467+
# if there were groups with no observations (Categorical only?)
3468+
# try casting data to original dtype
3469+
cast = (self.size().fillna(0) > 0).any()
34783470

3479-
return (DataFrame(results, columns=result.columns, index=obj.index)
3480-
._convert(datetime=True))
3471+
# for each col, reshape to the size of original frame
3472+
# by take operation
3473+
ids, _, ngroup = self.grouper.group_info
3474+
output = []
3475+
for i, _ in enumerate(result.columns):
3476+
res = algos.take_1d(result.iloc[:, i].values, ids)
3477+
if cast:
3478+
res = self._try_cast(res, obj.iloc[:, i])
3479+
output.append(res)
3480+
3481+
return DataFrame._from_arrays(output, columns=result.columns,
3482+
index=obj.index)
34813483

34823484
def _define_paths(self, func, *args, **kwargs):
34833485
if isinstance(func, compat.string_types):

pandas/tests/test_categorical.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -3025,8 +3025,7 @@ def f(x):
30253025
c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
30263026

30273027
result = df.a.groupby(c).transform(sum)
3028-
tm.assert_series_equal(result, df['a'], check_names=False)
3029-
self.assertTrue(result.name is None)
3028+
tm.assert_series_equal(result, df['a'])
30303029

30313030
tm.assert_series_equal(
30323031
df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
@@ -3043,8 +3042,7 @@ def f(x):
30433042
c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
30443043

30453044
result = df.a.groupby(c).transform(sum)
3046-
tm.assert_series_equal(result, df['a'], check_names=False)
3047-
self.assertTrue(result.name is None)
3045+
tm.assert_series_equal(result, df['a'])
30483046

30493047
tm.assert_series_equal(
30503048
df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])

pandas/tests/test_groupby.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -1051,13 +1051,39 @@ def test_transform_fast(self):
10511051

10521052
values = np.repeat(grp.mean().values,
10531053
com._ensure_platform_int(grp.count().values))
1054-
expected = pd.Series(values, index=df.index)
1054+
expected = pd.Series(values, index=df.index, name='val')
10551055
result = grp.transform(np.mean)
10561056
assert_series_equal(result, expected)
10571057

10581058
result = grp.transform('mean')
10591059
assert_series_equal(result, expected)
10601060

1061+
# GH 12737
1062+
df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
1063+
'd': pd.date_range('2014-1-1', '2014-1-4'),
1064+
'i': [1, 2, 3, 4]},
1065+
columns=['grouping', 'f', 'i', 'd'])
1066+
result = df.groupby('grouping').transform('first')
1067+
1068+
dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
1069+
pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
1070+
expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
1071+
'd': dates,
1072+
'i': [1, 2, 2, 4]},
1073+
columns=['f', 'i', 'd'])
1074+
assert_frame_equal(result, expected)
1075+
1076+
# selection
1077+
result = df.groupby('grouping')[['f', 'i']].transform('first')
1078+
expected = expected[['f', 'i']]
1079+
assert_frame_equal(result, expected)
1080+
1081+
# dup columns
1082+
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
1083+
result = df.groupby('g').transform('first')
1084+
expected = df.drop('g', axis=1)
1085+
assert_frame_equal(result, expected)
1086+
10611087
def test_transform_broadcast(self):
10621088
grouped = self.ts.groupby(lambda x: x.month)
10631089
result = grouped.transform(np.mean)
@@ -1191,6 +1217,16 @@ def test_transform_function_aliases(self):
11911217
expected = self.df.groupby('A')['C'].transform(np.mean)
11921218
assert_series_equal(result, expected)
11931219

1220+
def test_series_fast_transform_date(self):
1221+
# GH 13191
1222+
df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
1223+
'd': pd.date_range('2014-1-1', '2014-1-4')})
1224+
result = df.groupby('grouping')['d'].transform('first')
1225+
dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
1226+
pd.Timestamp('2014-1-4')]
1227+
expected = pd.Series(dates, name='d')
1228+
assert_series_equal(result, expected)
1229+
11941230
def test_transform_length(self):
11951231
# GH 9697
11961232
df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
@@ -4406,7 +4442,7 @@ def test_groupby_datetime64_32_bit(self):
44064442

44074443
df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
44084444
result = df.groupby("A")["B"].transform(min)
4409-
expected = Series([pd.Timestamp('2000-01-1')] * 2)
4445+
expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
44104446
assert_series_equal(result, expected)
44114447

44124448
def test_groupby_categorical_unequal_len(self):

0 commit comments

Comments
 (0)