diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 294cb723eee1a..1f766efd21855 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -31,7 +31,6 @@ class providing the base-class of operations.
 from pandas.core.dtypes.common import (
     ensure_float,
     is_datetime64_dtype,
-    is_datetime64tz_dtype,
     is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
@@ -45,7 +44,6 @@ class providing the base-class of operations.
 from pandas.core.arrays import Categorical, try_cast_to_ea
 from pandas.core.base import DataError, PandasObject, SelectionMixin
 import pandas.core.common as com
-from pandas.core.construction import extract_array
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.groupby import base
@@ -790,22 +788,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
             dtype = obj.dtype
 
         if not is_scalar(result):
-            if is_datetime64tz_dtype(dtype):
-                # GH 23683
-                # Prior results _may_ have been generated in UTC.
-                # Ensure we localize to UTC first before converting
-                # to the target timezone
-                arr = extract_array(obj)
-                try:
-                    result = arr._from_sequence(result, dtype="datetime64[ns, UTC]")
-                    result = result.astype(dtype)
-                except TypeError:
-                    # _try_cast was called at a point where the result
-                    # was already tz-aware
-                    pass
-            elif is_extension_array_dtype(dtype):
+            if is_extension_array_dtype(dtype) and dtype.kind != "M":
                 # The function can return something of any type, so check
-                # if the type is compatible with the calling EA.
+                # if the type is compatible with the calling EA.
+                # datetime64tz is handled correctly in agg_series,
+                # so is excluded here.
 
                 # return the same type (Series) as our caller
                 cls = dtype.construct_array_type()
@@ -872,7 +859,9 @@ def _cython_agg_general(
             if numeric_only and not is_numeric:
                 continue
 
-            result, names = self.grouper.aggregate(obj.values, how, min_count=min_count)
+            result, names = self.grouper.aggregate(
+                obj._values, how, min_count=min_count
+            )
             output[name] = self._try_cast(result, obj)
 
         if len(output) == 0:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7ed79e4b00371..47ca2b2190ecf 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -604,11 +604,11 @@ def agg_series(self, obj: Series, func):
             # SeriesGrouper would raise if we were to call _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
-        elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
+        elif is_extension_array_dtype(obj.dtype):
             # _aggregate_series_fast would raise TypeError when
             #  calling libreduction.Slider
+            # In the datetime64tz case it would incorrectly cast to tz-naive
             # TODO: can we get a performant workaround for EAs backed by ndarray?
-            # TODO: is the datetime64tz case supposed to go through here?
            return self._aggregate_series_pure_python(obj, func)
 
         elif isinstance(obj.index, MultiIndex):
@@ -657,7 +657,15 @@ def _aggregate_series_pure_python(self, obj: Series, func):
             res = func(group)
             if result is None:
                 if isinstance(res, (Series, Index, np.ndarray)):
-                    raise ValueError("Function does not reduce")
+                    if len(res) == 1:
+                        # e.g. test_agg_lambda_with_timezone lambda e: e.head(1)
+                        # FIXME: are we potentially losing important res.index info?
+
+                        # TODO: use `.item()` if/when we un-deprecate it.
+                        # For non-Series we could just do `res[0]`
+                        res = next(iter(res))
+                    else:
+                        raise ValueError("Function does not reduce")
                 result = np.empty(ngroups, dtype="O")
 
             counts[label] = group.shape[0]
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 1c297f3e2ada3..721045f1097f8 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -454,6 +454,31 @@ def test_agg_over_numpy_arrays():
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_tzaware_non_datetime_result():
+    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
+    # with function that is not dtype-preserving
+    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
+    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
+    gb = df.groupby("a")
+
+    # Case that _does_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0])
+    expected = pd.Series(dti[::2], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    # Cases that do _not_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0].year)
+    expected = pd.Series([2012, 2012], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
+    expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+
 def test_agg_timezone_round_trip():
     # GH 15426
     ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
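For reference, a minimal usage sketch (not part of the patch) of the behavior asserted by the new test_agg_tzaware_non_datetime_result: aggregating a tz-aware column with a dtype-preserving lambda should keep the datetime64[ns, UTC] dtype, while non-dtype-preserving lambdas should yield the dtype of whatever the lambda returns. Data, column, and group names below simply mirror that test.

    import pandas as pd

    # Two groups of tz-aware timestamps, one day apart within each group,
    # matching the setup in test_agg_tzaware_non_datetime_result.
    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # dtype-preserving lambda: result keeps the datetime64[ns, UTC] dtype
    print(gb["b"].agg(lambda x: x.iloc[0]))

    # non-dtype-preserving lambdas: result takes the dtype the lambda returns
    print(gb["b"].agg(lambda x: x.iloc[0].year))          # int64: 2012, 2012
    print(gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]))  # timedelta64[ns]: 1 day per group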