
Commit b80dcbc

BUG: groupby and agg on read-only array gives ValueError: buffer source array is read-only (#36061)
1 parent 497ede8 commit b80dcbc
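
A minimal reproduction of the regression this commit fixes, distilled from the test added below (it flips the writeable flag through the internal block manager exactly as the test does; on the affected pandas 1.1.0/1.1.1 builds the ``agg`` call raised ``ValueError: buffer source array is read-only``):

import pandas as pd

df = pd.DataFrame(
    {
        "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
        "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
    }
)
# Mark the underlying float64 block as read-only; read-only buffers also
# arise naturally, e.g. from arrays loaded with np.load(..., mmap_mode="r").
df._mgr.blocks[0].values.flags.writeable = False

# Raised ValueError on the affected versions; succeeds once this fix is applied.
result = df.groupby(["species"]).agg({"sepal_length": "mean"})
print(result)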

File tree

3 files changed (+60, -15 lines changed)

doc/source/whatsnew/v1.1.2.rst (+1, -1)

@@ -18,7 +18,7 @@ Fixed regressions
 - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`)
 - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`)
 - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`)
--
+- Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`)


 .. ---------------------------------------------------------------------------

pandas/_libs/groupby.pyx (+18, -14)

@@ -229,7 +229,7 @@ def group_cumprod_float64(float64_t[:, :] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumsum(numeric[:, :] out,
-                 numeric[:, :] values,
+                 ndarray[numeric, ndim=2] values,
                  const int64_t[:] labels,
                  int ngroups,
                  is_datetimelike,
@@ -472,7 +472,7 @@ ctypedef fused complexfloating_t:
 @cython.boundscheck(False)
 def _group_add(complexfloating_t[:, :] out,
                int64_t[:] counts,
-               complexfloating_t[:, :] values,
+               ndarray[complexfloating_t, ndim=2] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=0):
     """
@@ -483,8 +483,9 @@ def _group_add(complexfloating_t[:, :] out,
         complexfloating_t val, count
         complexfloating_t[:, :] sumx
         int64_t[:, :] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)

-    if len(values) != len(labels):
+    if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -530,7 +531,7 @@ group_add_complex128 = _group_add['double complex']
 @cython.boundscheck(False)
 def _group_prod(floating[:, :] out,
                 int64_t[:] counts,
-                floating[:, :] values,
+                ndarray[floating, ndim=2] values,
                 const int64_t[:] labels,
                 Py_ssize_t min_count=0):
     """
@@ -541,8 +542,9 @@ def _group_prod(floating[:, :] out,
         floating val, count
         floating[:, :] prodx
         int64_t[:, :] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)

-    if not len(values) == len(labels):
+    if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -582,7 +584,7 @@ group_prod_float64 = _group_prod['double']
 @cython.cdivision(True)
 def _group_var(floating[:, :] out,
                int64_t[:] counts,
-               floating[:, :] values,
+               ndarray[floating, ndim=2] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1,
               int64_t ddof=1):
@@ -591,10 +593,11 @@ def _group_var(floating[:, :] out,
         floating val, ct, oldmean
         floating[:, :] mean
         int64_t[:, :] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -639,18 +642,19 @@ group_var_float64 = _group_var['double']
 @cython.boundscheck(False)
 def _group_mean(floating[:, :] out,
                 int64_t[:] counts,
-                floating[:, :] values,
+                ndarray[floating, ndim=2] values,
                 const int64_t[:] labels,
                 Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         floating val, count
         floating[:, :] sumx
         int64_t[:, :] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)

     assert min_count == -1, "'min_count' only used in add and prod"

-    if not len(values) == len(labels):
+    if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")

     nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -689,7 +693,7 @@ group_mean_float64 = _group_mean['double']
 @cython.boundscheck(False)
 def _group_ohlc(floating[:, :] out,
                 int64_t[:] counts,
-                floating[:, :] values,
+                ndarray[floating, ndim=2] values,
                 const int64_t[:] labels,
                 Py_ssize_t min_count=-1):
     """
@@ -740,7 +744,7 @@ group_ohlc_float64 = _group_ohlc['double']
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_quantile(ndarray[float64_t] out,
-                   numeric[:] values,
+                   ndarray[numeric, ndim=1] values,
                    ndarray[int64_t] labels,
                    ndarray[uint8_t] mask,
                    float64_t q,
@@ -1072,7 +1076,7 @@ def group_nth(rank_t[:, :] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_rank(float64_t[:, :] out,
-               rank_t[:, :] values,
+               ndarray[rank_t, ndim=2] values,
               const int64_t[:] labels,
               int ngroups,
               bint is_datetimelike, object ties_method="average",
@@ -1424,7 +1428,7 @@ def group_min(groupby_t[:, :] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummin(groupby_t[:, :] out,
-                 groupby_t[:, :] values,
+                 ndarray[groupby_t, ndim=2] values,
                  const int64_t[:] labels,
                  int ngroups,
                  bint is_datetimelike):
@@ -1484,7 +1488,7 @@ def group_cummin(groupby_t[:, :] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cummax(groupby_t[:, :] out,
-                 groupby_t[:, :] values,
+                 ndarray[groupby_t, ndim=2] values,
                  const int64_t[:] labels,
                  int ngroups,
                  bint is_datetimelike):
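
Every hunk above follows the same pattern: the ``values`` argument changes from a writable typed memoryview (e.g. ``floating[:, :] values``) to a buffer-typed ndarray (e.g. ``ndarray[floating, ndim=2] values``), and the length checks are hoisted into ``Py_ssize_t`` locals. A non-const typed memoryview makes Cython request a writable buffer when the function is called, even though these kernels only read from ``values``, so a read-only array fails buffer acquisition with ``ValueError: buffer source array is read-only``; the ndarray declaration does not require write access. (Declaring the memoryview ``const`` would be another way to accept read-only input, but that is not what this commit does.) A small NumPy-only sketch of the read-only semantics involved, not part of the commit:

import numpy as np

values = np.arange(10, dtype=np.float64).reshape(5, 2)
# Read-only arrays show up in practice from, e.g., np.load(..., mmap_mode="r").
values.flags.writeable = False

# Reading is fine: the groupby kernels above only ever read from `values`.
print(values.sum(axis=0))

# Any request for write access fails; a non-const typed memoryview argument
# makes exactly such a request at call time.
try:
    values[0, 0] = 1.0
except ValueError as exc:
    print(exc)  # assignment destination is read-only

# A read-only view can still be taken and consumed without error.
mv = memoryview(values)
print(mv.readonly)  # True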

pandas/tests/groupby/aggregate/test_cython.py (+41, -0)

@@ -236,3 +236,44 @@ def test_cython_with_timestamp_and_nat(op, data):

     result = df.groupby("a").aggregate(op)
     tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        "min",
+        "max",
+        "count",
+        "sum",
+        "prod",
+        "var",
+        "mean",
+        "median",
+        "ohlc",
+        "cumprod",
+        "cumsum",
+        "shift",
+        "any",
+        "all",
+        "quantile",
+        "first",
+        "last",
+        "rank",
+        "cummin",
+        "cummax",
+    ],
+)
+def test_read_only_buffer_source_agg(agg):
+    # https://github.com/pandas-dev/pandas/issues/36014
+    df = DataFrame(
+        {
+            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
+            "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
+        }
+    )
+    df._mgr.blocks[0].values.flags.writeable = False
+
+    result = df.groupby(["species"]).agg({"sepal_length": agg})
+    expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
+
+    tm.assert_equal(result, expected)
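
For a quick end-to-end check outside pytest, the same coverage can be run as a standalone loop over the parametrized aggregation names; ``check_read_only_aggs`` below is a hypothetical helper (not part of the commit) and pokes at the internal ``df._mgr`` exactly as the test does:

import pandas as pd
import pandas._testing as tm

AGGS = [
    "min", "max", "count", "sum", "prod", "var", "mean", "median", "ohlc",
    "cumprod", "cumsum", "shift", "any", "all", "quantile", "first", "last",
    "rank", "cummin", "cummax",
]


def check_read_only_aggs():
    """Return the aggregations that still fail on a read-only block."""
    failures = []
    for agg in AGGS:
        df = pd.DataFrame(
            {
                "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
                "species": ["setosa"] * 5,
            }
        )
        # Same internal toggle the test uses to make the float block read-only.
        df._mgr.blocks[0].values.flags.writeable = False
        try:
            result = df.groupby(["species"]).agg({"sepal_length": agg})
            expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
            tm.assert_equal(result, expected)
        except ValueError as exc:
            failures.append((agg, str(exc)))
    return failures


print(check_read_only_aggs())  # expected: [] on a build that includes this fix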
