From 7a74e3427da0c1368ae91112506afc0bbca67130 Mon Sep 17 00:00:00 2001
From: Mayukh Bhattacharyya <mayukh.superb@gmail.com>
Date: Wed, 5 May 2021 03:07:55 +0530
Subject: [PATCH] groupby skipna initial commit

---
 pandas/_libs/groupby.pyx       | 24 ++++++++++++++++++++++--
 pandas/core/groupby/generic.py |  5 +++--
 pandas/core/groupby/groupby.py | 11 +++++++----
 pandas/core/groupby/ops.py     | 11 ++++++++---
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3fa92ce2229c3..2f2715d4b0e5d 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -482,7 +482,8 @@ def group_add(complexfloating_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[complexfloating_t, ndim=2] values,
               const intp_t[:] labels,
-              Py_ssize_t min_count=0) -> None:
+              Py_ssize_t min_count=0,
+              bint skipna=True) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
     """
@@ -520,6 +521,13 @@ def group_add(complexfloating_t[:, ::1] out,
                     t = sumx[lab, j] + y
                     compensation[lab, j] = t - sumx[lab, j] - y
                     sumx[lab, j] = t
+                elif not skipna:
+                    # skipna=False: a NaN poisons this group's sum. No
+                    # `break`: leaving the enclosing loop would skip the
+                    # rest of its iterations (other columns of this row);
+                    # NaN persists through later `sumx + y` updates anyway.
+                    sumx[lab, j] = NAN
+                # skipna=True falls through: the NaN is ignored
 
         for i in range(ncounts):
             for j in range(K):
@@ -535,7 +543,8 @@ def group_prod(floating[:, ::1] out,
                int64_t[::1] counts,
                ndarray[floating, ndim=2] values,
                const intp_t[:] labels,
-               Py_ssize_t min_count=0) -> None:
+               Py_ssize_t min_count=0,
+               bint skipna=True) -> None:
     """
     Only aggregates on axis=0
     """
@@ -568,6 +577,11 @@ def group_prod(floating[:, ::1] out,
                 if val == val:
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
+                elif not skipna:
+                    # skipna=False: NaN poisons this group's product. No
+                    # `break` (it would cut the enclosing loop short); the
+                    # NaN persists through any later `*=` on this cell.
+                    prodx[lab, j] = NAN
 
         for i in range(ncounts):
             for j in range(K):
@@ -585,6 +599,7 @@ def group_var(floating[:, ::1] out,
               ndarray[floating, ndim=2] values,
               const intp_t[:] labels,
               Py_ssize_t min_count=-1,
+              bint skipna=True,
               int64_t ddof=1) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -622,6 +637,11 @@ def group_var(floating[:, ::1] out,
                     oldmean = mean[lab, j]
                     mean[lab, j] += (val - oldmean) / nobs[lab, j]
                     out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+                elif not skipna:
+                    # skipna=False: NaN poisons this group's variance. No
+                    # `break` (it would cut the enclosing loop short); the
+                    # NaN persists through later `out[lab, j] +=` updates.
+                    out[lab, j] = NAN
 
         for i in range(ncounts):
             for j in range(K):
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9d6d2d698dfe5..d48365a79f201 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -346,9 +346,10 @@ def _aggregate_multiple_funcs(self, arg):
         return self.obj._constructor_expanddim(output, columns=columns)
 
     def _cython_agg_general(
-        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
+        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1, skipna: bool = True
     ):
         output: dict[base.OutputKey, ArrayLike] = {}
+        # skipna is passed through to self.grouper._cython_operation below.
         # Ideally we would be able to enumerate self._iterate_slices and use
         # the index from enumeration as the key of output, but ohlc in particular
         # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
@@ -361,7 +362,7 @@ def _cython_agg_general(
                 continue
 
             result = self.grouper._cython_operation(
-                "aggregate", obj._values, how, axis=0, min_count=min_count
+                "aggregate", obj._values, how, axis=0, min_count=min_count, skipna=skipna
             )
             assert result.ndim == 1
             key = base.OutputKey(label=name, position=idx)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f579b04db898e..d02327d42dfec 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1269,6 +1269,7 @@ def _agg_general(
         *,
         alias: str,
         npfunc: Callable,
+        skipna: bool = True,
     ):
         with group_selection_context(self):
             # try a cython aggregation if we can
@@ -1279,6 +1280,7 @@ def _agg_general(
                     alt=npfunc,
                     numeric_only=numeric_only,
                     min_count=min_count,
+                    skipna=skipna,
                 )
             except DataError:
                 pass
@@ -1298,7 +1300,7 @@ def _agg_general(
             return result.__finalize__(self.obj, method="groupby")
 
     def _cython_agg_general(
-        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
+        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1, skipna: bool = True
     ):
         raise AbstractMethodError(self)
 
@@ -1691,7 +1693,7 @@ def size(self) -> FrameOrSeriesUnion:
 
     @final
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
-    def sum(self, numeric_only: bool = True, min_count: int = 0):
+    def sum(self, numeric_only: bool = True, min_count: int = 0, skipna: bool = True):
 
         # If we are grouping on categoricals we want unobserved categories to
         # return zero, rather than the default of NaN which the reindexing in
@@ -1702,15 +1704,16 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):
                 min_count=min_count,
                 alias="add",
                 npfunc=np.sum,
+                skipna=skipna,
             )
 
         return self._reindex_output(result, fill_value=0)
 
     @final
     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
-    def prod(self, numeric_only: bool = True, min_count: int = 0):
+    def prod(self, numeric_only: bool = True, min_count: int = 0, skipna: bool = True):
         return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
+            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod, skipna=skipna
         )
 
     @final
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3ee185d862b01..0b1df3592c597 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -707,12 +707,14 @@ def _cython_operation(
         how: str,
         axis: int,
         min_count: int = -1,
+        skipna: bool = True,
         mask: np.ndarray | None = None,
         **kwargs,
     ) -> ArrayLike:
         """
         Returns the values of a cython operation.
         """
+        # keep a reference to the values exactly as they were passed in
         orig_values = values
         assert kind in ["transform", "aggregate"]
 
@@ -726,6 +728,7 @@ def _cython_operation(
         dtype = values.dtype
         is_numeric = is_numeric_dtype(dtype)
 
+        # wrapper object built from the requested kind and how
         cy_op = WrappedCythonOp(kind=kind, how=how)
 
         # can we do this operation with our cython functions
@@ -736,11 +739,11 @@ def _cython_operation(
         if is_extension_array_dtype(dtype):
             if isinstance(values, BaseMaskedArray) and func_uses_mask:
                 return self._masked_ea_wrap_cython_operation(
-                    cy_op, kind, values, how, axis, min_count, **kwargs
+                    cy_op, kind, values, how, axis, min_count, skipna=skipna, **kwargs
                 )
             else:
                 return self._ea_wrap_cython_operation(
-                    cy_op, kind, values, how, axis, min_count, **kwargs
+                    cy_op, kind, values, how, axis, min_count, skipna=skipna, **kwargs
                 )
 
         elif values.ndim == 1:
@@ -752,6 +755,7 @@ def _cython_operation(
                 how=how,
                 axis=1,
                 min_count=min_count,
+                skipna=skipna,
                 mask=mask,
                 **kwargs,
             )
@@ -802,7 +806,8 @@ def _cython_operation(
                     is_datetimelike=is_datetimelike,
                 )
             else:
-                func(result, counts, values, comp_ids, min_count)
+                skipna_args = (skipna,) if how in ("add", "prod", "var") else ()
+                func(result, counts, values, comp_ids, min_count, *skipna_args)
         elif kind == "transform":
             # TODO: min_count
             if func_uses_mask: