pandas-dev · jreback · May 25, 2020 · Apr 18, 2020 · Apr 19, 2020 · Apr 26, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -603,6 +603,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
 - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
 - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
+- Bug in :meth:`SeriesGroupBy.quantile` where the quantiles were shifted when the ``by`` axis contained ``NaN`` (:issue:`33200`)
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -778,47 +778,48 @@ def group_quantile(ndarray[float64_t] out,
             if not mask[i]:
                 non_na_counts[lab] += 1
 
-    # Get an index of values sorted by labels and then values
-    order = (values, labels)
-    sort_arr = np.lexsort(order).astype(np.int64, copy=False)
-
-    with nogil:
-        for i in range(ngroups):
-            # Figure out how many group elements there are
-            grp_sz = counts[i]
-            non_na_sz = non_na_counts[i]
+    if labels.any():
+        # Relabel -1 (NaN key) as a trailing group, then lexsort by label, value
+        labels[labels==-1] = np.max(labels) + 1
+        order = (values, labels)
+        sort_arr= np.lexsort(order).astype(np.int64, copy=False)
+        with nogil:
+            for i in range(ngroups):
+                # Figure out how many group elements there are
+                grp_sz = counts[i]
+                non_na_sz = non_na_counts[i]
 
-            if non_na_sz == 0:
-                out[i] = NaN
-            else:
-                # Calculate where to retrieve the desired value
-                # Casting to int will intentionally truncate result
-                idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
-
-                val = values[sort_arr[idx]]
-                # If requested quantile falls evenly on a particular index
-                # then write that index's value out. Otherwise interpolate
-                q_idx = q * (non_na_sz - 1)
-                frac = q_idx % 1
-
-                if frac == 0.0 or interp == INTERPOLATION_LOWER:
-                    out[i] = val
+                if non_na_sz == 0:
+                    out[i] = NaN
                 else:
-                    next_val = values[sort_arr[idx + 1]]
-                    if interp == INTERPOLATION_LINEAR:
-                        out[i] = val + (next_val - val) * frac
-                    elif interp == INTERPOLATION_HIGHER:
-                        out[i] = next_val
-                    elif interp == INTERPOLATION_MIDPOINT:
-                        out[i] = (val + next_val) / 2.0
-                    elif interp == INTERPOLATION_NEAREST:
-                        if frac > .5 or (frac == .5 and q > .5):  # Always OK?
+                    # Calculate where to retrieve the desired value
+                    # Casting to int will intentionally truncate result
+                    idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1))
+
+                    val = values[sort_arr[idx]]
+                    # If requested quantile falls evenly on a particular index
+                    # then write that index's value out. Otherwise interpolate
+                    q_idx = q * (non_na_sz - 1)
+                    frac = q_idx % 1
+
+                    if frac == 0.0 or interp == INTERPOLATION_LOWER:
+                        out[i] = val
+                    else:
+                        next_val = values[sort_arr[idx + 1]]
+                        if interp == INTERPOLATION_LINEAR:
+                            out[i] = val + (next_val - val) * frac
+                        elif interp == INTERPOLATION_HIGHER:
                             out[i] = next_val
-                        else:
-                            out[i] = val
-
-            # Increment the index reference in sorted_arr for the next group
-            grp_start += grp_sz
+                        elif interp == INTERPOLATION_MIDPOINT:
+                            out[i] = (val + next_val) / 2.0
+                        elif interp == INTERPOLATION_NEAREST:
+                            if frac > .5 or (frac == .5 and q > .5):  # Always OK?
+                                out[i] = next_val
+                            else:
+                                out[i] = val
+
+                # Increment the index reference in sorted_arr for the next group
+                grp_start += grp_sz
 
 
 # ----------------------------------------------------------------------

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -1507,14 +1507,23 @@ def test_quantile_missing_group_values_no_segfaults():
         grp.quantile()
 
 
-def test_quantile_missing_group_values_correct_results():
-    # GH 28662
-    data = np.array([1.0, np.nan, 3.0, np.nan])
-    df = pd.DataFrame(dict(key=data, val=range(4)))
+@pytest.mark.parametrize(
+    "key, val, expected_key, expected_val",
+    [
+        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
+        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
+        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
+    ],
+)
+def test_quantile_missing_group_values_correct_results(
+    key, val, expected_key, expected_val
+):
+    # GH 28662, GH 33200
+    df = pd.DataFrame({"key": key, "val": val})
 
     result = df.groupby("key").quantile()
     expected = pd.DataFrame(
-        [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"]
+        expected_val, index=pd.Index(expected_key, name="key"), columns=["val"]
     )
     tm.assert_frame_equal(result, expected)