diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1ef05ae5f9c6b..2fe3c21f20236 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -374,10 +374,10 @@ Groupby/Resample/Rolling
 - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
 - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`)
 - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
+- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
 - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
 - Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
 
-
 Reshaping
 ^^^^^^^^^
 
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index d02775cd4b328..561856ace2d9f 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -301,6 +301,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 if observed:
                     codes = algorithms.unique1d(self.grouper.codes)
                     codes = codes[codes != -1]
+                    if sort or self.grouper.ordered:
+                        codes = np.sort(codes)
                 else:
                     codes = np.arange(len(categories))
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 1c18e6b1d25e0..c876700cabea4 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -451,6 +451,38 @@ def test_dataframe_categorical_with_nan(observed):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("ordered", [True, False])
+@pytest.mark.parametrize("observed", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
+    # GH 25871: Fix groupby sorting on ordered Categoricals
+    # GH 25167: Groupby with observed=True doesn't sort
+
+    # Build a dataframe with cat having one unobserved category ('missing'),
+    # and a Series with identical values
+    label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'],
+                           categories=['a', 'b', 'missing', 'd'],
+                           ordered=ordered)
+    val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b'])
+    df = pd.DataFrame({'label': label, 'val': val})
+
+    # aggregate on the Categorical
+    result = (df.groupby('label', observed=observed, sort=sort)['val']
+                .aggregate('first'))
+
+    # If ordering works, we expect index labels equal to aggregation results,
+    # except for 'observed=False': label 'missing' has aggregation None
+    label = pd.Series(result.index.array, dtype='object')
+    aggr = pd.Series(result.array)
+    if not observed:
+        aggr[aggr.isna()] = 'missing'
+    if not all(label == aggr):
+        msg = ('Labels and aggregation results not consistently sorted\n' +
+               'for (ordered={}, observed={}, sort={})\n' +
+               'Result:\n{}').format(ordered, observed, sort, result)
+        assert False, msg
+
+
 def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range('2014-01-01', periods=4)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index f3363e1baad19..8382111ec9901 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -253,28 +253,27 @@ def test_groupby_levels_and_columns(self):
         tm.assert_frame_equal(by_levels, by_columns)
 
     def test_groupby_categorical_index_and_columns(self, observed):
-        # GH18432
+        # GH18432, adapted for GH25871
         columns = ['A', 'B', 'A', 'B']
         categories = ['B', 'A']
-        data = np.ones((5, 4), int)
+        data = np.array([[1, 2, 1, 2],
+                         [1, 2, 1, 2],
+                         [1, 2, 1, 2],
+                         [1, 2, 1, 2],
+                         [1, 2, 1, 2]], int)
         cat_columns = CategoricalIndex(columns,
                                        categories=categories,
                                        ordered=True)
         df = DataFrame(data=data, columns=cat_columns)
         result = df.groupby(axis=1, level=0, observed=observed).sum()
-        expected_data = 2 * np.ones((5, 2), int)
-
-        if observed:
-            # if we are not-observed we undergo a reindex
-            # so need to adjust the output as our expected sets us up
-            # to be non-observed
-            expected_columns = CategoricalIndex(['A', 'B'],
-                                                categories=categories,
-                                                ordered=True)
-        else:
-            expected_columns = CategoricalIndex(categories,
-                                                categories=categories,
-                                                ordered=True)
+        expected_data = np.array([[4, 2],
+                                  [4, 2],
+                                  [4, 2],
+                                  [4, 2],
+                                  [4, 2]], int)
+        expected_columns = CategoricalIndex(categories,
+                                            categories=categories,
+                                            ordered=True)
         expected = DataFrame(data=expected_data, columns=expected_columns)
         assert_frame_equal(result, expected)