diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index a047ad46e4887..43ddfcfe74fcb 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -73,7 +73,7 @@ Bug Fixes
 
 **Reshaping**
 
--
+- Bug in :meth:`DataFrame.groupby` when grouping by a categorical with ``observed=True``, where the resulting groups were not returned in sorted category order (:issue:`25167`)
 -
 -
 
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
index 85f51323a97b5..36f0ecce9f9aa 100644
--- a/pandas/core/groupby/categorical.py
+++ b/pandas/core/groupby/categorical.py
@@ -44,8 +44,7 @@ def recode_for_groupby(c, sort, observed):
         unique_codes = unique1d(c.codes)
 
         take_codes = unique_codes[unique_codes != -1]
-        if c.ordered:
-            take_codes = np.sort(take_codes)
+        take_codes = np.sort(take_codes)
 
         # we recode according to the uniques
         categories = c.categories.take(take_codes)
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 260417bc0d598..1cf9fc173e8a7 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -300,6 +300,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 if observed:
                     codes = algorithms.unique1d(self.grouper.codes)
                     codes = codes[codes != -1]
+                    if sort:
+                        codes = np.sort(codes)
                 else:
                     codes = np.arange(len(categories))
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e118135ccc75d..e75694aa52293 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -360,6 +360,14 @@ def test_observed(observed):
     expected = groups2.agg('mean').reset_index()
     tm.assert_frame_equal(result, expected)
 
+    # GH 25167
+    df = pd.DataFrame({'A': pd.Categorical(['b', 'a']), 'B': [1, 2]})
+    expected = pd.DataFrame([2, 1],
+                            index=pd.CategoricalIndex(['a', 'b'], name='A'),
+                            columns=['B'])
+    result = df.groupby('A', observed=observed).sum()
+    tm.assert_frame_equal(result, expected)
+
 
 def test_observed_codes_remap(observed):
     d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index a509a7cb57c97..36527a3c5b389 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -269,7 +269,7 @@ def test_groupby_categorical_index_and_columns(self, observed):
             # if we are not-observed we undergo a reindex
             # so need to adjust the output as our expected sets us up
             # to be non-observed
-            expected_columns = CategoricalIndex(['A', 'B'],
+            expected_columns = CategoricalIndex(['B', 'A'],
                                                 categories=categories,
                                                 ordered=True)
         else: