Merge remote-tracking branch 'upstream/main' into allow-mixed-iso

MarcoGorelli · MarcoGorelli · commit 3d65dbf4f480 · 2023-01-23T17:40:41.000Z
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -14,7 +14,10 @@ jobs:
     steps:
       - checkout
       - run: .circleci/setup_env.sh
-      - run: PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH ci/run_tests.sh
+      - run: >
+          PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
+          LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD
+          ci/run_tests.sh
 
 workflows:
   test:
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -160,6 +160,7 @@ Other enhancements
 - Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`)
 - Improve exception message when using :func:`assert_frame_equal` on a :class:`DataFrame` to include the column that is compared (:issue:`50323`)
 - Improved error message for :func:`merge_asof` when join-columns were duplicated (:issue:`50102`)
+- Added support for extension array dtypes to :func:`get_dummies` (:issue:`32430`)
 - Added :meth:`Index.infer_objects` analogous to :meth:`Series.infer_objects` (:issue:`50034`)
 - Added ``copy`` parameter to :meth:`Series.infer_objects` and :meth:`DataFrame.infer_objects`, passing ``False`` will avoid making copies for series or columns that are already non-object or where no better dtype can be inferred (:issue:`50096`)
 - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`)
@@ -1106,6 +1107,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`)
 - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
 - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
+- Bug in :meth:`.DataFrameGroupBy.resample` raising ``KeyError`` when selecting a list of columns from the result while resampling on a time index (:issue:`50840`)
 -
 
 Reshaping
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1202,7 +1202,7 @@ def _gotitem(self, key, ndim, subset=None):
 
         # Try to select from a DataFrame, falling back to a Series
         try:
-            if isinstance(key, list) and self.key not in key:
+            if isinstance(key, list) and self.key not in key and self.key is not None:
                 key.append(self.key)
             groupby = self._groupby[key]
         except IndexError:
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -16,6 +16,7 @@
     is_integer_dtype,
     is_list_like,
     is_object_dtype,
+    pandas_dtype,
 )
 
 from pandas.core.arrays import SparseArray
@@ -240,9 +241,9 @@ def _get_dummies_1d(
 
     if dtype is None:
         dtype = np.dtype(bool)
-    dtype = np.dtype(dtype)
+    _dtype = pandas_dtype(dtype)
 
-    if is_object_dtype(dtype):
+    if is_object_dtype(_dtype):
         raise ValueError("dtype=object is not a valid dtype for get_dummies")
 
     def get_empty_frame(data) -> DataFrame:
@@ -317,7 +318,12 @@ def get_empty_frame(data) -> DataFrame:
 
     else:
         # take on axis=1 + transpose to ensure ndarray layout is column-major
-        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T
+        eye_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            eye_dtype = _dtype
+        else:
+            eye_dtype = np.bool_
+        dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T
 
         if not dummy_na:
             # reset NaN GH4446
@@ -327,7 +333,7 @@ def get_empty_frame(data) -> DataFrame:
             # remove first GH12042
             dummy_mat = dummy_mat[:, 1:]
             dummy_cols = dummy_cols[1:]
-        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
+        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
 
 
 def from_dummies(
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
@@ -536,3 +536,82 @@ def test_groupby_resample_size_all_index_same():
         ),
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys():
+    # GH 50840: column-list selection after groupby.resample on a datetime index
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "val": [3, 1, 4, 1, 5, 9, 2, 6],
+        },
+        index=Series(
+            date_range(start="2016-01-01", periods=8),
+            name="date",
+        ),
+    )
+    result = df.groupby("group").resample("2D")[["val"]].mean()
+    expected = DataFrame(
+        data={
+            "val": [2.0, 2.5, 7.0, 4.0],
+        },
+        index=Index(
+            data=[
+                (0, Timestamp("2016-01-01")),
+                (0, Timestamp("2016-01-03")),
+                (1, Timestamp("2016-01-05")),
+                (1, Timestamp("2016-01-07")),
+            ],
+            name=("group", "date"),
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys_multi_columns():
+    # GH 50876: select two of the three value columns after groupby.resample
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "first_val": [3, 1, 4, 1, 5, 9, 2, 6],
+            "second_val": [2, 7, 1, 8, 2, 8, 1, 8],
+            "third_val": [1, 4, 1, 4, 2, 1, 3, 5],
+        },
+        index=Series(
+            date_range(start="2016-01-01", periods=8),
+            name="date",
+        ),
+    )
+    result = df.groupby("group").resample("2D")[["first_val", "second_val"]].mean()
+    expected = DataFrame(
+        data={
+            "first_val": [2.0, 2.5, 7.0, 4.0],
+            "second_val": [4.5, 4.5, 5.0, 4.5],
+        },
+        index=Index(
+            data=[
+                (0, Timestamp("2016-01-01")),
+                (0, Timestamp("2016-01-03")),
+                (1, Timestamp("2016-01-05")),
+                (1, Timestamp("2016-01-07")),
+            ],
+            name=("group", "date"),
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys_missing_column():
+    # GH 50876: selecting a column absent from the frame must raise KeyError
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "val": [3, 1, 4, 1, 5, 9, 2, 6],
+        },
+        index=Series(
+            date_range(start="2016-01-01", periods=8),
+            name="date",
+        ),
+    )
+    with pytest.raises(KeyError, match="Columns not found"):
+        df.groupby("group").resample("2D")[["val_not_in_dataframe"]].mean()
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
@@ -657,3 +657,23 @@ def test_get_dummies_with_string_values(self, values):
 
         with pytest.raises(TypeError, match=msg):
             get_dummies(df, columns=values)
+
+    def test_get_dummies_ea_dtype_series(self, any_numeric_ea_dtype):
+        # GH#32430: Series input; result columns take the requested dtype
+        ser = Series(list("abca"))
+        result = get_dummies(ser, dtype=any_numeric_ea_dtype)
+        expected = DataFrame(
+            {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
+            dtype=any_numeric_ea_dtype,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_dtype):
+        # GH#32430: DataFrame input; prefixed columns take the requested dtype
+        df = DataFrame({"x": list("abca")})
+        result = get_dummies(df, dtype=any_numeric_ea_dtype)
+        expected = DataFrame(
+            {"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
+            dtype=any_numeric_ea_dtype,
+        )
+        tm.assert_frame_equal(result, expected)