
Commit 7549d2f

Merge branch 'master' into groupby-mean-datetimelike
2 parents 7990797 + f525864

16 files changed: +154 −117 lines


doc/source/whatsnew/v1.4.0.rst (+1 −2)

@@ -107,8 +107,7 @@ Other enhancements
 - :meth:`DataFrame.to_stata` and :meth:`StataWriter` now accept the keyword only argument ``value_labels`` to save labels for non-categorical columns
 - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)
 - Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`)
-- :meth:`.GroupBy.mean` now supports ``NaT`` values (:issue:`43132`)
--
+- Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`)
 
 .. ---------------------------------------------------------------------------
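For illustration, a minimal sketch of the behavior this new whatsnew entry describes (the error text comes from the `check_parent_directory` helper added to pandas/io/common.py further down in this diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # Writing into a directory that does not exist now raises an OSError that
    # names the missing parent, instead of a bare FileNotFoundError from open().
    try:
        df.to_csv("no_such_dir/out.csv")
    except OSError as err:
        print(err)  # Cannot save file into a non-existent directory: 'no_such_dir'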

pandas/_libs/util.pxd (+0 −18)

@@ -1,19 +1,8 @@
 cimport numpy as cnp
-from numpy cimport ndarray
 
 from pandas._libs.tslibs.util cimport *
 
 
-cdef extern from "numpy/ndarraytypes.h":
-    void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
-
-
-cdef extern from "numpy/arrayobject.h":
-    enum:
-        NPY_ARRAY_C_CONTIGUOUS
-        NPY_ARRAY_F_CONTIGUOUS
-
-
 cdef extern from "src/headers/stdint.h":
     enum: UINT8_MAX
     enum: UINT16_MAX
@@ -42,10 +31,3 @@ ctypedef fused numeric:
 
     cnp.float32_t
     cnp.float64_t
-
-
-cdef inline void set_array_not_contiguous(ndarray ao) nogil:
-    # Numpy>=1.8-compliant equivalent to:
-    # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
-    PyArray_CLEARFLAGS(ao,
-                       (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
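The deleted `set_array_not_contiguous` helper was unused; it cleared numpy's contiguity flags at the C level. As a point of reference (not part of this diff), those flags are visible from Python, though clearing them is only possible through the C API the helper wrapped:

    import numpy as np

    arr = np.arange(6).reshape(2, 3)
    print(arr.flags["C_CONTIGUOUS"])  # True for a freshly created C-ordered array
    print(arr.flags["F_CONTIGUOUS"])  # False for a 2x3 C-ordered array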

pandas/core/arrays/base.py (+2 −7)

@@ -449,7 +449,7 @@ def __ne__(self, other: Any) -> ArrayLike:  # type: ignore[override]
 
     def to_numpy(
         self,
-        dtype: Dtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:
@@ -478,12 +478,7 @@ def to_numpy(
         -------
         numpy.ndarray
         """
-        # error: Argument "dtype" to "asarray" has incompatible type
-        # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int],
-        # Type[complex], Type[bool], Type[object], None]"; expected "Union[dtype[Any],
-        # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
-        # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
-        result = np.asarray(self, dtype=dtype)  # type: ignore[arg-type]
+        result = np.asarray(self, dtype=dtype)
         if copy or na_value is not lib.no_default:
             result = result.copy()
         if na_value is not lib.no_default:
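This widens `dtype` from pandas' `Dtype` to numpy's `npt.DTypeLike`, which matches what `np.asarray` actually accepts and lets the `# type: ignore` comments go away; the same change repeats in masked.py, numpy_.py, and string_arrow.py below. A hedged usage sketch (any of these spellings satisfy the new annotation):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")

    # Strings, numpy scalar types, and np.dtype instances are all DTypeLike:
    print(arr.to_numpy(dtype="float64", na_value=np.nan))
    print(arr.to_numpy(dtype=np.float64, na_value=np.nan))
    print(arr.to_numpy(dtype=np.dtype("float64"), na_value=np.nan))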

pandas/core/arrays/masked.py (+2 −5)

@@ -224,12 +224,9 @@ def __len__(self) -> int:
     def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
         return type(self)(~self._data, self._mask.copy())
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value: Scalar = lib.no_default,
     ) -> np.ndarray:

pandas/core/arrays/numpy_.py (+3 −5)

@@ -10,6 +10,7 @@
     Dtype,
     NpDtype,
     Scalar,
+    npt,
 )
 from pandas.compat.numpy import function as nv
 
@@ -365,12 +366,9 @@ def skew(
     # ------------------------------------------------------------------------
     # Additional Methods
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:

pandas/core/arrays/string_arrow.py (+3 −5)

@@ -24,6 +24,7 @@
     Scalar,
     ScalarIndexer,
     SequenceIndexer,
+    npt,
 )
 from pandas.compat import (
     pa_version_under1p0,
@@ -199,12 +200,9 @@ def __arrow_array__(self, type=None):
         """Convert myself to a pyarrow Array or ChunkedArray."""
         return self._data
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:

pandas/core/base.py (+1 −9)

@@ -516,16 +516,8 @@ def to_numpy(
         """
         if is_extension_array_dtype(self.dtype):
             # error: Too many arguments for "to_numpy" of "ExtensionArray"
-
-            # error: Argument 1 to "to_numpy" of "ExtensionArray" has incompatible type
-            # "Optional[Union[dtype[Any], None, type, _SupportsDType[dtype[Any]], str,
-            # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex,
-            # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]";
-            # expected "Optional[Union[ExtensionDtype, Union[str, dtype[Any]],
-            # Type[str], Type[float], Type[int], Type[complex], Type[bool],
-            # Type[object]]]"
             return self.array.to_numpy(  # type: ignore[call-arg]
-                dtype, copy=copy, na_value=na_value, **kwargs  # type: ignore[arg-type]
+                dtype, copy=copy, na_value=na_value, **kwargs
             )
         elif kwargs:
             bad_keys = list(kwargs.keys())[0]

pandas/core/groupby/generic.py (+4 −37)

@@ -792,24 +792,6 @@ def count(self) -> Series:
         )
         return self._reindex_output(result, fill_value=0)
 
-    def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
-        """Calculate pct_change of each value to previous entry in group"""
-        # TODO: Remove this conditional when #23918 is fixed
-        if freq:
-            return self.apply(
-                lambda x: x.pct_change(
-                    periods=periods, fill_method=fill_method, limit=limit, freq=freq
-                )
-            )
-        if fill_method is None:  # GH30463
-            fill_method = "pad"
-            limit = 0
-        filled = getattr(self, fill_method)(limit=limit)
-        fill_grp = filled.groupby(self.grouper.codes)
-        shifted = fill_grp.shift(periods=periods, freq=freq)
-
-        return (filled / shifted) - 1
-
     @doc(Series.nlargest)
     def nlargest(self, n: int = 5, keep: str = "first"):
         f = partial(Series.nlargest, n=n, keep=keep)
@@ -1086,14 +1068,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
         # test_resample_apply_product
 
         obj = self._obj_with_exclusions
-        result: dict[int | str, NDFrame] = {}
-        for i, item in enumerate(obj):
-            ser = obj.iloc[:, i]
-            colg = SeriesGroupBy(
-                ser, selection=item, grouper=self.grouper, exclusions=self.exclusions
-            )
+        result: dict[int, NDFrame] = {}
 
-            result[i] = colg.aggregate(func, *args, **kwargs)
+        for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)):
+            result[i] = sgb.aggregate(func, *args, **kwargs)
 
         res_df = self.obj._constructor(result)
         res_df.columns = obj.columns
@@ -1168,11 +1146,7 @@ def _wrap_applied_output_series(
         applied_index = self._selected_obj._get_axis(self.axis)
         singular_series = len(values) == 1 and applied_index.nlevels == 1
 
-        # assign the name to this series
         if singular_series:
-            keys = self.grouper.group_keys_seq
-            values[0].name = keys[0]
-
             # GH2893
             # we have series in the values array, we want to
             # produce a series:
@@ -1372,14 +1346,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
         # gets here with non-unique columns
         output = {}
         inds = []
-        for i, col in enumerate(obj):
-            subset = obj.iloc[:, i]
-            sgb = SeriesGroupBy(
-                subset,
-                selection=col,
-                grouper=self.grouper,
-                exclusions=self.exclusions,
-            )
+        for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
             try:
                 output[i] = sgb.transform(wrapper)
             except TypeError:
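Both `_aggregate_item_by_item` and `_transform_item_by_item` now build their per-column `SeriesGroupBy` objects through `_iterate_column_groupbys`. That helper is not shown in this diff; assuming it simply factors out the loop deleted above, a plausible reconstruction is:

    # Hypothetical sketch of the shared helper -- its real definition lives
    # elsewhere in pandas/core/groupby/generic.py and is not part of this diff.
    def _iterate_column_groupbys(self, obj):
        for i, colname in enumerate(obj.columns):
            yield colname, SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self.grouper,
                exclusions=self.exclusions,
            )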

pandas/core/groupby/groupby.py (+3 −1)

@@ -1037,7 +1037,7 @@ def reset_identity(values):
         if self.as_index:
 
             # possible MI return case
-            group_keys = self.grouper.group_keys_seq
+            group_keys = self.grouper.result_index
             group_levels = self.grouper.levels
             group_names = self.grouper.names
 
@@ -3236,6 +3236,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         )
         return res
 
+    @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
     def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
@@ -3247,6 +3248,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
         Series or DataFrame
             Percentage changes within each group.
         """
+        # TODO: Remove this conditional for SeriesGroupBy when GH#23918 is fixed
        if freq is not None or axis != 0:
            return self.apply(
                lambda x: x.pct_change(
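With the `SeriesGroupBy.pct_change` override deleted in generic.py, this base implementation (now marked `@final`) is the single code path for both Series and DataFrame groupbys. A quick usage sketch of the shared behavior:

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, 2.0, 10.0, 15.0]})

    # The first row of each group has no prior entry (NaN); then (2/1)-1 = 1.0
    # within group "a" and (15/10)-1 = 0.5 within group "b".
    print(df.groupby("g")["x"].pct_change())
    print(df.groupby("g").pct_change())  # same numbers, DataFrame-shaped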

pandas/core/internals/blocks.py (+13 −0)

@@ -228,6 +228,11 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # expected "ndarray")
         return self.values  # type: ignore[return-value]
 
+    def values_for_json(self) -> np.ndarray:
+        # Incompatible return value type (got "Union[ndarray[Any, Any],
+        # ExtensionArray]", expected "ndarray[Any, Any]")
+        return self.values  # type: ignore[return-value]
+
     @final
     @cache_readonly
     def fill_value(self):
@@ -1375,6 +1380,9 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # TODO(EA2D): reshape not needed with 2D EAs
         return np.asarray(values).reshape(self.shape)
 
+    def values_for_json(self) -> np.ndarray:
+        return np.asarray(self.values)
+
     def interpolate(
         self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
     ):
@@ -1805,6 +1813,11 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     is_numeric = False
     values: DatetimeArray | TimedeltaArray
 
+    def values_for_json(self) -> np.ndarray:
+        # special casing datetimetz to avoid conversion through
+        # object dtype
+        return self.values._ndarray
+
 
 class DatetimeTZBlock(DatetimeLikeBlock):
     """implement a datetime64 block with a tz attribute"""

pandas/core/internals/managers.py (+17 −16)

@@ -998,24 +998,25 @@ def column_arrays(self) -> list[np.ndarray]:
         """
         Used in the JSON C code to access column arrays.
         This optimizes compared to using `iget_values` by converting each
-        block.values to a np.ndarray only once up front
         """
-        # special casing datetimetz to avoid conversion through object dtype
-        arrays = [
-            blk.values._ndarray
-            if isinstance(blk, DatetimeTZBlock)
-            else np.asarray(blk.values)
-            for blk in self.blocks
-        ]
-        result = []
-        for i in range(len(self.items)):
-            arr = arrays[self.blknos[i]]
-            if arr.ndim == 2:
-                values = arr[self.blklocs[i]]
+        # This is an optimized equivalent to
+        # result = [self.iget_values(i) for i in range(len(self.items))]
+        result: list[np.ndarray | None] = [None] * len(self.items)
+
+        for blk in self.blocks:
+            mgr_locs = blk._mgr_locs
+            values = blk.values_for_json()
+            if values.ndim == 1:
+                # TODO(EA2D): special casing not needed with 2D EAs
+                result[mgr_locs[0]] = values
+
             else:
-                values = arr
-            result.append(values)
-        return result
+                for i, loc in enumerate(mgr_locs):
+                    result[loc] = values[i]
+
+        # error: Incompatible return value type (got "List[None]",
+        # expected "List[ndarray[Any, Any]]")
+        return result  # type: ignore[return-value]
 
     def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
         """

pandas/io/common.py (+20 −0)

@@ -17,6 +17,7 @@
 )
 import mmap
 import os
+from pathlib import Path
 import tempfile
 from typing import (
     IO,
@@ -520,6 +521,21 @@ def infer_compression(
     raise ValueError(msg)
 
 
+def check_parent_directory(path: Path | str) -> None:
+    """
+    Check if parent directory of a file exists, raise OSError if it does not
+
+    Parameters
+    ----------
+    path: Path or str
+        Path to check parent directory of
+
+    """
+    parent = Path(path).parent
+    if not parent.is_dir():
+        raise OSError(fr"Cannot save file into a non-existent directory: '{parent}'")
+
+
 def get_handle(
     path_or_buf: FilePathOrBuffer,
     mode: str,
@@ -632,6 +648,10 @@ def get_handle(
     compression_args = dict(ioargs.compression)
     compression = compression_args.pop("method")
 
+    # Only for write methods
+    if "r" not in mode and is_path:
+        check_parent_directory(str(handle))
+
     if compression:
         # compression libraries do not like an explicit text-mode
         ioargs.mode = ioargs.mode.replace("t", "")
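A brief sketch of the new helper in isolation (directory names hypothetical; the import assumes a pandas build that includes this commit):

    from pandas.io.common import check_parent_directory

    check_parent_directory("out.csv")  # parent is the cwd, which exists: no error

    try:
        check_parent_directory("missing_dir/out.csv")
    except OSError as err:
        print(err)  # Cannot save file into a non-existent directory: 'missing_dir'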

pandas/io/formats/format.py (+5 −1)

@@ -96,7 +96,10 @@
 from pandas.core.indexes.timedeltas import TimedeltaIndex
 from pandas.core.reshape.concat import concat
 
-from pandas.io.common import stringify_path
+from pandas.io.common import (
+    check_parent_directory,
+    stringify_path,
+)
 from pandas.io.formats.printing import (
     adjoin,
     justify,
@@ -1147,6 +1150,7 @@ def get_buffer(buf: FilePathOrBuffer[str] | None, encoding: str | None = None):
     if hasattr(buf, "write"):
         yield buf
     elif isinstance(buf, str):
+        check_parent_directory(str(buf))
        with open(buf, "w", encoding=encoding, newline="") as f:
            # GH#30034 open instead of codecs.open prevents a file leak
            # if we have an invalid encoding argument.
