
Commit 7549d2f

Merge branch 'master' into groupby-mean-datetimelike
2 parents 7990797 + f525864

16 files changed: +154 −117 lines


doc/source/whatsnew/v1.4.0.rst (+1 −2)

@@ -107,8 +107,7 @@ Other enhancements
 - :meth:`DataFrame.to_stata` and :meth:`StataWriter` now accept the keyword only argument ``value_labels`` to save labels for non-categorical columns
 - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)
 - Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`)
-- :meth:`.GroupBy.mean` now supports ``NaT`` values (:issue:`43132`)
--
+- Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`)
 
 .. ---------------------------------------------------------------------------
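For illustration, a minimal sketch of the behavior this new whatsnew entry describes (the error text comes from the `check_parent_directory` helper added to pandas/io/common.py further down in this diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # Writing into a directory that does not exist now raises an OSError that
    # names the missing parent, instead of a bare FileNotFoundError from open().
    try:
        df.to_csv("no_such_dir/out.csv")
    except OSError as err:
        print(err)  # Cannot save file into a non-existent directory: 'no_such_dir'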

pandas/_libs/util.pxd (+0 −18)

@@ -1,19 +1,8 @@
 cimport numpy as cnp
-from numpy cimport ndarray
 
 from pandas._libs.tslibs.util cimport *
 
 
-cdef extern from "numpy/ndarraytypes.h":
-    void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
-
-
-cdef extern from "numpy/arrayobject.h":
-    enum:
-        NPY_ARRAY_C_CONTIGUOUS
-        NPY_ARRAY_F_CONTIGUOUS
-
-
 cdef extern from "src/headers/stdint.h":
     enum: UINT8_MAX
     enum: UINT16_MAX
@@ -42,10 +31,3 @@ ctypedef fused numeric:
 
     cnp.float32_t
     cnp.float64_t
-
-
-cdef inline void set_array_not_contiguous(ndarray ao) nogil:
-    # Numpy>=1.8-compliant equivalent to:
-    # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
-    PyArray_CLEARFLAGS(ao,
-                       (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
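The deleted `set_array_not_contiguous` helper was unused; it cleared numpy's contiguity flags at the C level. As a point of reference (not part of this diff), those flags are visible from Python, though clearing them is only possible through the C API the helper wrapped:

    import numpy as np

    arr = np.arange(6).reshape(2, 3)
    print(arr.flags["C_CONTIGUOUS"])  # True for a freshly created C-ordered array
    print(arr.flags["F_CONTIGUOUS"])  # False for a 2x3 C-ordered array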

pandas/core/arrays/base.py (+2 −7)

@@ -449,7 +449,7 @@ def __ne__(self, other: Any) -> ArrayLike:  # type: ignore[override]
 
     def to_numpy(
         self,
-        dtype: Dtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:
@@ -478,12 +478,7 @@ def to_numpy(
         -------
         numpy.ndarray
         """
-        # error: Argument "dtype" to "asarray" has incompatible type
-        # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int],
-        # Type[complex], Type[bool], Type[object], None]"; expected "Union[dtype[Any],
-        # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
-        # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
-        result = np.asarray(self, dtype=dtype)  # type: ignore[arg-type]
+        result = np.asarray(self, dtype=dtype)
         if copy or na_value is not lib.no_default:
             result = result.copy()
         if na_value is not lib.no_default:
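This widens `dtype` from pandas' `Dtype` to numpy's `npt.DTypeLike`, which matches what `np.asarray` actually accepts and lets the `# type: ignore` comments go away; the same change repeats in masked.py, numpy_.py, and string_arrow.py below. A hedged usage sketch (any of these spellings satisfy the new annotation):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")

    # Strings, numpy scalar types, and np.dtype instances are all DTypeLike:
    print(arr.to_numpy(dtype="float64", na_value=np.nan))
    print(arr.to_numpy(dtype=np.float64, na_value=np.nan))
    print(arr.to_numpy(dtype=np.dtype("float64"), na_value=np.nan))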

pandas/core/arrays/masked.py (+2 −5)

@@ -224,12 +224,9 @@ def __len__(self) -> int:
     def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
         return type(self)(~self._data, self._mask.copy())
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value: Scalar = lib.no_default,
     ) -> np.ndarray:

pandas/core/arrays/numpy_.py (+3 −5)

@@ -10,6 +10,7 @@
     Dtype,
     NpDtype,
     Scalar,
+    npt,
 )
 from pandas.compat.numpy import function as nv
 
@@ -365,12 +366,9 @@ def skew(
     # ------------------------------------------------------------------------
     # Additional Methods
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:

pandas/core/arrays/string_arrow.py (+3 −5)

@@ -24,6 +24,7 @@
     Scalar,
     ScalarIndexer,
     SequenceIndexer,
+    npt,
 )
 from pandas.compat import (
     pa_version_under1p0,
@@ -199,12 +200,9 @@ def __arrow_array__(self, type=None):
         """Convert myself to a pyarrow Array or ChunkedArray."""
         return self._data
 
-    # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray";
-    # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any],
-    # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], None]"
-    def to_numpy(  # type: ignore[override]
+    def to_numpy(
         self,
-        dtype: NpDtype | None = None,
+        dtype: npt.DTypeLike | None = None,
         copy: bool = False,
         na_value=lib.no_default,
     ) -> np.ndarray:

pandas/core/base.py (+1 −9)

@@ -516,16 +516,8 @@ def to_numpy(
         """
         if is_extension_array_dtype(self.dtype):
             # error: Too many arguments for "to_numpy" of "ExtensionArray"
-
-            # error: Argument 1 to "to_numpy" of "ExtensionArray" has incompatible type
-            # "Optional[Union[dtype[Any], None, type, _SupportsDType[dtype[Any]], str,
-            # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex,
-            # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]";
-            # expected "Optional[Union[ExtensionDtype, Union[str, dtype[Any]],
-            # Type[str], Type[float], Type[int], Type[complex], Type[bool],
-            # Type[object]]]"
             return self.array.to_numpy(  # type: ignore[call-arg]
-                dtype, copy=copy, na_value=na_value, **kwargs  # type: ignore[arg-type]
+                dtype, copy=copy, na_value=na_value, **kwargs
             )
         elif kwargs:
             bad_keys = list(kwargs.keys())[0]

pandas/core/groupby/generic.py (+4 −37)

@@ -792,24 +792,6 @@ def count(self) -> Series:
         )
         return self._reindex_output(result, fill_value=0)
 
-    def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
-        """Calculate pct_change of each value to previous entry in group"""
-        # TODO: Remove this conditional when #23918 is fixed
-        if freq:
-            return self.apply(
-                lambda x: x.pct_change(
-                    periods=periods, fill_method=fill_method, limit=limit, freq=freq
-                )
-            )
-        if fill_method is None:  # GH30463
-            fill_method = "pad"
-            limit = 0
-        filled = getattr(self, fill_method)(limit=limit)
-        fill_grp = filled.groupby(self.grouper.codes)
-        shifted = fill_grp.shift(periods=periods, freq=freq)
-
-        return (filled / shifted) - 1
-
     @doc(Series.nlargest)
     def nlargest(self, n: int = 5, keep: str = "first"):
         f = partial(Series.nlargest, n=n, keep=keep)
@@ -1086,14 +1068,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
         # test_resample_apply_product
 
         obj = self._obj_with_exclusions
-        result: dict[int | str, NDFrame] = {}
-        for i, item in enumerate(obj):
-            ser = obj.iloc[:, i]
-            colg = SeriesGroupBy(
-                ser, selection=item, grouper=self.grouper, exclusions=self.exclusions
-            )
+        result: dict[int, NDFrame] = {}
 
-            result[i] = colg.aggregate(func, *args, **kwargs)
+        for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)):
+            result[i] = sgb.aggregate(func, *args, **kwargs)
 
         res_df = self.obj._constructor(result)
         res_df.columns = obj.columns
@@ -1168,11 +1146,7 @@ def _wrap_applied_output_series(
         applied_index = self._selected_obj._get_axis(self.axis)
         singular_series = len(values) == 1 and applied_index.nlevels == 1
 
-        # assign the name to this series
         if singular_series:
-            keys = self.grouper.group_keys_seq
-            values[0].name = keys[0]
-
             # GH2893
             # we have series in the values array, we want to
             # produce a series:
@@ -1372,14 +1346,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
         # gets here with non-unique columns
         output = {}
         inds = []
-        for i, col in enumerate(obj):
-            subset = obj.iloc[:, i]
-            sgb = SeriesGroupBy(
-                subset,
-                selection=col,
-                grouper=self.grouper,
-                exclusions=self.exclusions,
-            )
+        for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
             try:
                 output[i] = sgb.transform(wrapper)
             except TypeError:
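Both `_aggregate_item_by_item` and `_transform_item_by_item` now build their per-column `SeriesGroupBy` objects through `_iterate_column_groupbys`. That helper is not shown in this diff; assuming it simply factors out the loop deleted above, a plausible reconstruction is:

    # Hypothetical sketch of the shared helper -- its real definition lives
    # elsewhere in pandas/core/groupby/generic.py and is not part of this diff.
    def _iterate_column_groupbys(self, obj):
        for i, colname in enumerate(obj.columns):
            yield colname, SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self.grouper,
                exclusions=self.exclusions,
            )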

pandas/core/groupby/groupby.py (+3 −1)

@@ -1037,7 +1037,7 @@ def reset_identity(values):
         if self.as_index:
 
             # possible MI return case
-            group_keys = self.grouper.group_keys_seq
+            group_keys = self.grouper.result_index
             group_levels = self.grouper.levels
             group_names = self.grouper.names
 
@@ -3236,6 +3236,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         )
         return res
 
+    @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
     def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
@@ -3247,6 +3248,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
         Series or DataFrame
             Percentage changes within each group.
         """
+        # TODO: Remove this conditional for SeriesGroupBy when GH#23918 is fixed
        if freq is not None or axis != 0:
            return self.apply(
                lambda x: x.pct_change(
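With the `SeriesGroupBy.pct_change` override deleted in generic.py, this base implementation (now marked `@final`) is the single code path for both Series and DataFrame groupbys. A quick usage sketch of the shared behavior:

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, 2.0, 10.0, 15.0]})

    # The first row of each group has no prior entry (NaN); then (2/1)-1 = 1.0
    # within group "a" and (15/10)-1 = 0.5 within group "b".
    print(df.groupby("g")["x"].pct_change())
    print(df.groupby("g").pct_change())  # same numbers, DataFrame-shaped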

pandas/core/internals/blocks.py (+13 −0)

@@ -228,6 +228,11 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # expected "ndarray")
         return self.values  # type: ignore[return-value]
 
+    def values_for_json(self) -> np.ndarray:
+        # Incompatible return value type (got "Union[ndarray[Any, Any],
+        # ExtensionArray]", expected "ndarray[Any, Any]")
+        return self.values  # type: ignore[return-value]
+
     @final
     @cache_readonly
     def fill_value(self):
@@ -1375,6 +1380,9 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # TODO(EA2D): reshape not needed with 2D EAs
         return np.asarray(values).reshape(self.shape)
 
+    def values_for_json(self) -> np.ndarray:
+        return np.asarray(self.values)
+
     def interpolate(
         self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
     ):
@@ -1805,6 +1813,11 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     is_numeric = False
     values: DatetimeArray | TimedeltaArray
 
+    def values_for_json(self) -> np.ndarray:
+        # special casing datetimetz to avoid conversion through
+        # object dtype
+        return self.values._ndarray
+
 
 class DatetimeTZBlock(DatetimeLikeBlock):
     """implement a datetime64 block with a tz attribute"""

pandas/core/internals/managers.py (+17 −16)

@@ -998,24 +998,25 @@ def column_arrays(self) -> list[np.ndarray]:
         """
         Used in the JSON C code to access column arrays.
         This optimizes compared to using `iget_values` by converting each
-        block.values to a np.ndarray only once up front
         """
-        # special casing datetimetz to avoid conversion through object dtype
-        arrays = [
-            blk.values._ndarray
-            if isinstance(blk, DatetimeTZBlock)
-            else np.asarray(blk.values)
-            for blk in self.blocks
-        ]
-        result = []
-        for i in range(len(self.items)):
-            arr = arrays[self.blknos[i]]
-            if arr.ndim == 2:
-                values = arr[self.blklocs[i]]
+        # This is an optimized equivalent to
+        # result = [self.iget_values(i) for i in range(len(self.items))]
+        result: list[np.ndarray | None] = [None] * len(self.items)
+
+        for blk in self.blocks:
+            mgr_locs = blk._mgr_locs
+            values = blk.values_for_json()
+            if values.ndim == 1:
+                # TODO(EA2D): special casing not needed with 2D EAs
+                result[mgr_locs[0]] = values
+
             else:
-                values = arr
-            result.append(values)
-        return result
+                for i, loc in enumerate(mgr_locs):
+                    result[loc] = values[i]
+
+        # error: Incompatible return value type (got "List[None]",
+        # expected "List[ndarray[Any, Any]]")
+        return result  # type: ignore[return-value]
 
     def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
         """

pandas/io/common.py (+20 −0)

@@ -17,6 +17,7 @@
 )
 import mmap
 import os
+from pathlib import Path
 import tempfile
 from typing import (
     IO,
@@ -520,6 +521,21 @@ def infer_compression(
     raise ValueError(msg)
 
 
+def check_parent_directory(path: Path | str) -> None:
+    """
+    Check if parent directory of a file exists, raise OSError if it does not
+
+    Parameters
+    ----------
+    path: Path or str
+        Path to check parent directory of
+
+    """
+    parent = Path(path).parent
+    if not parent.is_dir():
+        raise OSError(fr"Cannot save file into a non-existent directory: '{parent}'")
+
+
 def get_handle(
     path_or_buf: FilePathOrBuffer,
     mode: str,
@@ -632,6 +648,10 @@ def get_handle(
     compression_args = dict(ioargs.compression)
     compression = compression_args.pop("method")
 
+    # Only for write methods
+    if "r" not in mode and is_path:
+        check_parent_directory(str(handle))
+
     if compression:
         # compression libraries do not like an explicit text-mode
         ioargs.mode = ioargs.mode.replace("t", "")
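A brief sketch of the new helper in isolation (directory names hypothetical; the import assumes a pandas build that includes this commit):

    from pandas.io.common import check_parent_directory

    check_parent_directory("out.csv")  # parent is the cwd, which exists: no error

    try:
        check_parent_directory("missing_dir/out.csv")
    except OSError as err:
        print(err)  # Cannot save file into a non-existent directory: 'missing_dir'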

pandas/io/formats/format.py (+5 −1)

@@ -96,7 +96,10 @@
 from pandas.core.indexes.timedeltas import TimedeltaIndex
 from pandas.core.reshape.concat import concat
 
-from pandas.io.common import stringify_path
+from pandas.io.common import (
+    check_parent_directory,
+    stringify_path,
+)
 from pandas.io.formats.printing import (
     adjoin,
     justify,
@@ -1147,6 +1150,7 @@ def get_buffer(buf: FilePathOrBuffer[str] | None, encoding: str | None = None):
     if hasattr(buf, "write"):
         yield buf
     elif isinstance(buf, str):
+        check_parent_directory(str(buf))
        with open(buf, "w", encoding=encoding, newline="") as f:
            # GH#30034 open instead of codecs.open prevents a file leak
            # if we have an invalid encoding argument.
