Commit 33f7906
Merge remote-tracking branch 'upstream/master' into bisect
2 parents: 8d3a176 + b7c994b


58 files changed: +534 −252 lines

doc/source/reference/io.rst (+5)

@@ -83,6 +83,11 @@ HDFStore: PyTables (HDF5)
    HDFStore.groups
    HDFStore.walk
 
+.. warning::
+
+   One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
+   but the type of the subclass is lost upon storing.
+
 Feather
 ~~~~~~~
 .. autosummary::
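A quick round-trip shows what the new warning means in practice. A minimal sketch, assuming PyTables is installed; ``MySeries`` and the ``demo.h5`` path are illustrative only:

```python
import pandas as pd

class MySeries(pd.Series):
    # trivial subclass; _constructor keeps operations returning MySeries
    @property
    def _constructor(self):
        return MySeries

s = MySeries([1, 2, 3])
s.to_hdf("demo.h5", key="s")             # storing now works (see GH 33748 below)...
roundtrip = pd.read_hdf("demo.h5", "s")
print(type(roundtrip))                   # <class 'pandas.core.series.Series'>
# ...but what comes back is a plain Series: the subclass type is lost.
```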

doc/source/whatsnew/v1.3.0.rst (+4, −2)

@@ -176,7 +176,7 @@ Categorical
 Datetimelike
 ^^^^^^^^^^^^
 - Bug in :class:`DataFrame` and :class:`Series` constructors sometimes dropping nanoseconds from :class:`Timestamp` (resp. :class:`Timedelta`) ``data``, with ``dtype=datetime64[ns]`` (resp. ``timedelta64[ns]``) (:issue:`38032`)
--
+- Bug in :meth:`DataFrame.first` and :meth:`Series.first` returning two months for offset one month when first day is last calendar day (:issue:`29623`)
 -
 
 Timedelta
@@ -224,6 +224,7 @@ Indexing
 Missing
 ^^^^^^^
 
+- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`)
 -
 -
 
@@ -241,7 +242,8 @@ I/O
 - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)
 - Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`)
 - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
--
+- Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply
+  for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
 
 Period
 ^^^^^^

environment.yml (+1, −1)

@@ -31,7 +31,7 @@ dependencies:
   # documentation
   - gitpython  # obtain contributors from git for whatsnew
   - gitdb
-  - sphinx
+  - sphinx=3.3.1
 
   # documentation (jupyter notebooks)
   - nbconvert>=5.4.1

pandas/_libs/tslibs/timedeltas.pyx (+2, −4)

@@ -1091,11 +1091,9 @@ cdef class _Timedelta(timedelta):
         >>> td.isoformat()
         'P6DT0H50M3.010010012S'
         >>> pd.Timedelta(hours=1, seconds=10).isoformat()
-        'P0DT0H0M10S'
-        >>> pd.Timedelta(hours=1, seconds=10).isoformat()
-        'P0DT0H0M10S'
+        'P0DT1H0M10S'
         >>> pd.Timedelta(days=500.5).isoformat()
-        'P500DT12H0MS'
+        'P500DT12H0M0S'
         """
         components = self.components
         seconds = (f'{components.seconds}.'
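The corrected doctest outputs can be verified directly (values taken from the fixed docstring):

```python
import pandas as pd

# The old, duplicated example claimed 'P0DT0H0M10S', dropping the hour.
assert pd.Timedelta(hours=1, seconds=10).isoformat() == "P0DT1H0M10S"
# The old example omitted the zero in the seconds field ('P500DT12H0MS').
assert pd.Timedelta(days=500.5).isoformat() == "P500DT12H0M0S"
```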

pandas/_testing.py (+21, −21)

@@ -10,7 +10,17 @@
 from shutil import rmtree
 import string
 import tempfile
-from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
 import warnings
 import zipfile
 
@@ -301,35 +311,25 @@ def write_to_compressed(compression, path, data, dest="test"):
     ------
     ValueError : An invalid compression value was passed in.
     """
+    args: Tuple[Any, ...] = (data,)
+    mode = "wb"
+    method = "write"
+    compress_method: Callable
+
     if compression == "zip":
         compress_method = zipfile.ZipFile
+        mode = "w"
+        args = (dest, data)
+        method = "writestr"
     elif compression == "gzip":
-        # pandas\_testing.py:288: error: Incompatible types in assignment
-        # (expression has type "Type[GzipFile]", variable has type
-        # "Type[ZipFile]")
-        compress_method = gzip.GzipFile  # type: ignore[assignment]
+        compress_method = gzip.GzipFile
     elif compression == "bz2":
-        # pandas\_testing.py:290: error: Incompatible types in assignment
-        # (expression has type "Type[BZ2File]", variable has type
-        # "Type[ZipFile]")
-        compress_method = bz2.BZ2File  # type: ignore[assignment]
+        compress_method = bz2.BZ2File
     elif compression == "xz":
         compress_method = get_lzma_file(lzma)
    else:
         raise ValueError(f"Unrecognized compression type: {compression}")
 
-    if compression == "zip":
-        mode = "w"
-        args = (dest, data)
-        method = "writestr"
-    else:
-        mode = "wb"
-        # pandas\_testing.py:302: error: Incompatible types in assignment
-        # (expression has type "Tuple[Any]", variable has type "Tuple[Any,
-        # Any]")
-        args = (data,)  # type: ignore[assignment]
-        method = "write"
-
     with compress_method(path, mode=mode) as f:
         getattr(f, method)(*args)
 
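The rewrite sets the single-file defaults (``mode="wb"``, ``method="write"``) up front and lets only the ``zip`` branch override them, so the duplicated ``if compression == "zip"`` pass and all three ``type: ignore`` comments disappear; the ``compress_method: Callable`` annotation covers every branch. A hedged usage sketch of this internal helper (not public API; ``demo.gz`` is a throwaway path):

```python
import gzip

from pandas._testing import write_to_compressed  # internal pandas testing helper

write_to_compressed("gzip", "demo.gz", b"hello")  # takes the gzip branch above
with gzip.open("demo.gz", "rb") as f:
    assert f.read() == b"hello"
```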

pandas/_typing.py (+2)

@@ -133,6 +133,8 @@
     "Resampler",
 ]
 
+PythonFuncType = Callable[[Any], Any]
+
 # filenames and file-like-objects
 Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
 FileOrBuffer = Union[str, Buffer[T]]
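``PythonFuncType`` is a plain alias for ``Callable[[Any], Any]``; ``DataFrame.applymap`` (changed below in ``pandas/core/frame.py``) uses it to annotate the per-element function. A sketch with a hypothetical helper showing how such an alias is consumed:

```python
from typing import Any, Callable

PythonFuncType = Callable[[Any], Any]  # one positional argument in, anything out

def map_elements(func: PythonFuncType, values: list) -> list:
    # hypothetical stand-in for the cell-by-cell application applymap performs
    return [func(v) for v in values]

assert map_elements(str.upper, ["a", "b"]) == ["A", "B"]
```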

pandas/core/apply.py (+2, −2)

@@ -6,7 +6,7 @@
 
 from pandas._config import option_context
 
-from pandas._typing import Axis, FrameOrSeriesUnion
+from pandas._typing import AggFuncType, Axis, FrameOrSeriesUnion
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import (
@@ -27,7 +27,7 @@
 
 def frame_apply(
     obj: "DataFrame",
-    func,
+    func: AggFuncType,
     axis: Axis = 0,
     raw: bool = False,
     result_type: Optional[str] = None,

pandas/core/construction.py (+66, −36)

@@ -449,6 +449,11 @@ def sanitize_array(
     # extract ndarray or ExtensionArray, ensure we have no PandasArray
     data = extract_array(data, extract_numpy=True)
 
+    if isinstance(data, np.ndarray) and data.ndim == 0:
+        if dtype is None:
+            dtype = data.dtype
+        data = lib.item_from_zerodim(data)
+
     # GH#846
     if isinstance(data, np.ndarray):
 
@@ -462,7 +467,7 @@ def sanitize_array(
                 else:
                     subarr = np.array(data, copy=False)
         else:
-            # we will try to copy be-definition here
+            # we will try to copy by-definition here
             subarr = _try_cast(data, dtype, copy, raise_cast_failure)
 
     elif isinstance(data, ABCExtensionArray):
@@ -491,46 +496,19 @@ def sanitize_array(
         # GH#16804
         arr = np.arange(data.start, data.stop, data.step, dtype="int64")
         subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
-    elif lib.is_scalar(data) and index is not None and dtype is not None:
+
+    elif not is_list_like(data):
+        if index is None:
+            raise ValueError("index must be specified when data is not list-like")
         subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
+
     else:
         subarr = _try_cast(data, dtype, copy, raise_cast_failure)
 
-    # scalar like, GH
-    if getattr(subarr, "ndim", 0) == 0:
-        if isinstance(data, list):  # pragma: no cover
-            subarr = np.array(data, dtype=object)
-        elif index is not None:
-            subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
-
-        else:
-            return subarr.item()
-
-    # the result that we want
-    elif subarr.ndim == 1:
-        if index is not None:
-
-            # a 1-element ndarray
-            if len(subarr) != len(index) and len(subarr) == 1:
-                subarr = subarr.repeat(len(index))
-
-    elif subarr.ndim > 1:
-        if isinstance(data, np.ndarray):
-            raise ValueError("Data must be 1-dimensional")
-        else:
-            subarr = com.asarray_tuplesafe(data, dtype=dtype)
+    subarr = _sanitize_ndim(subarr, data, dtype, index)
 
     if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
-        # This is to prevent mixed-type Series getting all casted to
-        # NumPy string type, e.g. NaN --> '-1#IND'.
-        if issubclass(subarr.dtype.type, str):
-            # GH#16605
-            # If not empty convert the data to dtype
-            # GH#19853: If data is a scalar, subarr has already the result
-            if not lib.is_scalar(data):
-                if not np.all(isna(data)):
-                    data = np.array(data, dtype=dtype, copy=False)
-            subarr = np.array(data, dtype=object, copy=copy)
+        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)
 
     is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype)
     if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
@@ -541,13 +519,65 @@ def sanitize_array(
     return subarr
 
 
+def _sanitize_ndim(
+    result: ArrayLike, data, dtype: Optional[DtypeObj], index: Optional[Index]
+) -> ArrayLike:
+    """
+    Ensure we have a 1-dimensional result array.
+    """
+    if getattr(result, "ndim", 0) == 0:
+        raise ValueError("result should be arraylike with ndim > 0")
+
+    elif result.ndim == 1:
+        # the result that we want
+        result = _maybe_repeat(result, index)
+
+    elif result.ndim > 1:
+        if isinstance(data, np.ndarray):
+            raise ValueError("Data must be 1-dimensional")
+        else:
+            result = com.asarray_tuplesafe(data, dtype=dtype)
+    return result
+
+
+def _sanitize_str_dtypes(
+    result: np.ndarray, data, dtype: Optional[DtypeObj], copy: bool
+) -> np.ndarray:
+    """
+    Ensure we have a dtype that is supported by pandas.
+    """
+
+    # This is to prevent mixed-type Series getting all casted to
+    # NumPy string type, e.g. NaN --> '-1#IND'.
+    if issubclass(result.dtype.type, str):
+        # GH#16605
+        # If not empty convert the data to dtype
+        # GH#19853: If data is a scalar, result has already the result
+        if not lib.is_scalar(data):
+            if not np.all(isna(data)):
+                data = np.array(data, dtype=dtype, copy=False)
+        result = np.array(data, dtype=object, copy=copy)
+    return result
+
+
+def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike:
+    """
+    If we have a length-1 array and an index describing how long we expect
+    the result to be, repeat the array.
+    """
+    if index is not None:
+        if 1 == len(arr) != len(index):
+            arr = arr.repeat(len(index))
+    return arr
+
+
 def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool):
     """
     Convert input to numpy ndarray and optionally cast to a given dtype.
 
     Parameters
     ----------
-    arr : ndarray, scalar, list, tuple, iterator (catchall)
+    arr : ndarray, list, tuple, iterator (catchall)
         Excludes: ExtensionArray, Series, Index.
     dtype : np.dtype, ExtensionDtype or None
     copy : bool
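The effect of the new branches is easiest to see through the public ``Series`` constructor, which funnels into ``sanitize_array``. A minimal sketch, assuming pandas at this commit (the 0-d ndarray handling and the scalar broadcast path are the new code above):

```python
import numpy as np
import pandas as pd

# Scalars are broadcast along the supplied index
# (the construct_1d_arraylike_from_scalar branch):
s = pd.Series(5, index=["a", "b", "c"])
print(s.tolist())  # [5, 5, 5]

# A zero-dimensional ndarray is now unwrapped to a scalar first,
# keeping its dtype when none is given explicitly:
s2 = pd.Series(np.array(7, dtype="int8"), index=[0, 1])
print(s2.dtype)  # int8
```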

pandas/core/dtypes/cast.py (+4, −1)

@@ -1543,7 +1543,10 @@ def construct_1d_arraylike_from_scalar(
     """
 
     if dtype is None:
-        dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
+        try:
+            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
+        except OutOfBoundsDatetime:
+            dtype = np.dtype(object)
 
     if is_extension_array_dtype(dtype):
         cls = dtype.construct_array_type()
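The ``try``/``except`` means a scalar datetime that cannot be represented at nanosecond precision no longer blows up dtype inference; it falls back to ``object``. A hedged illustration, assuming this fix is in place:

```python
import datetime

import pandas as pd

# datetime.max is far beyond the Timestamp nanosecond bounds (~year 2262),
# so infer_dtype_from_scalar would raise OutOfBoundsDatetime; with the
# fallback, the scalar is kept as a plain Python object instead:
s = pd.Series(datetime.datetime.max, index=range(2))
print(s.dtype)         # object
print(s.iloc[0].year)  # 9999
```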

pandas/core/frame.py (+11, −2)

@@ -62,6 +62,7 @@
     IndexLabel,
     Label,
     Level,
+    PythonFuncType,
     Renamer,
     StorageOptions,
     Suffixes,
@@ -7661,7 +7662,13 @@ def transform(
         return result
 
     def apply(
-        self, func, axis: Axis = 0, raw: bool = False, result_type=None, args=(), **kwds
+        self,
+        func: AggFuncType,
+        axis: Axis = 0,
+        raw: bool = False,
+        result_type=None,
+        args=(),
+        **kwds,
     ):
         """
         Apply a function along an axis of the DataFrame.
@@ -7807,7 +7814,9 @@ def apply(
         )
         return op.get_result()
 
-    def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
+    def applymap(
+        self, func: PythonFuncType, na_action: Optional[str] = None
+    ) -> DataFrame:
         """
         Apply a function to a Dataframe elementwise.
 
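``applymap`` applies a Python function to every cell, which is exactly what the new ``PythonFuncType`` annotation expresses; ``na_action="ignore"`` propagates missing values without calling the function on them. A short example:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan], "b": [2.5, 4.0]})

print(df.applymap(lambda x: len(str(x))))                      # NaN becomes len("nan") == 3
print(df.applymap(lambda x: len(str(x)), na_action="ignore"))  # NaN stays NaN
```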

pandas/core/generic.py (+11, −1)

@@ -2505,6 +2505,11 @@ def to_hdf(
         In order to add another DataFrame or Series to an existing HDF file
         please use append mode and a different a key.
 
+        .. warning::
+
+           One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
+           but the type of the subclass is lost upon storing.
+
         For more information see the :ref:`user guide <io.hdf5>`.
 
         Parameters
@@ -8424,7 +8429,12 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries:
             return self
 
         offset = to_offset(offset)
-        end_date = end = self.index[0] + offset
+        if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
+            # GH#29623 if first value is end of period, remove offset with n = 1
+            # before adding the real offset
+            end_date = end = self.index[0] - offset.base + offset
+        else:
+            end_date = end = self.index[0] + offset
 
         # Tick-like, e.g. 3 weeks
         if isinstance(offset, Tick):
if isinstance(offset, Tick):

pandas/core/groupby/generic.py

-1
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,6 @@ def _transform_general(self, func, *args, **kwargs):
553553
result = maybe_downcast_numeric(result, self._selected_obj.dtype)
554554

555555
result.name = self._selected_obj.name
556-
result.index = self._selected_obj.index
557556
return result
558557

559558
def _transform_fast(self, result) -> Series:
