Skip to content

Commit ceff465

Browse files
authored
Merge branch 'main' into main
2 parents 22a462c + aa1f96b commit ceff465

File tree

22 files changed

+269
-1061
lines changed

22 files changed

+269
-1061
lines changed

.github/workflows/wheels.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
fetch-depth: 0
7272

7373
- name: Build wheels
74-
uses: pypa/cibuildwheel@v2.12.1
74+
uses: pypa/cibuildwheel@v2.12.3
7575
env:
7676
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
7777

.gitpod.yml

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ tasks:
1515
git fetch --tags
1616
python setup.py build_ext --inplace -j 4
1717
echo "🛠 Completed rebuilding Pandas!! 🛠 "
18+
pre-commit install
1819
echo "✨ Pre-build complete! You can close this terminal ✨ "
1920
2021
# --------------------------------------------------------

doc/source/whatsnew/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Version 2.0
2424
.. toctree::
2525
:maxdepth: 2
2626

27+
v2.0.2
2728
v2.0.1
2829
v2.0.0
2930

doc/source/whatsnew/v2.0.2.rst

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
.. _whatsnew_202:
2+
3+
What's new in 2.0.2 (May ..., 2023)
4+
-----------------------------------
5+
6+
These are the changes in pandas 2.0.2. See :ref:`release` for a full changelog
7+
including other versions of pandas.
8+
9+
{{ header }}
10+
11+
.. ---------------------------------------------------------------------------
12+
.. _whatsnew_202.regressions:
13+
14+
Fixed regressions
15+
~~~~~~~~~~~~~~~~~
16+
-
17+
18+
.. ---------------------------------------------------------------------------
19+
.. _whatsnew_202.bug_fixes:
20+
21+
Bug fixes
22+
~~~~~~~~~
23+
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame` objects of incorrect sizes when called on slices (:issue:`52824`)
24+
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
25+
-
26+
27+
.. ---------------------------------------------------------------------------
28+
.. _whatsnew_202.other:
29+
30+
Other
31+
~~~~~
32+
-
33+
34+
.. ---------------------------------------------------------------------------
35+
.. _whatsnew_202.contributors:
36+
37+
Contributors
38+
~~~~~~~~~~~~
39+
40+
.. contributors:: v2.0.1..v2.0.2|HEAD

doc/source/whatsnew/v2.1.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,11 @@ Timezones
313313

314314
Numeric
315315
^^^^^^^
316+
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
317+
- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
316318
- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
317319
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
320+
- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`)
318321
-
319322

320323
Conversion

pandas/_typing.py

+1
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ def closed(self) -> bool:
419419
AlignJoin = Literal["outer", "inner", "left", "right"]
420420
DtypeBackend = Literal["pyarrow", "numpy_nullable"]
421421

422+
TimeUnit = Literal["s", "ms", "us", "ns"]
422423
OpenFileErrors = Literal[
423424
"strict",
424425
"ignore",

pandas/core/generic.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
TimedeltaConvertibleTypes,
8383
TimeNonexistent,
8484
TimestampConvertibleTypes,
85+
TimeUnit,
8586
ValueKeyFunc,
8687
WriteBuffer,
8788
WriteExcelBuffer,
@@ -2284,7 +2285,7 @@ def to_json(
22842285
date_format: str | None = None,
22852286
double_precision: int = 10,
22862287
force_ascii: bool_t = True,
2287-
date_unit: str = "ms",
2288+
date_unit: TimeUnit = "ms",
22882289
default_handler: Callable[[Any], JSONSerializable] | None = None,
22892290
lines: bool_t = False,
22902291
compression: CompressionOptions = "infer",
@@ -2564,11 +2565,11 @@ def to_hdf(
25642565
self,
25652566
path_or_buf: FilePath | HDFStore,
25662567
key: str,
2567-
mode: str = "a",
2568+
mode: Literal["a", "w", "r+"] = "a",
25682569
complevel: int | None = None,
2569-
complib: str | None = None,
2570+
complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None,
25702571
append: bool_t = False,
2571-
format: str | None = None,
2572+
format: Literal["fixed", "table"] | None = None,
25722573
index: bool_t = True,
25732574
min_itemsize: int | dict[str, int] | None = None,
25742575
nan_rep=None,

pandas/core/groupby/groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4349,7 +4349,7 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
43494349
_apply_groupings_depr = (
43504350
"{}.apply operated on the grouping columns. This behavior is deprecated, "
43514351
"and in a future version of pandas the grouping columns will be excluded "
4352-
"from the operation. Select the columns to operate on after groupby to"
4352+
"from the operation. Select the columns to operate on after groupby to "
43534353
"either explicitly include or exclude the groupings and silence "
43544354
"this warning."
43554355
)

pandas/core/interchange/from_dataframe.py

+34-70
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import numpy as np
88

9+
from pandas.compat._optional import import_optional_dependency
10+
911
import pandas as pd
1012
from pandas.core.interchange.dataframe_protocol import (
1113
Buffer,
@@ -23,7 +25,7 @@
2325
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
2426
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
2527
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
26-
DtypeKind.BOOL: {8: bool},
28+
DtypeKind.BOOL: {1: bool, 8: bool},
2729
}
2830

2931

@@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
154156
buffers = col.get_buffers()
155157

156158
data_buff, data_dtype = buffers["data"]
157-
data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
159+
data = buffer_to_ndarray(
160+
data_buff, data_dtype, offset=col.offset, length=col.size()
161+
)
158162

159163
data = set_nulls(data, col, buffers["validity"])
160164
return data, buffers
@@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
192196
buffers = col.get_buffers()
193197

194198
codes_buff, codes_dtype = buffers["data"]
195-
codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
199+
codes = buffer_to_ndarray(
200+
codes_buff, codes_dtype, offset=col.offset, length=col.size()
201+
)
196202

197203
# Doing modulo in order to not get ``IndexError`` for
198204
# out-of-bounds sentinel values in `codes`
@@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
252258
Endianness.NATIVE,
253259
)
254260
# Specify zero offset as we don't want to chunk the string data
255-
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
261+
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
256262

257263
# Retrieve the offsets buffer containing the index offsets demarcating
258264
# the beginning and the ending of each string
@@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
261267
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
262268
# here to pass a proper offsets buffer size
263269
offsets = buffer_to_ndarray(
264-
offset_buff, offset_dtype, col.offset, length=col.size() + 1
270+
offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
265271
)
266272

267273
null_pos = None
268274
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
269275
assert buffers["validity"], "Validity buffers cannot be empty for masks"
270276
valid_buff, valid_dtype = buffers["validity"]
271-
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
277+
null_pos = buffer_to_ndarray(
278+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
279+
)
272280
if sentinel_val == 0:
273281
null_pos = ~null_pos
274282

@@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
356364
getattr(ArrowCTypes, f"UINT{dtype[1]}"),
357365
Endianness.NATIVE,
358366
),
359-
col.offset,
360-
col.size(),
367+
offset=col.offset,
368+
length=col.size(),
361369
)
362370

363371
data = parse_datetime_format_str(format_str, data)
@@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
368376
def buffer_to_ndarray(
369377
buffer: Buffer,
370378
dtype: tuple[DtypeKind, int, str, str],
379+
*,
380+
length: int,
371381
offset: int = 0,
372-
length: int | None = None,
373382
) -> np.ndarray:
374383
"""
375384
Build a NumPy array from the passed buffer.
@@ -406,74 +415,27 @@ def buffer_to_ndarray(
406415
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
407416
# it since https://github.com/numpy/numpy/pull/19083
408417
ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
409-
data_pointer = ctypes.cast(
410-
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
411-
)
412418

413419
if bit_width == 1:
414420
assert length is not None, "`length` must be specified for a bit-mask buffer."
415-
arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
416-
return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
421+
pa = import_optional_dependency("pyarrow")
422+
arr = pa.BooleanArray.from_buffers(
423+
pa.bool_(),
424+
length,
425+
[None, pa.foreign_buffer(buffer.ptr, length)],
426+
offset=offset,
427+
)
428+
return np.asarray(arr)
417429
else:
430+
data_pointer = ctypes.cast(
431+
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
432+
)
418433
return np.ctypeslib.as_array(
419-
data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
434+
data_pointer,
435+
shape=(length,),
420436
)
421437

422438

423-
def bitmask_to_bool_ndarray(
424-
bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
425-
) -> np.ndarray:
426-
"""
427-
Convert bit-mask to a boolean NumPy array.
428-
429-
Parameters
430-
----------
431-
bitmask : np.ndarray[uint8]
432-
NumPy array of uint8 dtype representing the bitmask.
433-
mask_length : int
434-
Number of elements in the mask to interpret.
435-
first_byte_offset : int, default: 0
436-
Number of elements to offset from the start of the first byte.
437-
438-
Returns
439-
-------
440-
np.ndarray[bool]
441-
"""
442-
bytes_to_skip = first_byte_offset // 8
443-
bitmask = bitmask[bytes_to_skip:]
444-
first_byte_offset %= 8
445-
446-
bool_mask = np.zeros(mask_length, dtype=bool)
447-
448-
# Processing the first byte separately as it has its own offset
449-
val = bitmask[0]
450-
mask_idx = 0
451-
bits_in_first_byte = min(8 - first_byte_offset, mask_length)
452-
for j in range(bits_in_first_byte):
453-
if val & (1 << (j + first_byte_offset)):
454-
bool_mask[mask_idx] = True
455-
mask_idx += 1
456-
457-
# `mask_length // 8` describes how many full bytes to process
458-
for i in range((mask_length - bits_in_first_byte) // 8):
459-
# doing `+ 1` as we already processed the first byte
460-
val = bitmask[i + 1]
461-
for j in range(8):
462-
if val & (1 << j):
463-
bool_mask[mask_idx] = True
464-
mask_idx += 1
465-
466-
if len(bitmask) > 1:
467-
# Processing reminder of last byte
468-
val = bitmask[-1]
469-
for j in range(len(bool_mask) - mask_idx):
470-
if val & (1 << j):
471-
bool_mask[mask_idx] = True
472-
mask_idx += 1
473-
474-
return bool_mask
475-
476-
477439
def set_nulls(
478440
data: np.ndarray | pd.Series,
479441
col: Column,
@@ -509,7 +471,9 @@ def set_nulls(
509471
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
510472
assert validity, "Expected to have a validity buffer for the mask"
511473
valid_buff, valid_dtype = validity
512-
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
474+
null_pos = buffer_to_ndarray(
475+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
476+
)
513477
if sentinel_val == 0:
514478
null_pos = ~null_pos
515479
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):

pandas/core/nanops.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,8 @@ def nanmean(
716716
dtype_count = dtype
717717

718718
count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
719-
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
719+
the_sum = values.sum(axis, dtype=dtype_sum)
720+
the_sum = _ensure_numeric(the_sum)
720721

721722
if axis is not None and getattr(the_sum, "ndim", False):
722723
count = cast(np.ndarray, count)
@@ -775,6 +776,11 @@ def get_median(x, _mask=None):
775776
dtype = values.dtype
776777
values, mask = _get_values(values, skipna, mask=mask, fill_value=0)
777778
if values.dtype.kind != "f":
779+
if values.dtype == object:
780+
# GH#34671 avoid casting strings to numeric
781+
inferred = lib.infer_dtype(values)
782+
if inferred in ["string", "mixed"]:
783+
raise TypeError(f"Cannot convert {values} to numeric")
778784
try:
779785
values = values.astype("f8")
780786
except ValueError as err:
@@ -1659,6 +1665,10 @@ def _ensure_numeric(x):
16591665
if x.dtype.kind in "biu":
16601666
x = x.astype(np.float64)
16611667
elif x.dtype == object:
1668+
inferred = lib.infer_dtype(x)
1669+
if inferred in ["string", "mixed"]:
1670+
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
1671+
raise TypeError(f"Could not convert {x} to numeric")
16621672
try:
16631673
x = x.astype(np.complex128)
16641674
except (TypeError, ValueError):
@@ -1671,6 +1681,9 @@ def _ensure_numeric(x):
16711681
if not np.any(np.imag(x)):
16721682
x = x.real
16731683
elif not (is_float(x) or is_integer(x) or is_complex(x)):
1684+
if isinstance(x, str):
1685+
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
1686+
raise TypeError(f"Could not convert string '{x}' to numeric")
16741687
try:
16751688
x = float(x)
16761689
except (TypeError, ValueError):

pandas/io/formats/format.py

-32
Original file line numberDiff line numberDiff line change
@@ -1022,38 +1022,6 @@ class DataFrameRenderer:
10221022
def __init__(self, fmt: DataFrameFormatter) -> None:
10231023
self.fmt = fmt
10241024

1025-
def to_latex(
1026-
self,
1027-
buf: FilePath | WriteBuffer[str] | None = None,
1028-
column_format: str | None = None,
1029-
longtable: bool = False,
1030-
encoding: str | None = None,
1031-
multicolumn: bool = False,
1032-
multicolumn_format: str | None = None,
1033-
multirow: bool = False,
1034-
caption: str | tuple[str, str] | None = None,
1035-
label: str | None = None,
1036-
position: str | None = None,
1037-
) -> str | None:
1038-
"""
1039-
Render a DataFrame to a LaTeX tabular/longtable environment output.
1040-
"""
1041-
from pandas.io.formats.latex import LatexFormatter
1042-
1043-
latex_formatter = LatexFormatter(
1044-
self.fmt,
1045-
longtable=longtable,
1046-
column_format=column_format,
1047-
multicolumn=multicolumn,
1048-
multicolumn_format=multicolumn_format,
1049-
multirow=multirow,
1050-
caption=caption,
1051-
label=label,
1052-
position=position,
1053-
)
1054-
string = latex_formatter.to_string()
1055-
return save_to_buffer(string, buf=buf, encoding=encoding)
1056-
10571025
def to_html(
10581026
self,
10591027
buf: FilePath | WriteBuffer[str] | None = None,

0 commit comments

Comments
 (0)