
Commit d864912

Merge branch 'main' into add-character-check-to-excel-cell
2 parents: 74f4f81 + 9008ee5


43 files changed: +257, -276 lines

asv_bench/benchmarks/io/csv.py (+3)

@@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine):
     def time_read_bytescsv(self, engine):
         read_csv(self.data(self.BytesIO_input), engine=engine)
 
+    def peakmem_read_csv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
 
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
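asv discovers benchmarks by method prefix: time_ methods are timed, while peakmem_ methods are run once and the peak memory of the benchmark process is recorded, so this new benchmark can catch read_csv leaks like the one fixed in tokenizer.c below. A rough standalone approximation, as a sketch only (tracemalloc stands in for asv's process-level accounting, and the data is illustrative):

    import tracemalloc
    from io import BytesIO

    import pandas as pd

    data = b"a,b,c\n" + b"1,2,3\n" * 100_000
    tracemalloc.start()
    pd.read_csv(BytesIO(data))
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f"peak traced allocations: {peak / 1e6:.1f} MB")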

ci/code_checks.sh (+2, -10)

@@ -65,16 +65,8 @@ fi
 ### DOCSTRINGS ###
 if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 
-    MSG='Validate docstrings (EX01, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-    MSG='Partially validate docstrings (EX03)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
-        pandas.Series.plot.line \
-        pandas.Series.to_sql \
-        pandas.read_json \
-        pandas.DataFrame.to_sql # There should be no backslash in the final line, please keep this comment in the last ignored function
+    MSG='Validate docstrings (EX01, EX03, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
+    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX03,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
 
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Partially validate docstrings (PR02)' ; echo $MSG

doc/source/conf.py (-1)

@@ -460,7 +460,6 @@
     "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
     "matplotlib": ("https://matplotlib.org/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
-    "py": ("https://pylib.readthedocs.io/en/latest/", None),
     "python": ("https://docs.python.org/3/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
     "pyarrow": ("https://arrow.apache.org/docs/", None),

doc/source/getting_started/install.rst (+5, -4)

@@ -277,11 +277,12 @@ Installable with ``pip install "pandas[excel]"``.
 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
-xlrd                      2.0.1              excel           Reading Excel
-xlsxwriter                3.0.5              excel           Writing Excel
-openpyxl                  3.1.0              excel           Reading / writing for xlsx files
+xlrd                      2.0.1              excel           Reading for xls files
+xlsxwriter                3.0.5              excel           Writing for xlsx files
+openpyxl                  3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
+odfpy                     1.4.1              excel           Reading / writing for OpenDocument 1.2 files
 ========================= ================== =============== =============================================================
 
 HTML
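pandas chooses the reader from the file extension, and each backend in the table can also be selected explicitly via the engine argument. A quick sketch (the file names are placeholders):

    import pandas as pd

    df = pd.read_excel("report.xlsx")                        # openpyxl by default for xlsx
    old = pd.read_excel("legacy.xls", engine="xlrd")         # xls goes through xlrd
    alt = pd.read_excel("report.xlsx", engine="calamine")    # python-calamine backend
    df.to_excel("out.xlsx", engine="xlsxwriter", index=False)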

doc/source/reference/frame.rst (+1)

@@ -49,6 +49,7 @@ Conversion
    DataFrame.infer_objects
    DataFrame.copy
    DataFrame.bool
+   DataFrame.to_numpy
 
 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
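For reference, the newly listed DataFrame.to_numpy returns a single ndarray whose dtype is wide enough to hold every column:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.5]})
    arr = df.to_numpy()
    print(arr.dtype)  # float64, the common dtype of the int and float columns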

doc/source/user_guide/io.rst (+2, -2)

@@ -61,8 +61,8 @@ Basic
 +++++
 
 filepath_or_buffer : various
-  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`,
-  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3
+  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`)
+  URL (including http, ftp, and S3
   locations), or any object with a ``read()`` method (such as an open file or
   :class:`~python:io.StringIO`).
 sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table`
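The three accepted input forms, sketched (the path and URL are placeholders):

    from io import StringIO
    from pathlib import Path

    import pandas as pd

    pd.read_csv(Path("data.csv"))                # str or pathlib.Path
    pd.read_csv("https://example.com/data.csv")  # URL (http, ftp, S3, ...)
    pd.read_csv(StringIO("a,b\n1,2\n"))          # any object with a read() method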

doc/source/whatsnew/v2.2.1.rst (+5)

@@ -13,7 +13,12 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed memory leak in :func:`read_csv` (:issue:`57039`)
+- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
+- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.bug_fixes:
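The second :issue:`57040` entry covers groups whose data genuinely contains the dtype's extreme value. The shape of the failing case, with values chosen for illustration:

    import numpy as np
    import pandas as pd

    vals = np.array([np.iinfo(np.int64).max, 1], dtype=np.int64)
    df = pd.DataFrame({"g": ["a", "a"], "v": vals})
    # The maximum sits at position 0; seeding the running maximum with the
    # dtype's max used to mean the comparison never fired (see groupby.pyx below).
    print(df.groupby("g")["v"].idxmax())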

doc/source/whatsnew/v3.0.0.rst (+3, -1)

@@ -84,7 +84,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more
 
 Other API changes
 ^^^^^^^^^^^^^^^^^
--
+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -105,8 +105,10 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
 - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
 - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
+- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 -
 
 .. ---------------------------------------------------------------------------
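Migrating off py.path is mechanical; a sketch (the file name is a placeholder):

    from pathlib import Path

    import pandas as pd

    # before: pd.read_csv(py.path.local("data.csv"))  # no longer explicitly supported
    df = pd.read_csv(Path("data.csv"))                # pathlib.Path works everywhere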

pandas/_libs/groupby.pyx (+10, -7)

@@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
         Py_ssize_t i, j, N, K, lab
         numeric_object_t val
         numeric_object_t[:, ::1] group_min_or_max
+        uint8_t[:, ::1] seen
         bint uses_mask = mask is not None
         bint isna_entry
         bint compute_max = name == "idxmax"
@@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(
 
     if numeric_object_t is object:
         group_min_or_max = np.empty((<object>out).shape, dtype=object)
+        seen = np.zeros((<object>out).shape, dtype=np.uint8)
     else:
         group_min_or_max = np.empty_like(out, dtype=values.dtype)
-        if N > 0 and K > 0:
-            # When N or K is zero, we never use group_min_or_max
-            group_min_or_max[:] = _get_min_or_max(
-                values[0, 0], compute_max, is_datetimelike
-            )
+        seen = np.zeros_like(out, dtype=np.uint8)
 
     # When using transform, we need a valid value for take in the case
     # a category is not observed; these values will be dropped
@@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
                 if not skipna and out[lab, j] == -1:
                     # Once we've hit NA there is no going back
                     continue
+
                 val = values[i, j]
 
                 if uses_mask:
@@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if isna_entry:
-                    if not skipna:
+                    if not skipna or not seen[lab, j]:
                         out[lab, j] = -1
                 else:
-                    if compute_max:
+                    if not seen[lab, j]:
+                        seen[lab, j] = True
+                        group_min_or_max[lab, j] = val
+                        out[lab, j] = i
+                    elif compute_max:
                         if val > group_min_or_max[lab, j]:
                             group_min_or_max[lab, j] = val
                             out[lab, j] = i
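Previously group_min_or_max was seeded with a dtype-dependent sentinel via _get_min_or_max(values[0, 0], ...), so a group whose real extreme equaled that sentinel could never win the val > / val < comparison. The fix tracks a per-(group, column) seen flag and accepts the first observed value unconditionally. The same idea in pure Python, for idxmax only and ignoring NA handling (a sketch, not the Cython code):

    def idxmax_seen(labels, values, n_groups):
        best = [None] * n_groups   # running maximum per group
        out = [-1] * n_groups      # positional index of the maximum
        seen = [False] * n_groups  # replaces the old sentinel seeding
        for i, (lab, val) in enumerate(zip(labels, values)):
            if not seen[lab]:      # first value always wins, even when it
                seen[lab] = True   # equals the dtype's extreme
                best[lab] = val
                out[lab] = i
            elif val > best[lab]:
                best[lab] = val
                out[lab] = i
        return out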

pandas/_libs/hashtable_class_helper.pxi.in (+3)

@@ -933,6 +933,9 @@ cdef class StringHashTable(HashTable):
         kh_destroy_str(self.table)
         self.table = NULL
 
+    def __len__(self) -> int:
+        return self.table.size
+
     def sizeof(self, deep: bool = False) -> int:
         overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
         for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)

pandas/_libs/index.pyi (+1)

@@ -50,6 +50,7 @@ class UInt32Engine(IndexEngine): ...
 class UInt16Engine(IndexEngine): ...
 class UInt8Engine(IndexEngine): ...
 class ObjectEngine(IndexEngine): ...
+class StringEngine(IndexEngine): ...
 class DatetimeEngine(Int64Engine): ...
 class TimedeltaEngine(DatetimeEngine): ...
 class PeriodEngine(Int64Engine): ...

pandas/_libs/index.pyx (+11)

@@ -533,6 +533,17 @@ cdef class ObjectEngine(IndexEngine):
         return loc
 
 
+cdef class StringEngine(IndexEngine):
+
+    cdef _make_hash_table(self, Py_ssize_t n):
+        return _hash.StringHashTable(n)
+
+    cdef _check_type(self, object val):
+        if not isinstance(val, str):
+            raise KeyError(val)
+        return str(val)
+
+
 cdef class DatetimeEngine(Int64Engine):
 
     cdef:
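StringEngine backs the "indexing operations for string dtypes" entry in the v3.0.0 whatsnew (:issue:`56997`): lookups go through the string-specialized StringHashTable (which gains __len__ above) rather than the generic object table, and _check_type rejects non-string keys up front. Roughly the observable behavior (the engine selection itself is internal):

    import pandas as pd

    idx = pd.Index(["apple", "banana", "cherry"], dtype="string")
    print(idx.get_loc("banana"))  # 1
    try:
        idx.get_loc(42)           # non-str key fails the type check
    except KeyError as exc:
        print("missing:", exc)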

pandas/_libs/src/parser/tokenizer.c (+9)

@@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) {
 
 parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); }
 
+static void parser_clear_data_buffers(parser_t *self) {
+  free_if_not_null((void *)&self->stream);
+  free_if_not_null((void *)&self->words);
+  free_if_not_null((void *)&self->word_starts);
+  free_if_not_null((void *)&self->line_start);
+  free_if_not_null((void *)&self->line_fields);
+}
+
 static void parser_cleanup(parser_t *self) {
   // XXX where to put this
   free_if_not_null((void *)&self->error_msg);
@@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) {
     self->skipset = NULL;
   }
 
+  parser_clear_data_buffers(self);
   if (self->cb_cleanup != NULL) {
     self->cb_cleanup(self->source);
     self->cb_cleanup = NULL;
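This is the read_csv memory leak from the v2.2.1 whatsnew (:issue:`57039`): parser_cleanup freed error_msg and the skip set but left the token stream and the word/line bookkeeping buffers allocated. One rough way to watch for such a leak from Python (a sketch; ru_maxrss is POSIX-only, reported in KB on Linux and bytes on macOS, and the sizes are illustrative):

    import resource
    from io import BytesIO

    import pandas as pd

    data = b"a,b\n" + b"1,2\n" * 200_000
    for i in range(10):
        pd.read_csv(BytesIO(data), engine="c")
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(i, peak)  # a leak shows up as a peak that climbs every iteration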

pandas/_testing/__init__.py (-2)

@@ -34,7 +34,6 @@
     Series,
 )
 from pandas._testing._io import (
-    round_trip_localpath,
     round_trip_pathlib,
     round_trip_pickle,
     write_to_compressed,
@@ -609,7 +608,6 @@ def shares_memory(left, right) -> bool:
     "OBJECT_DTYPES",
     "raise_assert_detail",
     "raises_chained_assignment_error",
-    "round_trip_localpath",
     "round_trip_pathlib",
     "round_trip_pickle",
     "setitem",

pandas/_testing/_io.py (-29)

@@ -89,35 +89,6 @@ def round_trip_pathlib(writer, reader, path: str | None = None):
     return obj
 
 
-def round_trip_localpath(writer, reader, path: str | None = None):
-    """
-    Write an object to file specified by a py.path LocalPath and read it back.
-
-    Parameters
-    ----------
-    writer : callable bound to pandas object
-        IO writing function (e.g. DataFrame.to_csv )
-    reader : callable
-        IO reading function (e.g. pd.read_csv )
-    path : str, default None
-        The path where the object is written and then read.
-
-    Returns
-    -------
-    pandas object
-        The original object that was serialized and then re-read.
-    """
-    import pytest
-
-    LocalPath = pytest.importorskip("py.path").local
-    if path is None:
-        path = "___localpath___"
-    with ensure_clean(path) as path:
-        writer(LocalPath(path))
-        obj = reader(LocalPath(path))
-    return obj
-
-
 def write_to_compressed(compression, path, data, dest: str = "test") -> None:
     """
     Write data to a compressed file.

pandas/_testing/_warnings.py (+6, -1)

@@ -220,7 +220,12 @@ def _assert_raised_with_correct_stacklevel(
    frame = inspect.currentframe()
    for _ in range(4):
        frame = frame.f_back  # type: ignore[union-attr]
-   caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+   try:
+       caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+   finally:
+       # See note in
+       # https://docs.python.org/3/library/inspect.html#inspect.Traceback
+       del frame
    msg = (
        "Warning not set with correct stacklevel. "
        f"File where warning is raised: {actual_warning.filename} != "

pandas/compat/numpy/function.py (+2)

@@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) ->
 ARGSORT_DEFAULTS["kind"] = "quicksort"
 ARGSORT_DEFAULTS["order"] = None
 ARGSORT_DEFAULTS["kind"] = None
+ARGSORT_DEFAULTS["stable"] = None
 
 
 validate_argsort = CompatValidator(
@@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) ->
 ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {}
 ARGSORT_DEFAULTS_KIND["axis"] = -1
 ARGSORT_DEFAULTS_KIND["order"] = None
+ARGSORT_DEFAULTS_KIND["stable"] = None
 validate_argsort_kind = CompatValidator(
     ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both"
 )
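NumPy 2.0 adds a stable keyword to np.sort/np.argsort; registering stable=None as an accepted default lets argsort calls that forward NumPy's keyword defaults to pandas methods pass the validator unchanged. In plain NumPy (2.0 or newer):

    import numpy as np

    arr = np.array([3, 1, 2])
    print(np.argsort(arr, stable=True))  # [1 2 0], stable sort guaranteed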

pandas/core/frame.py (+5, -7)

@@ -8012,19 +8012,17 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
         left = self
 
         # GH#31623, only operate on shared columns
-        cols, lcols, rcols = left.columns.join(
-            right.columns, how="inner", level=None, return_indexers=True
+        cols, lcol_indexer, rcol_indexer = left.columns.join(
+            right.columns, how="inner", return_indexers=True
         )
 
-        new_left = left.iloc[:, lcols]
-        new_right = right.iloc[:, rcols]
+        new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
+        new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
         result = op(new_left, new_right)
 
         # Do the join on the columns instead of using left._align_for_op
         # to avoid constructing two potentially large/sparse DataFrames
-        join_columns, _, _ = left.columns.join(
-            right.columns, how="outer", level=None, return_indexers=True
-        )
+        join_columns = left.columns.join(right.columns, how="outer")
 
         if result.columns.has_duplicates:
             # Avoid reindexing with a duplicate axis.
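The rewrite leans on Index.join's contract that, with return_indexers=True, an indexer comes back as None when the joined result is already identical to that input, so the iloc take can be skipped entirely; this pairs with the Index.join cache-propagation entry above (:issue:`57023`). For example:

    import pandas as pd

    left = pd.Index(["a", "b", "c"])
    right = pd.Index(["a", "b", "c"])
    joined, lidx, ridx = left.join(right, how="inner", return_indexers=True)
    print(lidx, ridx)  # None None: both sides already align, nothing to take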
