Skip to content

Commit d358efc

Browse files
committed
Merge branch 'master' into 42916
2 parents 76cd5c6 + cd13e3a commit d358efc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+930
-415
lines changed

.github/workflows/database.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ on:
44
push:
55
branches:
66
- master
7+
- 1.3.x
78
pull_request:
89
branches:
910
- master
@@ -79,7 +80,7 @@ jobs:
7980
- uses: conda-incubator/setup-miniconda@v2
8081
with:
8182
activate-environment: pandas-dev
82-
channel-priority: flexible
83+
channel-priority: strict
8384
environment-file: ${{ matrix.ENV_FILE }}
8485
use-only-tar-bz2: true
8586

asv_bench/benchmarks/algorithms.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ def setup(self, unique, sort, dtype):
4444
raise NotImplementedError
4545

4646
data = {
47-
"int": pd.Int64Index(np.arange(N)),
48-
"uint": pd.UInt64Index(np.arange(N)),
49-
"float": pd.Float64Index(np.random.randn(N)),
47+
"int": pd.Index(np.arange(N), dtype="int64"),
48+
"uint": pd.Index(np.arange(N), dtype="uint64"),
49+
"float": pd.Index(np.random.randn(N), dtype="float64"),
5050
"object": string_index,
5151
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
5252
"datetime64[ns, tz]": pd.date_range(
@@ -76,9 +76,9 @@ class Duplicated:
7676
def setup(self, unique, keep, dtype):
7777
N = 10 ** 5
7878
data = {
79-
"int": pd.Int64Index(np.arange(N)),
80-
"uint": pd.UInt64Index(np.arange(N)),
81-
"float": pd.Float64Index(np.random.randn(N)),
79+
"int": pd.Index(np.arange(N), dtype="int64"),
80+
"uint": pd.Index(np.arange(N), dtype="uint64"),
81+
"float": pd.Index(np.random.randn(N), dtype="float64"),
8282
"string": tm.makeStringIndex(N),
8383
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
8484
"datetime64[ns, tz]": pd.date_range(

asv_bench/benchmarks/groupby.py

+32
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,38 @@ def time_sum(self):
603603
self.df.groupby(["a"])["b"].sum()
604604

605605

606+
class String:
607+
# GH#41596
608+
param_names = ["dtype", "method"]
609+
params = [
610+
["str", "string[python]"],
611+
[
612+
"sum",
613+
"prod",
614+
"min",
615+
"max",
616+
"mean",
617+
"median",
618+
"var",
619+
"first",
620+
"last",
621+
"any",
622+
"all",
623+
],
624+
]
625+
626+
def setup(self, dtype, method):
627+
cols = list("abcdefghjkl")
628+
self.df = DataFrame(
629+
np.random.randint(0, 100, size=(1_000_000, len(cols))),
630+
columns=cols,
631+
dtype=dtype,
632+
)
633+
634+
def time_str_func(self, dtype, method):
635+
self.df.groupby("a")[self.df.columns[1:]].agg(method)
636+
637+
606638
class Categories:
607639
def setup(self):
608640
N = 10 ** 5

asv_bench/benchmarks/indexing_engines.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def setup(self, engine_and_dtype, index_type):
4848
"non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
4949
}[index_type]
5050

51-
self.data = engine(lambda: arr, len(arr))
51+
self.data = engine(arr)
5252
# code below avoids populating the mapping etc. while timing.
5353
self.data.get_loc(2)
5454

@@ -70,7 +70,7 @@ def setup(self, index_type):
7070
"non_monotonic": np.array(list("abc") * N, dtype=object),
7171
}[index_type]
7272

73-
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
73+
self.data = libindex.ObjectEngine(arr)
7474
# code below avoids populating the mapping etc. while timing.
7575
self.data.get_loc("b")
7676

asv_bench/benchmarks/io/style.py

+14
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ def peakmem_format_render(self, cols, rows):
4242
self._style_format()
4343
self.st._render_html(True, True)
4444

45+
def time_apply_format_hide_render(self, cols, rows):
46+
self._style_apply_format_hide()
47+
self.st._render_html(True, True)
48+
49+
def peakmem_apply_format_hide_render(self, cols, rows):
50+
self._style_apply_format_hide()
51+
self.st._render_html(True, True)
52+
4553
def _style_apply(self):
4654
def _apply_func(s):
4755
return [
@@ -63,3 +71,9 @@ def _style_format(self):
6371
self.st = self.df.style.format(
6472
"{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"]
6573
)
74+
75+
def _style_apply_format_hide(self):
76+
self.st = self.df.style.applymap(lambda v: "color: red;")
77+
self.st.format("{:.3f}")
78+
self.st.hide_index(self.st.index[1:])
79+
self.st.hide_columns(self.st.columns[1:])

asv_bench/benchmarks/sparse.py

+15
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,19 @@ def time_min_max(self, func, fill_value):
180180
getattr(self.sp_arr, func)()
181181

182182

183+
class Take:
184+
185+
params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False])
186+
param_names = ["indices", "allow_fill"]
187+
188+
def setup(self, indices, allow_fill):
189+
N = 1_000_000
190+
fill_value = 0.0
191+
arr = make_array(N, 1e-5, fill_value, np.float64)
192+
self.sp_arr = SparseArray(arr, fill_value=fill_value)
193+
194+
def time_take(self, indices, allow_fill):
195+
self.sp_arr.take(indices, allow_fill=allow_fill)
196+
197+
183198
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.3.4.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ Fixed regressions
1717
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
1818
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
20+
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
2021
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2122
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
22-
-
23+
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
24+
- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`)
2325

2426
.. ---------------------------------------------------------------------------
2527
@@ -28,6 +30,7 @@ Fixed regressions
2830
Bug fixes
2931
~~~~~~~~~
3032
- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)
33+
- Fixed bug in :meth:`Series.aggregate` not passing the first ``args`` to the user supplied ``func`` in certain cases (:issue:`43357`)
3134

3235
.. ---------------------------------------------------------------------------
3336

doc/source/whatsnew/v1.4.0.rst

+6-2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Styler
7979
- Keyword arguments ``level`` and ``names`` added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for additional control of visibility of MultiIndexes and index names (:issue:`25475`, :issue:`43404`, :issue:`43346`)
8080
- Global options have been extended to configure default ``Styler`` properties including formatting and encoding and mathjax options and LaTeX (:issue:`41395`)
8181
- Naive sparsification is now possible for LaTeX without the multirow package (:issue:`43369`)
82+
- :meth:`Styler.to_html` omits CSSStyle rules for hidden table elements (:issue:`43619`)
8283

8384
Formerly Styler relied on ``display.html.use_mathjax``, which has now been replaced by ``styler.html.mathjax``.
8485

@@ -123,6 +124,7 @@ Other enhancements
123124
- Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)
124125
- Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`)
125126
- Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`)
127+
- :meth:`IntegerArray.all`, :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`)
126128
- Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`)
127129
-
128130

@@ -334,7 +336,7 @@ Other Deprecations
334336
- Deprecated the 'include_start' and 'include_end' arguments in :meth:`DataFrame.between_time`; in a future version passing 'include_start' or 'include_end' will raise (:issue:`40245`)
335337
- Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`)
336338
- Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`)
337-
-
339+
- Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`)
338340

339341
.. ---------------------------------------------------------------------------
340342
@@ -354,7 +356,8 @@ Performance improvements
354356
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
355357
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
356358
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
357-
-
359+
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
360+
- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
358361

359362
.. ---------------------------------------------------------------------------
360363
@@ -486,6 +489,7 @@ Reshaping
486489
- Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`)
487490
- Bug in :func:`concat` of ``bool`` and ``boolean`` dtypes resulting in ``object`` dtype instead of ``boolean`` dtype (:issue:`42800`)
488491
- Bug in :func:`crosstab` when inputs are categorical Series, there are categories that are not present in one or both of the Series, and ``margins=True``. Previously the margin value for missing categories was ``NaN``. It is now correctly reported as 0 (:issue:`43505`)
492+
- Bug in :func:`concat` failing when the ``objs`` argument all had the same index and the ``keys`` argument contained duplicates (:issue:`43595`)
489493

490494
Sparse
491495
^^^^^^

pandas/_libs/index.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ from pandas import MultiIndex
66

77
class IndexEngine:
88
over_size_threshold: bool
9-
def __init__(self, vgetter, n: int): ...
9+
def __init__(self, values: np.ndarray): ...
1010
def __contains__(self, val: object) -> bool: ...
1111
# -> int | slice | np.ndarray[bool]
1212
def get_loc(self, val: object) -> int | slice | np.ndarray: ...

0 commit comments

Comments
 (0)