Skip to content

Commit 7e4e5e8

Browse files
committed
Merge remote-tracking branch 'upstream/main' into warn_iloc_inplace
2 parents 49da3ce + fe9e5d0 commit 7e4e5e8

File tree

174 files changed

+1274
-620
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

174 files changed

+1274
-620
lines changed

.github/workflows/code-checks.yml

+3
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ jobs:
153153
- name: Build image
154154
run: docker build --pull --no-cache --tag pandas-dev-env .
155155

156+
- name: Show environment
157+
run: docker run -w /home/pandas pandas-dev-env mamba run -n pandas-dev python -c "import pandas as pd; print(pd.show_versions())"
158+
156159
requirements-dev-text-installable:
157160
name: Test install requirements-dev.txt
158161
runs-on: ubuntu-latest

.github/workflows/macos-windows.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
defaults:
2929
run:
3030
shell: bash -el {0}
31-
timeout-minutes: 120
31+
timeout-minutes: 180
3232
strategy:
3333
matrix:
3434
os: [macos-latest, windows-latest]

.github/workflows/ubuntu.yml

+17-3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
defaults:
2727
run:
2828
shell: bash -el {0}
29-
timeout-minutes: 120
29+
timeout-minutes: 180
3030
strategy:
3131
matrix:
3232
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
@@ -42,18 +42,26 @@ jobs:
4242
- name: "Minimum Versions"
4343
env_file: actions-38-minimum_versions.yaml
4444
pattern: "not slow and not network and not single_cpu"
45-
- name: "Locale: it_IT.utf8"
45+
- name: "Locale: it_IT"
4646
env_file: actions-38.yaml
4747
pattern: "not slow and not network and not single_cpu"
4848
extra_apt: "language-pack-it"
49+
# Use the utf8 version as the default; it has no bad side effects.
4950
lang: "it_IT.utf8"
5051
lc_all: "it_IT.utf8"
51-
- name: "Locale: zh_CN.utf8"
52+
# Also install it_IT (its encoding is ISO8859-1) but do not activate it.
53+
# It will be temporarily activated during tests with locale.setlocale
54+
extra_loc: "it_IT"
55+
- name: "Locale: zh_CN"
5256
env_file: actions-38.yaml
5357
pattern: "not slow and not network and not single_cpu"
5458
extra_apt: "language-pack-zh-hans"
59+
# Use the utf8 version as the default; it has no bad side effects.
5560
lang: "zh_CN.utf8"
5661
lc_all: "zh_CN.utf8"
62+
# Also install zh_CN (its encoding is gb2312) but do not activate it.
63+
# It will be temporarily activated during tests with locale.setlocale
64+
extra_loc: "zh_CN"
5765
- name: "Copy-on-Write"
5866
env_file: actions-310.yaml
5967
pattern: "not slow and not network and not single_cpu"
@@ -148,6 +156,12 @@ jobs:
148156
# xsel for clipboard tests
149157
run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
150158

159+
- name: Generate extra locales
160+
# These extra locales will be available for locale.setlocale() calls in tests
161+
run: |
162+
sudo locale-gen ${{ matrix.extra_loc }}
163+
if: ${{ matrix.extra_loc }}
164+
151165
- name: Set up Conda
152166
uses: ./.github/actions/setup-conda
153167
with:

.libcst.codemod.yaml

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# String that LibCST should look for in code which indicates that the
2+
# module is generated code.
3+
generated_code_marker: '@generated'
4+
# Command line and arguments for invoking a code formatter. Anything
5+
# specified here must be capable of taking code via stdin and returning
6+
# formatted code via stdout.
7+
formatter: ['black', '-']
8+
# List of regex patterns which LibCST will evaluate against filenames to
9+
# determine if the module should be touched.
10+
blacklist_patterns: []
11+
# List of modules that contain codemods inside of them.
12+
modules:
13+
- 'libcst.codemod.commands'
14+
- 'autotyping'
15+
# Absolute or relative path of the repository root, used for providing
16+
# full-repo metadata. Relative paths should be specified with this file
17+
# location as the base.
18+
repo_root: '.'

.pre-commit-config.yaml

+16-1
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,14 @@ repos:
2222
hooks:
2323
- id: black
2424
- repo: https://github.com/codespell-project/codespell
25-
rev: v2.1.0
25+
rev: v2.2.1
2626
hooks:
2727
- id: codespell
2828
types_or: [python, rst, markdown]
29+
- repo: https://github.com/MarcoGorelli/cython-lint
30+
rev: v0.1.4
31+
hooks:
32+
- id: cython-lint
2933
- repo: https://github.com/pre-commit/pre-commit-hooks
3034
rev: v4.3.0
3135
hooks:
@@ -252,3 +256,14 @@ repos:
252256
/(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$
253257
|/tests/
254258
|/_testing/
259+
- id: autotyping
260+
name: autotyping
261+
entry: python -m libcst.tool codemod autotyping.AutotypeCommand --none-return --scalar-return --annotate-magics --annotate-imprecise-magics
262+
types_or: [python, pyi]
263+
files: ^pandas
264+
exclude: ^(pandas/tests|pandas/io/clipboard)
265+
language: python
266+
additional_dependencies:
267+
- autotyping==22.9.0
268+
- black==22.6.0
269+
- libcst==0.4.7

Dockerfile

+6-16
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM quay.io/condaforge/mambaforge:4.13.0-1
1+
FROM quay.io/condaforge/mambaforge
22

33
# if you forked pandas, you can pass in your own GitHub username to use your fork
44
# i.e. gh_username=myname
@@ -10,16 +10,12 @@ ENV DEBIAN_FRONTEND=noninteractive
1010

1111
# Configure apt and install packages
1212
RUN apt-get update \
13-
&& apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
13+
&& apt-get -y install --no-install-recommends apt-utils git tzdata dialog 2>&1 \
1414
#
15-
# Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime")
16-
&& apt-get -y install tzdata \
15+
# Configure timezone (fix for tests which try to read from "/etc/localtime")
1716
&& ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
1817
&& dpkg-reconfigure -f noninteractive tzdata \
1918
#
20-
# Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
21-
&& apt-get -y install git iproute2 procps iproute2 lsb-release \
22-
#
2319
# cleanup
2420
&& apt-get autoremove -y \
2521
&& apt-get clean -y \
@@ -35,18 +31,12 @@ RUN mkdir "$pandas_home" \
3531
&& git remote add upstream "https://github.com/pandas-dev/pandas.git" \
3632
&& git pull upstream main
3733

38-
# Because it is surprisingly difficult to activate a conda environment inside a DockerFile
39-
# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89),
40-
# we just update the base/root one from the 'environment.yml' file instead of creating a new one.
41-
#
4234
# Set up environment
43-
RUN mamba env update -n base -f "$pandas_home/environment.yml"
35+
RUN mamba env create -f "$pandas_home/environment.yml"
4436

4537
# Build C extensions and pandas
46-
SHELL ["/bin/bash", "-c"]
47-
RUN . /opt/conda/etc/profile.d/conda.sh \
48-
&& conda activate base \
49-
&& cd "$pandas_home" \
38+
SHELL ["mamba", "run", "--no-capture-output", "-n", "pandas-dev", "/bin/bash", "-c"]
39+
RUN cd "$pandas_home" \
5040
&& export \
5141
&& python setup.py build_ext -j 4 \
5242
&& python -m pip install --no-build-isolation -e .

asv_bench/benchmarks/array.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def time_from_float_array(self):
3232

3333
class IntegerArray:
3434
def setup(self):
35-
self.values_integer = np.array([1, 0, 1, 0])
36-
self.data = np.array([1, 2, 3, 4], dtype="int64")
37-
self.mask = np.array([False, False, True, False])
35+
N = 250_000
36+
self.values_integer = np.array([1, 0, 1, 0] * N)
37+
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
38+
self.mask = np.array([False, False, True, False] * N)
3839

3940
def time_constructor(self):
4041
pd.arrays.IntegerArray(self.data, self.mask)

asv_bench/benchmarks/groupby.py

+2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66

77
from pandas import (
8+
NA,
89
Categorical,
910
DataFrame,
1011
Index,
@@ -592,6 +593,7 @@ def setup(self, dtype, method):
592593
columns=list("abcdefghij"),
593594
dtype=dtype,
594595
)
596+
df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
595597
df["key"] = np.random.randint(0, 100, size=N)
596598
self.df = df
597599

asv_bench/benchmarks/multiindex_object.py

+15
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
MultiIndex,
99
RangeIndex,
1010
Series,
11+
array,
1112
date_range,
1213
)
1314

@@ -176,6 +177,20 @@ def time_sortlevel_one(self):
176177
self.mi.sortlevel(1)
177178

178179

180+
class SortValues:
181+
182+
params = ["int64", "Int64"]
183+
param_names = ["dtype"]
184+
185+
def setup(self, dtype):
186+
a = array(np.tile(np.arange(100), 1000), dtype=dtype)
187+
b = array(np.tile(np.arange(1000), 100), dtype=dtype)
188+
self.mi = MultiIndex.from_arrays([a, b])
189+
190+
def time_sort_values(self, dtype):
191+
self.mi.sort_values()
192+
193+
179194
class Values:
180195
def setup_cache(self):
181196

asv_bench/benchmarks/series_methods.py

+14
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44

55
from pandas import (
6+
NA,
67
Index,
78
NaT,
89
Series,
@@ -166,6 +167,19 @@ def time_value_counts(self, N, dtype):
166167
self.s.value_counts()
167168

168169

170+
class ValueCountsEA:
171+
172+
params = [[10**3, 10**4, 10**5], [True, False]]
173+
param_names = ["N", "dropna"]
174+
175+
def setup(self, N, dropna):
176+
self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64")
177+
self.s.loc[1] = NA
178+
179+
def time_value_counts(self, N, dropna):
180+
self.s.value_counts(dropna=dropna)
181+
182+
169183
class ValueCountsObjectDropNAFalse:
170184

171185
params = [10**3, 10**4, 10**5]

doc/source/development/contributing_environment.rst

+10
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,16 @@ Run Container::
237237
# Run a container and bind your local repo to the container
238238
docker run -it -w /home/pandas --rm -v path-to-local-pandas-repo:/home/pandas pandas-yourname-env
239239

240+
Then a ``pandas-dev`` virtual environment will be available with all the development dependencies.
241+
242+
.. code-block:: shell
243+
244+
root@... :/home/pandas# conda env list
245+
# conda environments:
246+
#
247+
base * /opt/conda
248+
pandas-dev /opt/conda/envs/pandas-dev
249+
240250
.. note::
241251
If you bind your local repo for the first time, you have to build the C extensions afterwards.
242252
Run the following command inside the container::

doc/source/user_guide/io.rst

+36
Original file line numberDiff line numberDiff line change
@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
31743174
However, if XPath does not reference node names such as default, ``/*``, then
31753175
``namespaces`` is not required.
31763176

3177+
.. note::
3178+
3179+
Since ``xpath`` identifies the parent of content to be parsed, only immediate
3180+
descendants which include child nodes or current attributes are parsed.
3181+
Therefore, ``read_xml`` will not parse the text of grandchildren or other
3182+
descendants and will not parse attributes of any descendant. To retrieve
3183+
lower-level content, adjust ``xpath`` to a lower level. For example,
3184+
3185+
.. ipython:: python
3186+
:okwarning:
3187+
3188+
xml = """
3189+
<data>
3190+
<row>
3191+
<shape sides="4">square</shape>
3192+
<degrees>360</degrees>
3193+
</row>
3194+
<row>
3195+
<shape sides="0">circle</shape>
3196+
<degrees>360</degrees>
3197+
</row>
3198+
<row>
3199+
<shape sides="3">triangle</shape>
3200+
<degrees>180</degrees>
3201+
</row>
3202+
</data>"""
3203+
3204+
df = pd.read_xml(xml, xpath="./row")
3205+
df
3206+
3207+
shows the attribute ``sides`` on ``shape`` element was not parsed as
3208+
expected since this attribute resides on the child of ``row`` element
3209+
and not ``row`` element itself. In other words, ``sides`` attribute is a
3210+
grandchild level descendant of ``row`` element. However, the ``xpath``
3211+
targets ``row`` element which covers only its children and attributes.
3212+
31773213
With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
31783214
script which also can be string/file/URL types. As background, `XSLT`_ is
31793215
a special-purpose language written in a special XML file that can transform

doc/source/user_guide/timeseries.rst

-1
Original file line numberDiff line numberDiff line change
@@ -1981,7 +1981,6 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
19811981
p = pd.Period("2012-01", freq="2M")
19821982
p + 2
19831983
p - 1
1984-
@okexcept
19851984
p == pd.Period("2012-01", freq="3M")
19861985
19871986

doc/source/whatsnew/v1.5.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,7 @@ Other enhancements
308308
- Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
309309
- :class:`Series` and :class:`DataFrame` with :class:`IntegerDtype` now supports bitwise operations (:issue:`34463`)
310310
- Add ``milliseconds`` field support for :class:`.DateOffset` (:issue:`43371`)
311+
- :meth:`DataFrame.where` tries to maintain dtype of :class:`DataFrame` if fill value can be cast without loss of precision (:issue:`45582`)
311312
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
312313
- :func:`concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
313314
- :func:`concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
@@ -331,6 +332,7 @@ Other enhancements
331332
- Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`)
332333
- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
333334
- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
335+
- The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`)
334336

335337
.. ---------------------------------------------------------------------------
336338
.. _whatsnew_150.notable_bug_fixes:
@@ -1011,6 +1013,8 @@ Time Zones
10111013
Numeric
10121014
^^^^^^^
10131015
- Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
1016+
- Bug in arithmetic operations with nullable types without :attr:`NA` values not matching the same operation with non-nullable types (:issue:`48223`)
1017+
- Bug in ``floordiv`` when dividing by ``IntegerDtype`` ``0`` would return ``0`` instead of ``inf`` (:issue:`48223`)
10141018
- Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
10151019
- Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
10161020
- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)

doc/source/whatsnew/v1.6.0.rst

+8-4
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,11 @@ Deprecations
100100

101101
Performance improvements
102102
~~~~~~~~~~~~~~~~~~~~~~~~
103+
- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
104+
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
103105
- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
106+
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
107+
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
104108
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
105109
-
106110

@@ -154,7 +158,7 @@ Indexing
154158
^^^^^^^^
155159
- Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
156160
- Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
157-
-
161+
- Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`)
158162

159163
Missing
160164
^^^^^^^
@@ -174,7 +178,7 @@ I/O
174178

175179
Period
176180
^^^^^^
177-
-
181+
- Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`)
178182
-
179183

180184
Plotting
@@ -189,7 +193,7 @@ Groupby/resample/rolling
189193

190194
Reshaping
191195
^^^^^^^^^
192-
-
196+
- Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`)
193197
-
194198

195199
Sparse
@@ -199,7 +203,7 @@ Sparse
199203

200204
ExtensionArray
201205
^^^^^^^^^^^^^^
202-
-
206+
- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`)
203207
-
204208

205209
Styler

0 commit comments

Comments
 (0)