Skip to content

Commit 4b54e99

Browse files
authored
Merge branch 'pandas-dev:master' into perf-readcsv
2 parents 47e506f + bd94bb1 commit 4b54e99

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+1439
-814
lines changed

.pre-commit-config.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ repos:
99
- id: absolufy-imports
1010
files: ^pandas/
1111
- repo: https://github.com/python/black
12-
rev: 21.7b0
12+
rev: 21.9b0
1313
hooks:
1414
- id: black
1515
- repo: https://github.com/codespell-project/codespell
@@ -58,7 +58,7 @@ repos:
5858
hooks:
5959
- id: isort
6060
- repo: https://github.com/asottile/pyupgrade
61-
rev: v2.23.3
61+
rev: v2.29.0
6262
hooks:
6363
- id: pyupgrade
6464
args: [--py38-plus]

asv_bench/benchmarks/indexing_engines.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Benchmarks in this fiel depend exclusively on code in _libs/
2+
Benchmarks in this file depend exclusively on code in _libs/
33
44
If a PR does not edit anything in _libs, it is very unlikely that benchmarks
55
in this file will be affected.

asv_bench/benchmarks/sparse.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,11 @@ class ToCooFrame:
9595
def setup(self):
9696
N = 10000
9797
k = 10
98-
arr = np.full((N, k), np.nan)
98+
arr = np.zeros((N, k), dtype=float)
9999
arr[0, 0] = 3.0
100100
arr[12, 7] = -1.0
101101
arr[0, 9] = 11.2
102-
self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float"))
102+
self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float", fill_value=0.0))
103103

104104
def time_to_coo(self):
105105
self.df.sparse.to_coo()
@@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill):
195195
self.sp_arr.take(indices, allow_fill=allow_fill)
196196

197197

198+
class GetItem:
199+
def setup(self):
200+
N = 1_000_000
201+
arr = make_array(N, 1e-5, np.nan, np.float64)
202+
self.sp_arr = SparseArray(arr)
203+
204+
def time_integer_indexing(self):
205+
self.sp_arr[78]
206+
207+
def time_slice(self):
208+
self.sp_arr[1:]
209+
210+
198211
from .pandas_vb_common import setup # noqa: F401 isort:skip

ci/code_checks.sh

+4
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
8484
pandas/tseries/
8585
RET=$(($RET + $?)) ; echo $MSG "DONE"
8686

87+
MSG='Cython Doctests' ; echo $MSG
88+
python -m pytest --doctest-cython pandas/_libs
89+
RET=$(($RET + $?)) ; echo $MSG "DONE"
90+
8791
fi
8892

8993
### DOCSTRINGS ###

doc/source/index.rst.template

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ pandas documentation
1212

1313
**Download documentation**: `PDF Version <pandas.pdf>`__ | `Zipped HTML <pandas.zip>`__
1414

15+
**Previous versions**: Documentation of previous pandas versions is available at
16+
`pandas.pydata.org <https://pandas.pydata.org/>`__.
17+
1518
**Useful links**:
1619
`Binary Installers <https://pypi.org/project/pandas>`__ |
1720
`Source Repository <https://github.com/pandas-dev/pandas>`__ |

doc/source/user_guide/basics.rst

+4
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,9 @@ not noted for a particular column will be ``NaN``:
10451045
Mixed dtypes
10461046
++++++++++++
10471047

1048+
.. deprecated:: 1.4.0
1049+
Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any portion of the columns or operations provided fails, the call to ``.agg`` will raise.
1050+
10481051
When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
10491052
aggregations. This is similar to how ``.groupby.agg`` works.
10501053

@@ -1061,6 +1064,7 @@ aggregations. This is similar to how ``.groupby.agg`` works.
10611064
mdf.dtypes
10621065
10631066
.. ipython:: python
1067+
:okwarning:
10641068
10651069
mdf.agg(["min", "sum"])
10661070

doc/source/user_guide/groupby.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ column, which produces an aggregated result with a hierarchical index:
578578

579579
.. ipython:: python
580580
581-
grouped.agg([np.sum, np.mean, np.std])
581+
grouped[["C", "D"]].agg([np.sum, np.mean, np.std])
582582
583583
584584
The resulting aggregations are named for the functions themselves. If you
@@ -597,7 +597,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
597597
.. ipython:: python
598598
599599
(
600-
grouped.agg([np.sum, np.mean, np.std]).rename(
600+
grouped[["C", "D"]].agg([np.sum, np.mean, np.std]).rename(
601601
columns={"sum": "foo", "mean": "bar", "std": "baz"}
602602
)
603603
)

doc/source/user_guide/merging.rst

+13-1
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ all standard database join operations between ``DataFrame`` or named ``Series``
562562
(hierarchical), the number of levels must match the number of join keys
563563
from the right DataFrame or Series.
564564
* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series
565-
* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults
565+
* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``, ``'cross'``. Defaults
566566
to ``inner``. See below for more detailed description of each method.
567567
* ``sort``: Sort the result DataFrame by the join keys in lexicographical
568568
order. Defaults to ``True``, setting to ``False`` will improve performance
@@ -707,6 +707,7 @@ either the left or right tables, the values in the joined table will be
707707
``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only
708708
``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames
709709
``inner``, ``INNER JOIN``, Use intersection of keys from both frames
710+
``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames
710711

711712
.. ipython:: python
712713
@@ -751,6 +752,17 @@ either the left or right tables, the values in the joined table will be
751752
p.plot([left, right], result, labels=["left", "right"], vertical=False);
752753
plt.close("all");
753754
755+
.. ipython:: python
756+
757+
result = pd.merge(left, right, how="cross")
758+
759+
.. ipython:: python
760+
:suppress:
761+
762+
@savefig merging_merge_cross.png
763+
p.plot([left, right], result, labels=["left", "right"], vertical=False);
764+
plt.close("all");
765+
754766
You can merge a multi-indexed Series and a DataFrame, if the names of
755767
the MultiIndex correspond to the columns from the DataFrame. Transform
756768
the Series to a DataFrame using :meth:`Series.reset_index` before merging,

doc/source/user_guide/visualization.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1651,7 +1651,7 @@ remedy this, ``DataFrame`` plotting supports the use of the ``colormap`` argumen
16511651
which accepts either a Matplotlib `colormap <https://matplotlib.org/api/cm_api.html>`__
16521652
or a string that is a name of a colormap registered with Matplotlib. A
16531653
visualization of the default matplotlib colormaps is available `here
1654-
<https://matplotlib.org/examples/color/colormaps_reference.html>`__.
1654+
<https://matplotlib.org/stable/gallery/color/colormap_reference.html>`__.
16551655

16561656
As matplotlib does not directly support colormaps for line-based plots, the
16571657
colors are selected based on an even spacing determined by the number of columns

doc/source/whatsnew/v0.20.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`)
105105
df.dtypes
106106
107107
.. ipython:: python
108+
:okwarning:
108109
109110
df.agg(['min', 'sum'])
110111

doc/source/whatsnew/v1.3.4.rst

+2
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,15 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17+
- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`)
1718
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
1819
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
1920
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
2021
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
2122
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2223
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
2324
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
25+
- Fixed regression in :meth:`DataFrame.explode` raising ``AssertionError`` when ``column`` is any scalar which is not a string (:issue:`43314`)
2426
- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`)
2527

2628
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

+10-2
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ Other Deprecations
338338
- Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`)
339339
- Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`)
340340
- Deprecated silent dropping of columns that raised a ``TypeError`` in :meth:`Series.transform` and :meth:`DataFrame.transform` when used with a dictionary (:issue:`43740`)
341+
- Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, and some cases of ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`)
341342

342343
.. ---------------------------------------------------------------------------
343344
@@ -346,6 +347,7 @@ Other Deprecations
346347
Performance improvements
347348
~~~~~~~~~~~~~~~~~~~~~~~~
348349
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
350+
- Performance improvement when converting non-string arrays to string arrays (:issue:`34483`)
349351
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
350352
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
351353
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
@@ -357,8 +359,10 @@ Performance improvements
357359
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
358360
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
359361
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
362+
- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
360363
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
361364
- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
365+
-
362366

363367
.. ---------------------------------------------------------------------------
364368
@@ -431,7 +435,8 @@ Indexing
431435
- Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
432436
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
433437
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
434-
438+
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
439+
-
435440

436441
Missing
437442
^^^^^^^
@@ -458,6 +463,8 @@ I/O
458463
- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
459464
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raised an uncontrolled ``IndexError`` (:issue:`43102`)
460465
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
466+
- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
467+
-
461468

462469
Period
463470
^^^^^^
@@ -484,6 +491,7 @@ Groupby/resample/rolling
484491
- Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns were a :class:`MultiIndex` (:issue:`21157`)
485492
- Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`)
486493
- Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`, :issue:`43515`)
494+
- Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`)
487495

488496
Reshaping
489497
^^^^^^^^^
@@ -499,7 +507,7 @@ Sparse
499507
^^^^^^
500508
- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`)
501509
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
502-
-
510+
- Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`)
503511
-
504512

505513
ExtensionArray

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,4 @@ dependencies:
122122
- types-PyMySQL
123123
- types-pytz
124124
- types-setuptools
125+
- pytest-cython

pandas/__init__.py

+120-4
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@
131131
qcut,
132132
)
133133

134-
import pandas.api
134+
from pandas import api, arrays, errors, io, plotting, testing, tseries
135135
from pandas.util._print_versions import show_versions
136136

137137
from pandas.io.api import (
@@ -170,8 +170,6 @@
170170
from pandas.io.json import _json_normalize as json_normalize
171171

172172
from pandas.util._tester import test
173-
import pandas.testing
174-
import pandas.arrays
175173

176174
# use the closest tagged version if possible
177175
from pandas._version import get_versions
@@ -181,7 +179,6 @@
181179
__git_version__ = v.get("full-revisionid")
182180
del get_versions, v
183181

184-
185182
# GH 27101
186183
__deprecated_num_index_names = ["Float64Index", "Int64Index", "UInt64Index"]
187184

@@ -303,3 +300,122 @@ def __getattr__(name):
303300
- Time series-specific functionality: date range generation and frequency
304301
conversion, moving window statistics, date shifting and lagging.
305302
"""
303+
304+
# Use __all__ to let type checkers know what is part of the public API.
305+
# Pandas is not (yet) a py.typed library: the public API is determined
306+
# based on the documentation.
307+
__all__ = [
308+
"BooleanDtype",
309+
"Categorical",
310+
"CategoricalDtype",
311+
"CategoricalIndex",
312+
"DataFrame",
313+
"DateOffset",
314+
"DatetimeIndex",
315+
"DatetimeTZDtype",
316+
"ExcelFile",
317+
"ExcelWriter",
318+
"Flags",
319+
"Float32Dtype",
320+
"Float64Dtype",
321+
"Grouper",
322+
"HDFStore",
323+
"Index",
324+
"IndexSlice",
325+
"Int16Dtype",
326+
"Int32Dtype",
327+
"Int64Dtype",
328+
"Int8Dtype",
329+
"Interval",
330+
"IntervalDtype",
331+
"IntervalIndex",
332+
"MultiIndex",
333+
"NA",
334+
"NaT",
335+
"NamedAgg",
336+
"NumericIndex",
337+
"Period",
338+
"PeriodDtype",
339+
"PeriodIndex",
340+
"RangeIndex",
341+
"Series",
342+
"SparseDtype",
343+
"StringDtype",
344+
"Timedelta",
345+
"TimedeltaIndex",
346+
"Timestamp",
347+
"UInt16Dtype",
348+
"UInt32Dtype",
349+
"UInt64Dtype",
350+
"UInt8Dtype",
351+
"api",
352+
"array",
353+
"arrays",
354+
"bdate_range",
355+
"concat",
356+
"crosstab",
357+
"cut",
358+
"date_range",
359+
"describe_option",
360+
"errors",
361+
"eval",
362+
"factorize",
363+
"get_dummies",
364+
"get_option",
365+
"infer_freq",
366+
"interval_range",
367+
"io",
368+
"isna",
369+
"isnull",
370+
"json_normalize",
371+
"lreshape",
372+
"melt",
373+
"merge",
374+
"merge_asof",
375+
"merge_ordered",
376+
"notna",
377+
"notnull",
378+
"offsets",
379+
"option_context",
380+
"options",
381+
"period_range",
382+
"pivot",
383+
"pivot_table",
384+
"plotting",
385+
"qcut",
386+
"read_clipboard",
387+
"read_csv",
388+
"read_excel",
389+
"read_feather",
390+
"read_fwf",
391+
"read_gbq",
392+
"read_hdf",
393+
"read_html",
394+
"read_json",
395+
"read_orc",
396+
"read_parquet",
397+
"read_pickle",
398+
"read_sas",
399+
"read_spss",
400+
"read_sql",
401+
"read_sql_query",
402+
"read_sql_table",
403+
"read_stata",
404+
"read_table",
405+
"read_xml",
406+
"reset_option",
407+
"set_eng_float_format",
408+
"set_option",
409+
"show_versions",
410+
"test",
411+
"testing",
412+
"timedelta_range",
413+
"to_datetime",
414+
"to_numeric",
415+
"to_pickle",
416+
"to_timedelta",
417+
"tseries",
418+
"unique",
419+
"value_counts",
420+
"wide_to_long",
421+
]

0 commit comments

Comments
 (0)