Skip to content

Commit 838ceb4

Browse files
committed
merging master
2 parents 11a3790 + a5f8c9a commit 838ceb4

File tree

85 files changed

+1590
-593
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+1590
-593
lines changed

asv_bench/benchmarks/groupby.py

+12
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,18 @@ def time_category_size(self):
369369
self.draws.groupby(self.cats).size()
370370

371371

372+
class Shift:
373+
def setup(self):
374+
N = 18
375+
self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))})
376+
377+
def time_defaults(self):
378+
self.df.groupby("g").shift()
379+
380+
def time_fill_value(self):
381+
self.df.groupby("g").shift(fill_value=99)
382+
383+
372384
class FillNA:
373385
def setup(self):
374386
N = 100

asv_bench/benchmarks/reshape.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def setup(self, dtype):
102102
columns = np.arange(n)
103103
if dtype == "int":
104104
values = np.arange(m * m * n).reshape(m * m, n)
105+
self.df = DataFrame(values, index, columns)
105106
else:
106107
# the category branch is ~20x slower than int. So we
107108
# cut down the size a bit. Now it's only ~3x slower.
@@ -111,7 +112,10 @@ def setup(self, dtype):
111112
values = np.take(list(string.ascii_letters), indices)
112113
values = [pd.Categorical(v) for v in values.T]
113114

114-
self.df = DataFrame(values, index, columns)
115+
self.df = DataFrame(
116+
{i: cat for i, cat in enumerate(values)}, index, columns
117+
)
118+
115119
self.df2 = self.df.iloc[:-1]
116120

117121
def time_full_product(self, dtype):

ci/code_checks.sh

+3
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
121121
pandas/io/parsers/ \
122122
pandas/io/sas/ \
123123
pandas/io/sql.py \
124+
pandas/io/formats/format.py \
125+
pandas/io/formats/style.py \
126+
pandas/io/stata.py \
124127
pandas/tseries/
125128
RET=$(($RET + $?)) ; echo $MSG "DONE"
126129

ci/deps/actions-39-slow.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies:
2323
- matplotlib
2424
- moto>=1.3.14
2525
- flask
26+
- numba
2627
- numexpr
2728
- numpy
2829
- openpyxl

ci/deps/actions-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- matplotlib
2323
- moto>=1.3.14
2424
- flask
25+
- numba
2526
- numexpr
2627
- numpy
2728
- openpyxl

ci/deps/azure-windows-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies:
2323
- matplotlib
2424
- moto>=1.3.14
2525
- flask
26+
- numba
2627
- numexpr
2728
- numpy
2829
- openpyxl

doc/source/_static/style/df_pipe.png

8.47 KB
Loading

doc/source/user_guide/visualization.rst

+54
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,34 @@ The ``by`` keyword can be specified to plot grouped histograms:
316316
@savefig grouped_hist.png
317317
data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4));
318318
319+
.. ipython:: python
320+
:suppress:
321+
322+
plt.close("all")
323+
np.random.seed(123456)
324+
325+
In addition, the ``by`` keyword can also be specified in :meth:`DataFrame.plot.hist`.
326+
327+
.. versionchanged:: 1.4.0
328+
329+
.. ipython:: python
330+
331+
data = pd.DataFrame(
332+
{
333+
"a": np.random.choice(["x", "y", "z"], 1000),
334+
"b": np.random.choice(["e", "f", "g"], 1000),
335+
"c": np.random.randn(1000),
336+
"d": np.random.randn(1000) - 1,
337+
},
338+
)
339+
340+
@savefig grouped_hist_by.png
341+
data.plot.hist(by=["a", "b"], figsize=(10, 5));
342+
343+
.. ipython:: python
344+
:suppress:
345+
346+
plt.close("all")
319347
320348
.. _visualization.box:
321349

@@ -448,6 +476,32 @@ columns:
448476
449477
plt.close("all")
450478
479+
You could also create groupings with :meth:`DataFrame.plot.box`, for instance:
480+
481+
.. versionchanged:: 1.4.0
482+
483+
.. ipython:: python
484+
:suppress:
485+
486+
plt.close("all")
487+
np.random.seed(123456)
488+
489+
.. ipython:: python
490+
:okwarning:
491+
492+
df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"])
493+
df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
494+
495+
plt.figure();
496+
497+
@savefig box_plot_ex4.png
498+
bp = df.plot.box(column=["Col1", "Col2"], by="X")
499+
500+
.. ipython:: python
501+
:suppress:
502+
503+
plt.close("all")
504+
451505
.. _visualization.box.return:
452506

453507
In ``boxplot``, the return type can be controlled by the ``return_type`` keyword. The valid choices are ``{"axes", "dict", "both", None}``.

doc/source/whatsnew/v1.3.2.rst

+9-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,14 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17-
-
17+
- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
18+
- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
19+
- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
20+
- Fixed regression in :meth:`DataFrame.shift` where a ``TypeError`` occurred when shifting a :class:`DataFrame` created by concatenation of slices and filling with values (:issue:`42719`)
21+
- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`)
22+
- Regression in :meth:`DataFrame.drop` doing nothing if the :class:`MultiIndex` has duplicates and the indexer is a tuple or list of tuples (:issue:`42771`)
23+
- Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to None (:issue:`42387`)
24+
- Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`)
1825
-
1926

2027
.. ---------------------------------------------------------------------------
@@ -23,7 +30,7 @@ Fixed regressions
2330

2431
Bug fixes
2532
~~~~~~~~~
26-
-
33+
- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`)
2734
-
2835

2936
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

+15-2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ Other enhancements
3535
- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
3636
- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
3737
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
38+
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
39+
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
40+
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
3841
-
3942

4043
.. ---------------------------------------------------------------------------
@@ -166,6 +169,10 @@ Performance improvements
166169
~~~~~~~~~~~~~~~~~~~~~~~~
167170
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
168171
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
172+
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
173+
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
174+
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
175+
-
169176

170177
.. ---------------------------------------------------------------------------
171178
@@ -202,7 +209,7 @@ Numeric
202209
^^^^^^^
203210
- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
204211
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`)
205-
-
212+
- Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`)
206213

207214
Conversion
208215
^^^^^^^^^^
@@ -225,6 +232,8 @@ Indexing
225232
- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
226233
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
227234
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
235+
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` where passing an ``ascending`` value either failed to raise or incorrectly raised ``ValueError`` (:issue:`41634`)
236+
- Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
228237
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
229238
-
230239

@@ -261,11 +270,14 @@ Groupby/resample/rolling
261270
^^^^^^^^^^^^^^^^^^^^^^^^
262271
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
263272
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
264-
-
273+
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
274+
- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
265275

266276
Reshaping
267277
^^^^^^^^^
278+
- Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`)
268279
- :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
280+
- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
269281
-
270282

271283
Sparse
@@ -285,6 +297,7 @@ Styler
285297

286298
Other
287299
^^^^^
300+
- Bug in :meth:`CustomBusinessMonthBegin.__add__` (:meth:`CustomBusinessMonthEnd.__add__`) not applying the extra ``offset`` parameter when beginning (end) of the target month is already a business day (:issue:`41356`)
288301

289302
.. ***DO NOT USE THIS SECTION***
290303

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ dependencies:
108108
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
109109
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
110110
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
111-
- xarray # DataFrame.to_xarray
111+
- xarray<0.19 # DataFrame.to_xarray
112112
- cftime # Needed for downstream xarray.CFTimeIndex test
113113
- pyreadstat # pandas.read_spss
114114
- tabulate>=0.8.3 # DataFrame.to_markdown

pandas/_libs/algos.pyx

+55-20
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
217217
This is a reverse of the label factorization process.
218218
"""
219219
cdef:
220-
Py_ssize_t i, loc, label, n
221-
ndarray[intp_t] indexer, where, counts
220+
Py_ssize_t i, label, n
221+
intp_t[::1] indexer, where, counts
222222

223223
counts = np.zeros(ngroups + 1, dtype=np.intp)
224224
n = len(index)
@@ -241,7 +241,7 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
241241
indexer[where[label]] = i
242242
where[label] += 1
243243

244-
return indexer, counts
244+
return indexer.base, counts.base
245245

246246

247247
cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
@@ -325,11 +325,14 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
325325
cdef:
326326
Py_ssize_t i, j, xi, yi, N, K
327327
bint minpv
328-
ndarray[float64_t, ndim=2] result
328+
float64_t[:, ::1] result
329+
# Initialize to None since we only use in the no missing value case
330+
float64_t[::1] means=None, ssqds=None
329331
ndarray[uint8_t, ndim=2] mask
332+
bint no_nans
330333
int64_t nobs = 0
331-
float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx
332-
float64_t ssqdmy, covxy
334+
float64_t mean, ssqd, val
335+
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
333336

334337
N, K = (<object>mat).shape
335338

@@ -340,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
340343

341344
result = np.empty((K, K), dtype=np.float64)
342345
mask = np.isfinite(mat).view(np.uint8)
346+
no_nans = mask.all()
347+
348+
# Computing the online means and variances is expensive - so if possible we can
349+
# precompute these and avoid repeating the computations each time we handle
350+
# an (xi, yi) pair
351+
if no_nans:
352+
means = np.empty(K, dtype=np.float64)
353+
ssqds = np.empty(K, dtype=np.float64)
354+
355+
with nogil:
356+
for j in range(K):
357+
ssqd = mean = 0
358+
for i in range(N):
359+
val = mat[i, j]
360+
dx = val - mean
361+
mean += 1 / (i + 1) * dx
362+
ssqd += (val - mean) * dx
363+
364+
means[j] = mean
365+
ssqds[j] = ssqd
343366

344367
with nogil:
345368
for xi in range(K):
346369
for yi in range(xi + 1):
347-
# Welford's method for the variance-calculation
348-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
349-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
350-
for i in range(N):
351-
if mask[i, xi] and mask[i, yi]:
370+
covxy = 0
371+
if no_nans:
372+
for i in range(N):
352373
vx = mat[i, xi]
353374
vy = mat[i, yi]
354-
nobs += 1
355-
prev_meanx = meanx
356-
prev_meany = meany
357-
meanx = meanx + 1 / nobs * (vx - meanx)
358-
meany = meany + 1 / nobs * (vy - meany)
359-
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
360-
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
361-
covxy = covxy + (vx - meanx) * (vy - prev_meany)
375+
covxy += (vx - means[xi]) * (vy - means[yi])
376+
377+
ssqdmx = ssqds[xi]
378+
ssqdmy = ssqds[yi]
379+
nobs = N
380+
381+
else:
382+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
383+
for i in range(N):
384+
# Welford's method for the variance-calculation
385+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
386+
if mask[i, xi] and mask[i, yi]:
387+
vx = mat[i, xi]
388+
vy = mat[i, yi]
389+
nobs += 1
390+
dx = vx - meanx
391+
dy = vy - meany
392+
meanx += 1 / nobs * dx
393+
meany += 1 / nobs * dy
394+
ssqdmx += (vx - meanx) * dx
395+
ssqdmy += (vy - meany) * dy
396+
covxy += (vx - meanx) * dy
362397

363398
if nobs < minpv:
364399
result[xi, yi] = result[yi, xi] = NaN
@@ -370,7 +405,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
370405
else:
371406
result[xi, yi] = result[yi, xi] = NaN
372407

373-
return result
408+
return result.base
374409

375410
# ----------------------------------------------------------------------
376411
# Pairwise Spearman correlation

0 commit comments

Comments
 (0)