Skip to content

Commit 50ebc53

Browse files
committed
Merge branch '24893-pivot_table' of github.com:mabelvj/pandas into 24893-pivot_table
2 parents 8003d10 + 414fb3a commit 50ebc53

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+937
-606
lines changed

asv_bench/benchmarks/stat_ops.py

+11
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck):
113113
nanops._USE_BOTTLENECK = use_bottleneck
114114
self.df = pd.DataFrame(np.random.randn(1000, 30))
115115
self.df2 = pd.DataFrame(np.random.randn(1000, 30))
116+
self.df_wide = pd.DataFrame(np.random.randn(1000, 200))
117+
self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9)
116118
self.s = pd.Series(np.random.randn(1000))
117119
self.s2 = pd.Series(np.random.randn(1000))
118120

119121
def time_corr(self, method, use_bottleneck):
120122
self.df.corr(method=method)
121123

124+
def time_corr_wide(self, method, use_bottleneck):
125+
self.df_wide.corr(method=method)
126+
127+
def time_corr_wide_nans(self, method, use_bottleneck):
128+
self.df_wide_nans.corr(method=method)
129+
130+
def peakmem_corr_wide(self, method, use_bottleneck):
131+
self.df_wide.corr(method=method)
132+
122133
def time_corr_series(self, method, use_bottleneck):
123134
self.s.corr(self.s2, method=method)
124135

ci/deps/azure-windows-36.yaml

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
name: pandas-dev
22
channels:
3-
- defaults
43
- conda-forge
4+
- defaults
55
dependencies:
66
- blosc
77
- bottleneck
8-
- boost-cpp<1.67
98
- fastparquet>=0.2.1
109
- matplotlib=3.0.2
1110
- numexpr
1211
- numpy=1.15.*
1312
- openpyxl
14-
- parquet-cpp
1513
- pyarrow
1614
- pytables
1715
- python-dateutil

doc/source/development/contributing.rst

+38-3
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,44 @@ operations. To install pandas from source, you need to compile these C
135135
extensions, which means you need a C compiler. This process depends on which
136136
platform you're using.
137137

138-
* Windows: https://devguide.python.org/setup/#windows-compiling
139-
* Mac: https://devguide.python.org/setup/#macos
140-
* Unix: https://devguide.python.org/setup/#unix-compiling
138+
**Windows**
139+
140+
You will need `Build Tools for Visual Studio 2019
141+
<https://visualstudio.microsoft.com/downloads/>`_.
142+
143+
.. warning::
144+
You DO NOT need to install Visual Studio 2019.
145+
You only need "Build Tools for Visual Studio 2019" found by
146+
scrolling down to "All downloads" -> "Tools for Visual Studio 2019".
147+
148+
**Mac OS**
149+
150+
Information about compiler installation can be found here:
151+
https://devguide.python.org/setup/#macos
152+
153+
**Unix**
154+
155+
Some Linux distributions will come with a pre-installed C compiler. To find out
156+
which compilers (and versions) are installed on your system::
157+
158+
# for Debian/Ubuntu:
159+
dpkg --list | grep compiler
160+
# for Red Hat/RHEL/CentOS/Fedora:
161+
yum list installed | grep -i --color compiler
162+
163+
`GCC (GNU Compiler Collection) <https://gcc.gnu.org/>`_ is a widely used
164+
compiler, which supports C and a number of other languages. If GCC is listed
165+
as an installed compiler, nothing more is required. If no C compiler is
166+
installed (or you wish to install a newer version) you can install a compiler
167+
(GCC in the example code below) with::
168+
169+
# for recent Debian/Ubuntu:
170+
sudo apt install build-essential
171+
# for Red Hat/RHEL/CentOS/Fedora:
172+
yum groupinstall "Development Tools"
173+
174+
For other Linux distributions, consult your favourite search engine for
175+
compiler installation instructions.
141176

142177
Let us know if you have any difficulties by opening an issue or reaching out on
143178
`Gitter`_.

doc/source/ecosystem.rst

+15-12
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,21 @@ or `search pypi for pandas <https://pypi.org/search/?q=pandas>`_.
2323
We'd like to make it easier for users to find these projects, if you know of other
2424
substantial projects that you feel should be on this list, please let us know.
2525

26+
.. _ecosystem.data_cleaning_and_validation:
27+
28+
Data cleaning and validation
29+
----------------------------
30+
31+
`pyjanitor <https://github.com/ericmjl/pyjanitor/>`__
32+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33+
34+
Pyjanitor provides a clean API for cleaning data, using method chaining.
35+
36+
`Engarde <https://engarde.readthedocs.io/en/latest/>`__
37+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
38+
39+
Engarde is a lightweight library used to explicitly state assumptions about your datasets
40+
and check that they're *actually* true.
2641

2742
.. _ecosystem.stats:
2843

@@ -329,18 +344,6 @@ Increasingly, packages are being built on top of pandas to address specific need
329344
* vaex.from_pandas
330345
* vaex.to_pandas_df
331346

332-
333-
.. _ecosystem.data_validation:
334-
335-
Data validation
336-
---------------
337-
338-
`Engarde <https://engarde.readthedocs.io/en/latest/>`__
339-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
340-
341-
Engarde is a lightweight library used to explicitly state your assumptions about your datasets
342-
and check that they're *actually* true.
343-
344347
.. _ecosystem.extensions:
345348

346349
Extension data types

doc/source/whatsnew/v0.25.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Groupby/resample/rolling
7878
^^^^^^^^^^^^^^^^^^^^^^^^
7979

8080
- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
81+
- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
8182
-
8283
-
8384
-

doc/source/whatsnew/v1.0.0.rst

+28-5
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,25 @@ Backwards incompatible API changes
3737
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3838

3939
- :meth:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
40-
-
40+
- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`)
41+
42+
*pandas 0.25.x*
43+
44+
.. code-block:: ipython
45+
46+
In [1]: pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
47+
Out[1]:
48+
IntervalArray([(0, 1], (2, 3]],
49+
closed='right',
50+
dtype='interval[int64]')
51+
52+
53+
*pandas 1.0.0*
54+
55+
.. ipython:: python
56+
57+
pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
58+
4159
4260
.. _whatsnew_1000.api.other:
4361

@@ -75,9 +93,9 @@ Performance improvements
7593
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
7694
- Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`)
7795
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
96+
- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`)
7897
- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
7998

80-
8199
.. _whatsnew_1000.bug_fixes:
82100

83101
Bug fixes
@@ -97,6 +115,9 @@ Datetimelike
97115
- Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`)
98116
- Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`)
99117
- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`)
118+
- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`)
119+
- Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`)
120+
- Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`)
100121
-
101122

102123

@@ -142,7 +163,7 @@ Indexing
142163
^^^^^^^^
143164

144165
- Bug in assignment using a reverse slicer (:issue:`26939`)
145-
-
166+
- Bug in reindexing a :class:`PeriodIndex` with another type of index that contained a :class:`Period` (:issue:`28323`, :issue:`28337`)
146167

147168
Missing
148169
^^^^^^^
@@ -162,12 +183,14 @@ I/O
162183
- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`)
163184
- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`)
164185
- Improve infinity parsing. :meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`)
186+
- Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. (:issue:`25099`)
165187

166188
Plotting
167189
^^^^^^^^
168190

169191
- Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`)
170192
-
193+
- Bug in :meth:`DataFrame.plot` not being able to plot when the :class:`DataFrame` has no rows (:issue:`27758`)
171194
- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
172195
- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
173196
- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`)
@@ -184,7 +207,7 @@ Groupby/resample/rolling
184207
Reshaping
185208
^^^^^^^^^
186209

187-
-
210+
- Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue:`28301`)
188211
-
189212
- Bug in :meth:`pivot_table` not returning correct type ``float`` when ``margins=True`` and ``aggfunc='mean'`` (:issue:`24893`)
190213

@@ -212,7 +235,7 @@ Other
212235
- Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`)
213236
- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`)
214237
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
215-
238+
-
216239

217240
.. _whatsnew_1000.contributors:
218241

pandas/_config/display.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ def detect_console_encoding():
2828
if not encoding or "ascii" in encoding.lower():
2929
try:
3030
encoding = locale.getpreferredencoding()
31-
except Exception:
31+
except locale.Error:
32+
# can be raised by locale.setlocale(), which is
33+
# called by getpreferredencoding
34+
# (on some systems, see stdlib locale docs)
3235
pass
3336

3437
# when all else fails. this will usually be "ascii"

pandas/_config/localization.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,7 @@ def _valid_locales(locales, normalize):
9898

9999

100100
def _default_locale_getter():
101-
try:
102-
raw_locales = subprocess.check_output(["locale -a"], shell=True)
103-
except subprocess.CalledProcessError as e:
104-
raise type(e)(
105-
"{exception}, the 'locale -a' command cannot be found "
106-
"on your system".format(exception=e)
107-
)
101+
raw_locales = subprocess.check_output(["locale -a"], shell=True)
108102
return raw_locales
109103

110104

@@ -139,7 +133,9 @@ def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_gette
139133
"""
140134
try:
141135
raw_locales = locale_getter()
142-
except Exception:
136+
except subprocess.CalledProcessError:
137+
# Raised on (some? all?) Windows platforms because "locale -a"
138+
# is not defined
143139
return None
144140

145141
try:

pandas/_libs/algos.pyx

+16-4
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
296296
cdef:
297297
Py_ssize_t i, j, xi, yi, N, K
298298
ndarray[float64_t, ndim=2] result
299+
ndarray[float64_t, ndim=2] ranked_mat
299300
ndarray[float64_t, ndim=1] maskedx
300301
ndarray[float64_t, ndim=1] maskedy
301302
ndarray[uint8_t, ndim=2] mask
@@ -307,10 +308,18 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
307308
result = np.empty((K, K), dtype=np.float64)
308309
mask = np.isfinite(mat).view(np.uint8)
309310

311+
ranked_mat = np.empty((N, K), dtype=np.float64)
312+
313+
for i in range(K):
314+
ranked_mat[:, i] = rank_1d_float64(mat[:, i])
315+
310316
for xi in range(K):
311317
for yi in range(xi + 1):
312318
nobs = 0
319+
# Keep track of whether we need to recompute ranks
320+
all_ranks = True
313321
for i in range(N):
322+
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
314323
if mask[i, xi] and mask[i, yi]:
315324
nobs += 1
316325

@@ -320,13 +329,16 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
320329
maskedx = np.empty(nobs, dtype=np.float64)
321330
maskedy = np.empty(nobs, dtype=np.float64)
322331
j = 0
332+
323333
for i in range(N):
324334
if mask[i, xi] and mask[i, yi]:
325-
maskedx[j] = mat[i, xi]
326-
maskedy[j] = mat[i, yi]
335+
maskedx[j] = ranked_mat[i, xi]
336+
maskedy[j] = ranked_mat[i, yi]
327337
j += 1
328-
maskedx = rank_1d_float64(maskedx)
329-
maskedy = rank_1d_float64(maskedy)
338+
339+
if not all_ranks:
340+
maskedx = rank_1d_float64(maskedx)
341+
maskedy = rank_1d_float64(maskedy)
330342

331343
mean = (nobs + 1) / 2.
332344

pandas/_libs/parsers.pyx

+2-4
Original file line numberDiff line numberDiff line change
@@ -567,10 +567,8 @@ cdef class TextReader:
567567
# we need to properly close an open derived
568568
# filehandle here, e.g. a UTFRecoder
569569
if self.handle is not None:
570-
try:
571-
self.handle.close()
572-
except:
573-
pass
570+
self.handle.close()
571+
574572
# also preemptively free all allocated memory
575573
parser_free(self.parser)
576574
if self.true_set:

pandas/_libs/reduction.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,8 @@ def apply_frame_axis0(object frame, object f, object names,
528528

529529
try:
530530
piece = f(chunk)
531-
except:
531+
except Exception:
532+
# We can't be more specific without knowing something about `f`
532533
raise InvalidApply('Let this error raise above us')
533534

534535
# Need to infer if low level index slider will cause segfaults
@@ -539,6 +540,7 @@ def apply_frame_axis0(object frame, object f, object names,
539540
else:
540541
mutated = True
541542
except AttributeError:
543+
# `piece` might not have an index, could be e.g. an int
542544
pass
543545

544546
results.append(piece)

pandas/_libs/skiplist.pxd

+1-22
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
3-
from cython cimport Py_ssize_t
4-
2+
# See GH#27465 for reference on related-but-unused cython code
53

64
cdef extern from "src/skiplist.h":
75
ctypedef struct node_t:
@@ -24,22 +22,3 @@ cdef extern from "src/skiplist.h":
2422
double skiplist_get(skiplist_t*, int, int*) nogil
2523
int skiplist_insert(skiplist_t*, double) nogil
2624
int skiplist_remove(skiplist_t*, double) nogil
27-
28-
29-
# Note: Node is declared here so that IndexableSkiplist can be exposed;
30-
# Node itself not intended to be exposed.
31-
cdef class Node:
32-
cdef public:
33-
double value
34-
list next
35-
list width
36-
37-
38-
cdef class IndexableSkiplist:
39-
cdef:
40-
Py_ssize_t size, maxlevels
41-
Node head
42-
43-
cpdef get(self, Py_ssize_t i)
44-
cpdef insert(self, double value)
45-
cpdef remove(self, double value)

0 commit comments

Comments
 (0)