Skip to content

Commit 19a6c00

Browse files
committed
Merge master
2 parents 7c56581 + 612c244 commit 19a6c00

File tree

132 files changed

+1693
-1114
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

132 files changed

+1693
-1114
lines changed

.github/CONTRIBUTING.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!
44

5-
Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas-docs.github.io/pandas-docs-travis/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information.
5+
Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas-docs.github.io/pandas-docs-travis/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information.
66

77
## Getting Started
88

asv_bench/benchmarks/io/csv.py

+29
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,35 @@ def time_read_csv(self, infer_datetime_format, format):
9696
infer_datetime_format=infer_datetime_format)
9797

9898

99+
class ReadCSVConcatDatetime(StringIORewind):
100+
101+
iso8601 = '%Y-%m-%d %H:%M:%S'
102+
103+
def setup(self):
104+
rng = date_range('1/1/2000', periods=50000, freq='S')
105+
self.StringIO_input = StringIO('\n'.join(
106+
rng.strftime(self.iso8601).tolist()))
107+
108+
def time_read_csv(self):
109+
read_csv(self.data(self.StringIO_input),
110+
header=None, names=['foo'], parse_dates=['foo'],
111+
infer_datetime_format=False)
112+
113+
114+
class ReadCSVConcatDatetimeBadDateValue(StringIORewind):
115+
116+
params = (['nan', '0', ''],)
117+
param_names = ['bad_date_value']
118+
119+
def setup(self, bad_date_value):
120+
self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000)
121+
122+
def time_read_csv(self, bad_date_value):
123+
read_csv(self.data(self.StringIO_input),
124+
header=None, names=['foo', 'bar'], parse_dates=['foo'],
125+
infer_datetime_format=False)
126+
127+
99128
class ReadCSVSkipRows(BaseIO):
100129

101130
fname = '__test__.csv'

asv_bench/benchmarks/io/parsers.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import numpy as np
2+
3+
from pandas._libs.tslibs.parsing import (
4+
_concat_date_cols, _does_string_look_like_datetime)
5+
6+
7+
class DoesStringLookLikeDatetime(object):
8+
9+
params = (['2Q2005', '0.0', '10000'],)
10+
param_names = ['value']
11+
12+
def setup(self, value):
13+
self.objects = [value] * 1000000
14+
15+
def time_check_datetimes(self, value):
16+
for obj in self.objects:
17+
_does_string_look_like_datetime(obj)
18+
19+
20+
class ConcatDateCols(object):
21+
22+
params = ([1234567890, 'AAAA'], [1, 2])
23+
param_names = ['value', 'dim']
24+
25+
def setup(self, value, dim):
26+
count_elem = 10000
27+
if dim == 1:
28+
self.object = (np.array([value] * count_elem),)
29+
if dim == 2:
30+
self.object = (np.array([value] * count_elem),
31+
np.array([value] * count_elem))
32+
33+
def time_check_concat(self, value, dim):
34+
_concat_date_cols(self.object)

ci/code_checks.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
148148
invgrep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas
149149
RET=$(($RET + $?)) ; echo $MSG "DONE"
150150

151-
MSG='Check for python2 new-style classes' ; echo $MSG
152-
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\(object\):" pandas scripts
151+
MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
152+
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
153153
RET=$(($RET + $?)) ; echo $MSG "DONE"
154154

155155
MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG

ci/deps/azure-windows-37.yaml

-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
name: pandas-dev
22
channels:
33
- defaults
4-
- conda-forge
54
dependencies:
65
- beautifulsoup4
76
- bottleneck
8-
- gcsfs
97
- html5lib
108
- jinja2
119
- lxml

ci/incremental/setup_conda_environment.cmd

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ conda remove --all -q -y -n pandas-dev
1616
conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml
1717

1818
call activate pandas-dev
19+
@rem gh-26345: we need to separate this out so that Azure doesn't complain
20+
conda install -c conda-forge gcsfs
1921
conda list
2022

2123
if %errorlevel% neq 0 exit /b %errorlevel%

ci/run_with_env.cmd

-95
This file was deleted.

doc/source/reference/frame.rst

+23
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
311311
DataFrame.boxplot
312312
DataFrame.hist
313313

314+
315+
.. _api.frame.sparse:
316+
317+
Sparse Accessor
318+
~~~~~~~~~~~~~~~
319+
320+
Sparse-dtype specific methods and attributes are provided under the
321+
``DataFrame.sparse`` accessor.
322+
323+
.. autosummary::
324+
:toctree: api/
325+
:template: autosummary/accessor_attribute.rst
326+
327+
DataFrame.sparse.density
328+
329+
.. autosummary::
330+
:toctree: api/
331+
332+
DataFrame.sparse.from_spmatrix
333+
DataFrame.sparse.to_coo
334+
DataFrame.sparse.to_dense
335+
336+
314337
Serialization / IO / Conversion
315338
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
316339
.. autosummary::

doc/source/user_guide/enhancingperf.rst

+13-6
Original file line numberDiff line numberDiff line change
@@ -234,14 +234,18 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra
234234

235235
.. code-block:: ipython
236236
237-
In [4]: %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values)
237+
In [4]: %timeit apply_integrate_f(df['a'].to_numpy(),
238+
df['b'].to_numpy(),
239+
df['N'].to_numpy())
238240
1000 loops, best of 3: 1.25 ms per loop
239241
240242
We've gotten another big improvement. Let's check again where the time is spent:
241243

242244
.. ipython:: python
243245
244-
%prun -l 4 apply_integrate_f(df['a'].values, df['b'].values, df['N'].values)
246+
%prun -l 4 apply_integrate_f(df['a'].to_numpy(),
247+
df['b'].to_numpy(),
248+
df['N'].to_numpy())
245249
246250
As one might expect, the majority of the time is now spent in ``apply_integrate_f``,
247251
so if we wanted to make any more efficiency gains we must continue to concentrate our
@@ -286,7 +290,9 @@ advanced Cython techniques:
286290

287291
.. code-block:: ipython
288292
289-
In [4]: %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values)
293+
In [4]: %timeit apply_integrate_f_wrap(df['a'].to_numpy(),
294+
df['b'].to_numpy(),
295+
df['N'].to_numpy())
290296
1000 loops, best of 3: 987 us per loop
291297
292298
Even faster, with the caveat that a bug in our Cython code (an off-by-one error,
@@ -349,8 +355,9 @@ take the plain Python code from above and annotate with the ``@jit`` decorator.
349355
350356
351357
def compute_numba(df):
352-
result = apply_integrate_f_numba(df['a'].values, df['b'].values,
353-
df['N'].values)
358+
result = apply_integrate_f_numba(df['a'].to_numpy(),
359+
df['b'].to_numpy(),
360+
df['N'].to_numpy())
354361
return pd.Series(result, index=df.index, name='result')
355362
356363
Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
@@ -394,7 +401,7 @@ Consider the following toy example of doubling each observation:
394401
1000 loops, best of 3: 233 us per loop
395402
396403
# Custom function with numba
397-
In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.values)
404+
In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy())
398405
1000 loops, best of 3: 145 us per loop
399406
400407
Caveats

doc/source/whatsnew/v0.25.0.rst

+8-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other Enhancements
3535
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
3636
- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`)
3737
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
38+
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)
3839
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3940
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
4041
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
@@ -252,6 +253,8 @@ Performance Improvements
252253
- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`)
253254
- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`)
254255
- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`)
256+
- Improved performance of :meth:`read_csv` by concatenating date columns faster, without extra conversion to string for integer/float zero
257+
and float ``NaN``; and by faster checking of whether a string could be a date (:issue:`25754`)
255258

256259
.. _whatsnew_0250.bug_fixes:
257260

@@ -276,6 +279,7 @@ Datetimelike
276279
- Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed ``datetime`` (:issue:`25851`)
277280
- Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
278281
- Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`)
282+
- Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`)
279283

280284
Timedelta
281285
^^^^^^^^^
@@ -305,8 +309,9 @@ Numeric
305309
- Bug in :meth:`Series.divmod` and :meth:`Series.rdivmod` which would raise an (incorrect) ``ValueError`` rather than return a pair of :class:`Series` objects as result (:issue:`25557`)
306310
- Raises a helpful exception when a non-numeric index is sent to :meth:`interpolate` with methods which require numeric index. (:issue:`21662`)
307311
- Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`)
312+
- Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`)
313+
-
308314
-
309-
310315

311316
Conversion
312317
^^^^^^^^^^
@@ -415,6 +420,7 @@ Reshaping
415420
- Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`)
416421
- Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`).
417422
- Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`)
423+
- Bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`).
418424
- Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`)
419425
- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`)
420426
- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed DataFrame is sorted on all levels with the initial level sorted last (:issue:`26053`)
@@ -433,6 +439,7 @@ Other
433439

434440
- Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`)
435441
- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`).
442+
- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions.
436443

437444

438445
.. _whatsnew_0.250.contributors:

mypy.ini

-18
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,6 @@ follow_imports=silent
55
[mypy-pandas.conftest,pandas.tests.*]
66
ignore_errors=True
77

8-
[mypy-pandas.core.api]
9-
ignore_errors=True
10-
11-
[mypy-pandas.core.base]
12-
ignore_errors=True
13-
14-
[mypy-pandas.core.computation.expr]
15-
ignore_errors=True
16-
17-
[mypy-pandas.core.computation.ops]
18-
ignore_errors=True
19-
20-
[mypy-pandas.core.computation.pytables]
21-
ignore_errors=True
22-
23-
[mypy-pandas.core.indexes.base]
24-
ignore_errors=True
25-
268
[mypy-pandas.core.indexes.datetimes]
279
ignore_errors=True
2810

pandas/_libs/lib.pyx

+5-8
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ import warnings
88
import cython
99
from cython import Py_ssize_t
1010

11-
from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
12-
PyTuple_New,
13-
Py_EQ,
14-
PyObject_RichCompareBool)
11+
from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyTuple_New, PyObject_Str,
12+
Py_EQ, Py_SIZE, PyObject_RichCompareBool,
13+
PyUnicode_Join, PyList_New)
1514

1615
from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
1716
PyTime_Check, PyDelta_Check,
@@ -23,10 +22,8 @@ cimport numpy as cnp
2322
from numpy cimport (ndarray, PyArray_GETITEM,
2423
PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
2524
flatiter, NPY_OBJECT,
26-
int64_t,
27-
float32_t, float64_t,
28-
uint8_t, uint64_t,
29-
complex128_t)
25+
int64_t, float32_t, float64_t,
26+
uint8_t, uint64_t, complex128_t)
3027
cnp.import_array()
3128

3229
cdef extern from "numpy/arrayobject.h":

0 commit comments

Comments
 (0)