Skip to content

Commit 5c253a4

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ea-repr
2 parents 27db397 + 2946745 commit 5c253a4

File tree

140 files changed

+5527
-4410
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+5527
-4410
lines changed

.pep8speaks.yml

-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ pycodestyle:
1313
- W503, # line break before binary operator
1414
- W504, # line break after binary operator
1515
- E402, # module level import not at top of file
16-
- E722, # do not use bare except
1716
- E731, # do not assign a lambda expression, use a def
1817
- C406, # Unnecessary list literal - rewrite as a dict literal.
1918
- C408, # Unnecessary dict call - rewrite as a literal.

asv_bench/benchmarks/timeseries.py

+31-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from datetime import timedelta
22

3+
import dateutil
34
import numpy as np
45
from pandas import to_datetime, date_range, Series, DataFrame, period_range
56
from pandas.tseries.frequencies import infer_freq
@@ -57,7 +58,10 @@ def time_to_pydatetime(self, index_type):
5758

5859
class TzLocalize(object):
5960

60-
def setup(self):
61+
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
62+
param_names = 'tz'
63+
64+
def setup(self, tz):
6165
dst_rng = date_range(start='10/29/2000 1:00:00',
6266
end='10/29/2000 1:59:59', freq='S')
6367
self.index = date_range(start='10/29/2000',
@@ -68,8 +72,8 @@ def setup(self):
6872
end='10/29/2000 3:00:00',
6973
freq='S'))
7074

71-
def time_infer_dst(self):
72-
self.index.tz_localize('US/Eastern', ambiguous='infer')
75+
def time_infer_dst(self, tz):
76+
self.index.tz_localize(tz, ambiguous='infer')
7377

7478

7579
class ResetIndex(object):
@@ -377,15 +381,35 @@ def time_dup_string_tzoffset_dates(self, cache):
377381

378382
class DatetimeAccessor(object):
379383

380-
def setup(self):
384+
params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
385+
param_names = 'tz'
386+
387+
def setup(self, tz):
381388
N = 100000
382-
self.series = Series(date_range(start='1/1/2000', periods=N, freq='T'))
389+
self.series = Series(
390+
date_range(start='1/1/2000', periods=N, freq='T', tz=tz)
391+
)
383392

384-
def time_dt_accessor(self):
393+
def time_dt_accessor(self, tz):
385394
self.series.dt
386395

387-
def time_dt_accessor_normalize(self):
396+
def time_dt_accessor_normalize(self, tz):
388397
self.series.dt.normalize()
389398

399+
def time_dt_accessor_month_name(self, tz):
400+
self.series.dt.month_name()
401+
402+
def time_dt_accessor_day_name(self, tz):
403+
self.series.dt.day_name()
404+
405+
def time_dt_accessor_time(self, tz):
406+
self.series.dt.time
407+
408+
def time_dt_accessor_date(self, tz):
409+
self.series.dt.date
410+
411+
def time_dt_accessor_year(self, tz):
412+
self.series.dt.year
413+
390414

391415
from .pandas_vb_common import setup # noqa: F401

asv_bench/benchmarks/timestamp.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pandas import Timestamp
44
import pytz
5+
import dateutil
56

67

78
class TimestampConstruction(object):
@@ -29,7 +30,8 @@ def time_fromtimestamp(self):
2930

3031

3132
class TimestampProperties(object):
32-
_tzs = [None, pytz.timezone('Europe/Amsterdam')]
33+
_tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC,
34+
dateutil.tz.tzutc()]
3335
_freqs = [None, 'B']
3436
params = [_tzs, _freqs]
3537
param_names = ['tz', 'freq']
@@ -87,7 +89,8 @@ def time_microsecond(self, tz, freq):
8789

8890

8991
class TimestampOps(object):
90-
params = [None, 'US/Eastern']
92+
params = [None, 'US/Eastern', pytz.UTC,
93+
dateutil.tz.tzutc()]
9194
param_names = ['tz']
9295

9396
def setup(self, tz):
@@ -102,6 +105,17 @@ def time_replace_None(self, tz):
102105
def time_to_pydatetime(self, tz):
103106
self.ts.to_pydatetime()
104107

108+
def time_normalize(self, tz):
109+
self.ts.normalize()
110+
111+
def time_tz_convert(self, tz):
112+
if self.ts.tz is not None:
113+
self.ts.tz_convert(tz)
114+
115+
def time_tz_localize(self, tz):
116+
if self.ts.tz is None:
117+
self.ts.tz_localize(tz)
118+
105119

106120
class TimestampAcrossDst(object):
107121
def setup(self):

doc/source/conf.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ def linkcode_resolve(domain, info):
586586
for part in fullname.split('.'):
587587
try:
588588
obj = getattr(obj, part)
589-
except:
589+
except AttributeError:
590590
return None
591591

592592
try:
@@ -595,14 +595,14 @@ def linkcode_resolve(domain, info):
595595
fn = inspect.getsourcefile(inspect.unwrap(obj))
596596
else:
597597
fn = inspect.getsourcefile(obj)
598-
except:
598+
except TypeError:
599599
fn = None
600600
if not fn:
601601
return None
602602

603603
try:
604604
source, lineno = inspect.getsourcelines(obj)
605-
except:
605+
except OSError:
606606
lineno = None
607607

608608
if lineno:

doc/source/install.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,9 @@ Optional Dependencies
286286
`xsel <http://www.vergenet.net/~conrad/software/xsel/>`__, or
287287
`xclip <https://github.com/astrand/xclip/>`__: necessary to use
288288
:func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
289-
* `pandas-gbq <https://pandas-gbq.readthedocs.io/en/latest/install.html#dependencies>`__: for Google BigQuery I/O.
289+
* `pandas-gbq
290+
<https://pandas-gbq.readthedocs.io/en/latest/install.html#dependencies>`__:
291+
for Google BigQuery I/O. (pandas-gbq >= 0.8.0)
290292

291293

292294
* `Backports.lzma <https://pypi.org/project/backports.lzma/>`__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library.

doc/source/whatsnew/v0.24.0.rst

+31-10
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ New features
2424
the user to override the engine's default behavior to include or omit the
2525
dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
2626
- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`)
27-
27+
- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing
28+
the user to specify which decimal separator should be used in the output. (:issue:`23614`)
2829

2930
.. _whatsnew_0240.enhancements.extension_array_operators:
3031

@@ -259,9 +260,12 @@ Other Enhancements
259260
- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`)
260261
- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`)
261262
- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to
262-
reflect changes from the `Pandas-GBQ library version 0.6.0
263-
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-6-0>`__.
264-
(:issue:`21627`, :issue:`22557`)
263+
reflect changes from the `Pandas-GBQ library version 0.8.0
264+
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-8-0>`__.
265+
Adds a ``credentials`` argument, which enables the use of any kind of
266+
`google-auth credentials
267+
<https://google-auth.readthedocs.io/en/latest/>`__. (:issue:`21627`,
268+
:issue:`22557`, :issue:`23662`)
265269
- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
266270
- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
267271
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
@@ -287,6 +291,7 @@ Other Enhancements
287291
- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`)
288292
- :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object.
289293
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
294+
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained an ``axis`` parameter (:issue:`8839`)
290295

291296
.. _whatsnew_0240.api_breaking:
292297

@@ -312,17 +317,19 @@ If installed, we now require:
312317
+-----------------+-----------------+----------+
313318
| bottleneck | 1.2.0 | |
314319
+-----------------+-----------------+----------+
320+
| fastparquet | 0.1.2 | |
321+
+-----------------+-----------------+----------+
315322
| matplotlib | 2.0.0 | |
316323
+-----------------+-----------------+----------+
317324
| numexpr | 2.6.1 | |
318325
+-----------------+-----------------+----------+
319-
| pytables | 3.4.2 | |
320-
+-----------------+-----------------+----------+
321-
| scipy | 0.18.1 | |
326+
| pandas-gbq | 0.8.0 | |
322327
+-----------------+-----------------+----------+
323328
| pyarrow | 0.7.0 | |
324329
+-----------------+-----------------+----------+
325-
| fastparquet | 0.1.2 | |
330+
| pytables | 3.4.2 | |
331+
+-----------------+-----------------+----------+
332+
| scipy | 0.18.1 | |
326333
+-----------------+-----------------+----------+
327334

328335
Additionally we no longer depend on `feather-format` for feather based storage
@@ -1002,7 +1009,10 @@ Other API Changes
10021009
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
10031010
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
10041011
- Comparing :class:`Timedelta` to be less or greater than unknown types now raises a ``TypeError`` instead of returning ``False`` (:issue:`20829`)
1012+
- :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`).
1013+
- :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`).
10051014
- :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`).
1015+
- The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`)
10061016

10071017
.. _whatsnew_0240.deprecations:
10081018

@@ -1029,6 +1039,9 @@ Deprecations
10291039
- :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`)
10301040
- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`)
10311041
- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`)
1042+
- The ``keep_tz=False`` option (the default) of the ``keep_tz`` keyword of
1043+
:meth:`DatetimeIndex.to_series` is deprecated (:issue:`17832`).
1044+
- Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated. Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`)
10321045

10331046
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
10341047

@@ -1132,6 +1145,8 @@ Performance Improvements
11321145
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
11331146
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
11341147
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
1148+
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)
1149+
- Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`)
11351150

11361151

11371152
.. _whatsnew_0240.docs:
@@ -1262,8 +1277,8 @@ Numeric
12621277
Strings
12631278
^^^^^^^
12641279

1265-
-
1266-
-
1280+
- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
1281+
- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
12671282
-
12681283

12691284
Interval
@@ -1360,6 +1375,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13601375
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13611376
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
13621377
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
1378+
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
13631379
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13641380
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13651381
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
@@ -1369,8 +1385,12 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13691385
- Bug in :func:`DataFrame.to_string()` that caused representations of :class:`DataFrame` to not take up the whole window (:issue:`22984`)
13701386
- Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
13711387
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
1388+
- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`)
1389+
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
13721390
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
1391+
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
13731392
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
1393+
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
13741394
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
13751395
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
13761396

@@ -1434,6 +1454,7 @@ Sparse
14341454
- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
14351455
- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
14361456
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
1457+
- Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`)
14371458

14381459
Build Changes
14391460
^^^^^^^^^^^^^

pandas/_libs/lib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list):
22732273

22742274
k = 0
22752275
for i in range(n):
2276-
tmp = len(rows[i])
2276+
tmp = 1 if checknull(rows[i]) else len(rows[i])
22772277
if tmp > k:
22782278
k = tmp
22792279

@@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list):
22872287
except Exception:
22882288
# upcast any subclasses to tuple
22892289
for i in range(n):
2290-
row = tuple(rows[i])
2290+
row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
22912291
for j in range(len(row)):
22922292
result[i, j] = row[j]
22932293

pandas/_libs/parsers.pyx

+28-18
Original file line numberDiff line numberDiff line change
@@ -1070,18 +1070,6 @@ cdef class TextReader:
10701070

10711071
conv = self._get_converter(i, name)
10721072

1073-
# XXX
1074-
na_flist = set()
1075-
if self.na_filter:
1076-
na_list, na_flist = self._get_na_list(i, name)
1077-
if na_list is None:
1078-
na_filter = 0
1079-
else:
1080-
na_filter = 1
1081-
na_hashset = kset_from_list(na_list)
1082-
else:
1083-
na_filter = 0
1084-
10851073
col_dtype = None
10861074
if self.dtype is not None:
10871075
if isinstance(self.dtype, dict):
@@ -1106,13 +1094,34 @@ cdef class TextReader:
11061094
self.c_encoding)
11071095
continue
11081096

1109-
# Should return as the desired dtype (inferred or specified)
1110-
col_res, na_count = self._convert_tokens(
1111-
i, start, end, name, na_filter, na_hashset,
1112-
na_flist, col_dtype)
1097+
# Collect the list of NaN values associated with the column.
1098+
# If we aren't supposed to do that, or none are collected,
1099+
# we set `na_filter` to `0` (`1` otherwise).
1100+
na_flist = set()
1101+
1102+
if self.na_filter:
1103+
na_list, na_flist = self._get_na_list(i, name)
1104+
if na_list is None:
1105+
na_filter = 0
1106+
else:
1107+
na_filter = 1
1108+
na_hashset = kset_from_list(na_list)
1109+
else:
1110+
na_filter = 0
11131111

1114-
if na_filter:
1115-
self._free_na_set(na_hashset)
1112+
# Attempt to parse tokens and infer dtype of the column.
1113+
# Should return as the desired dtype (inferred or specified).
1114+
try:
1115+
col_res, na_count = self._convert_tokens(
1116+
i, start, end, name, na_filter, na_hashset,
1117+
na_flist, col_dtype)
1118+
finally:
1119+
# gh-21353
1120+
#
1121+
# Cleanup the NaN hash that we generated
1122+
# to avoid memory leaks.
1123+
if na_filter:
1124+
self._free_na_set(na_hashset)
11161125

11171126
if upcast_na and na_count > 0:
11181127
col_res = _maybe_upcast(col_res)
@@ -2059,6 +2068,7 @@ cdef kh_str_t* kset_from_list(list values) except NULL:
20592068

20602069
# None creeps in sometimes, which isn't possible here
20612070
if not isinstance(val, bytes):
2071+
kh_destroy_str(table)
20622072
raise ValueError('Must be all encoded bytes')
20632073

20642074
k = kh_put_str(table, PyBytes_AsString(val), &ret)

pandas/_libs/tslibs/ccalendar.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
4949
int_to_weekday = {num: name for num, name in enumerate(DAYS)}
5050
weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}
5151

52+
DAY_SECONDS = 86400
53+
HOUR_SECONDS = 3600
54+
5255
# ----------------------------------------------------------------------
5356

5457

0 commit comments

Comments
 (0)