From 2f981d6502e552bda67dde581e8caf230d3c595e Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Fri, 31 May 2019 13:59:58 +0530 Subject: [PATCH 01/43] convert some Unions to TypeVar --- pandas/_typing.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f5bf0dcd3e220..24ee65645905b 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import IO, AnyStr, Type, Union +from typing import IO, AnyStr, Type, TypeVar, Union import numpy as np @@ -11,12 +11,13 @@ from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries) -AnyArrayLike = Union[ABCExtensionArray, - ABCIndexClass, - ABCSeries, - ABCSparseSeries, - np.ndarray] -ArrayLike = Union[ABCExtensionArray, np.ndarray] +AnyArrayLike = TypeVar('AnyArrayLike', + ABCExtensionArray, + ABCIndexClass, + ABCSeries, + ABCSparseSeries, + np.ndarray) +ArrayLike = TypeVar('ArrayLike', ABCExtensionArray, np.ndarray) DatetimeLikeScalar = Type[Union[Period, Timestamp, Timedelta]] Dtype = Union[str, np.dtype, ExtensionDtype] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] From c2e62676cf20c9aa179a1745ea4fbcf3c65fbe73 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Fri, 31 May 2019 13:41:10 +0100 Subject: [PATCH 02/43] DOC: Fixed redirects in various parts of the documentation (#26497) --- pandas/core/arrays/categorical.py | 3 ++- pandas/core/arrays/interval.py | 2 +- pandas/core/dtypes/concat.py | 2 +- pandas/core/generic.py | 10 +++++----- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/datetimes.py | 8 ++++---- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 3 ++- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 4 ++-- pandas/core/indexing.py | 4 ++-- pandas/core/reshape/concat.py | 2 +- pandas/core/tools/datetimes.py | 2 +- pandas/core/window.py | 6 +++--- pandas/io/json/json.py | 4 ++-- pandas/io/parsers.py | 4 ++-- pandas/io/pytables.py | 6 +++--- 18 files changed, 35 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0fa705369908a..89b86c66d7b05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -272,7 +272,8 @@ class Categorical(ExtensionArray, PandasObject): Notes ----- See the `user guide - `_ for more. + `_ + for more. Examples -------- diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 94b9dc8ebab55..4f628eff43167 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -95,7 +95,7 @@ Notes ----- See the `user guide -`_ +`_ for more. 
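The practical effect of PATCH 01 is that a constrained ``TypeVar`` lets a type checker bind a function's return type to its argument type, which a plain ``Union`` cannot express. A minimal sketch of the difference, using ``list`` and ``np.ndarray`` as stand-ins for pandas' ABC classes (the names below are illustrative, not part of the patch):

    from typing import TypeVar, Union

    import numpy as np

    UnionAlias = Union[np.ndarray, list]
    ConstrainedT = TypeVar("ConstrainedT", np.ndarray, list)

    def through_union(values: UnionAlias) -> UnionAlias:
        # the checker only knows "ndarray or list" comes back
        return values

    def through_typevar(values: ConstrainedT) -> ConstrainedT:
        # the checker binds ConstrainedT: ndarray in, ndarray out
        return values

    arr = through_typevar(np.array([1, 2]))  # inferred as ndarray, not a Union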
%(examples)s\ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f8488b7a153e3..b22ed45642cf6 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -244,7 +244,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): ----- To learn more about categories, see `link - `__ + `__ Examples -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 87db069d94893..0596d0ab844ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3328,8 +3328,8 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/" - "indexing.html#indexing-view-versus-copy" + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" ) else: @@ -3338,8 +3338,8 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): "DataFrame.\n" "Try using .loc[row_indexer,col_indexer] = value " "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/" - "indexing.html#indexing-view-versus-copy" + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" ) if value == 'raise': @@ -7762,7 +7762,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 91bb71a1a8af7..2b190c53da53d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -219,7 +219,7 @@ class providing the base-class of operations. Notes ----- See more `here -`_ +`_ Examples -------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 04d407ebc670d..febfdc7bdf908 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -49,7 +49,7 @@ class Grouper: This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification of available frequencies, please see `here - `_. + `_. axis : number/name of the axis, defaults to 0 sort : boolean, default to False whether to sort the resulting labels diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e68431b79dcd3..1bf3cb86811cb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -215,7 +215,7 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Creating a DatetimeIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`date_range`. @@ -1377,7 +1377,7 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1533,7 +1533,7 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, desired. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1605,7 +1605,7 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, must be specified. To learn more about the frequency strings, please see `this link - `__. 
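Among the links PATCH 02 rewrites are the two embedded in the ``SettingWithCopy`` warning text above. For context, a minimal sketch of the chained assignment that triggers that warning (illustrative only, not part of the patch):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    # Chained assignment: the first indexing step may return a copy,
    # so the write can be silently lost; pandas warns and points to the
    # "returning-a-view-versus-a-copy" section linked above.
    df[df["a"] > 1]["b"] = 0.0

    # The supported spelling suggested by the same warning:
    df.loc[df["a"] > 1, "b"] = 0.0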
+ `__. Returns ------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 53e1a36c48994..41cf23c5542a9 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1215,7 +1215,7 @@ def interval_range(start=None, end=None, periods=None, freq=None, ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f1553d9db835f..ec2cc70d1a352 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -182,7 +182,8 @@ class MultiIndex(Index): Notes ----- See the `user guide - `_ for more. + `_ + for more. Examples -------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 64272431cf703..b20b0c6f853d9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -939,7 +939,7 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None): must be specified. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6ae17e62b49c6..0574a4b41c920 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -141,7 +141,7 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Creating a TimedeltaIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`timedelta_range`. @@ -730,7 +730,7 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 86158fa9ee529..7f4827be6dff7 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1190,7 +1190,7 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""") # noqa + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""") # noqa if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, @@ -1339,7 +1339,7 @@ class _IXIndexer(_NDFrameIndexer): .iloc for positional indexing See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated""") # noqa + http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""") # noqa def __init__(self, name, obj): warnings.warn(self._ix_deprecation_warning, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ee3ed3899a55f..4523a6ad48f19 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -100,7 +100,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, A walkthrough of how this method fits in with other tools for combining pandas objects can be found `here - `__. + `__. 
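Many of the redirects above point at the user guide's frequency-strings section. A short sketch of what those strings control in the range constructors touched here (outputs omitted; this is documented behaviour, not something introduced by the patch):

    import pandas as pd

    pd.date_range("2019-01-01", periods=3, freq="D")   # calendar days
    pd.bdate_range("2019-01-01", periods=3)            # business days ("B")
    pd.period_range("2019-01", periods=3, freq="M")    # monthly periods
    pd.timedelta_range(0, periods=4, freq="6H")        # fixed 6-hour steps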
Examples -------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 817d539d4ad6f..0756bdb3777ec 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -533,7 +533,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, dtype: datetime64[ns] If a date does not meet the `timestamp limitations - `_, passing errors='ignore' will return the original input instead of raising any exception. diff --git a/pandas/core/window.py b/pandas/core/window.py index d51e12035c829..f332075380c79 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -462,7 +462,7 @@ class Window(_Window): See the notes below for further information. on : str, optional For a DataFrame, column on which to calculate - the rolling window, rather than the index + the rolling window, rather than the index. axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or @@ -488,7 +488,7 @@ class Window(_Window): changed to the center of the window by setting ``center=True``. To learn more about the offsets & frequency strings, please see `this link - `__. + `__. The recognized win_types are: @@ -2188,7 +2188,7 @@ class EWM(_Rolling): (if adjust is True), and 1-alpha and alpha (if adjust is False). More details can be found at - http://pandas.pydata.org/pandas-docs/stable/computation.html#exponentially-weighted-windows + http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows Examples -------- diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index ee9d9e000d7e3..20bed9bff7383 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -330,8 +330,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, chunksize : integer, default None Return JsonReader object for iteration. - See the `line-delimted json docs - `_ + See the `line-delimited json docs + `_ for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c65c11e840c27..bcbdd80865360 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -58,7 +58,7 @@ into chunks. Additional help can be found in the online docs for -`IO Tools `_. +`IO Tools `_. Parameters ---------- @@ -753,7 +753,7 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, into chunks. Additional help can be found in the `online docs for IO Tools - `_. + `_. Parameters ---------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 11f705e88179d..53ef2395a302a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -867,8 +867,8 @@ def put(self, key, value, format=None, append=False, **kwargs): This will force Table format, append the input data to the existing. data_columns : list of columns to create as data columns, or True to - use all columns. See - `here `__ # noqa + use all columns. See `here + `__. encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' @@ -949,7 +949,7 @@ def append(self, key, value, format=None, append=True, columns=None, List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here - `__. + `__. 
min_itemsize : dict of columns that specify minimum string sizes
 nan_rep : string to use as string nan represenation
 chunksize : size to chunk the writing

From 805d7e8c219f804f1129fdc9e4115cf3d65b2b57 Mon Sep 17 00:00:00 2001
From: h-vetinari <33685575+h-vetinari@users.noreply.github.com>
Date: Sat, 1 Jun 2019 02:17:53 +0200
Subject: [PATCH 03/43] TST: Datetime conftest.py improvements (#26596)

xref gh-23537
---
 pandas/conftest.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 3c411f8ba3e31..8f71028f51ab4 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -376,10 +376,16 @@ def unique_nulls_fixture(request):
              FixedOffset(0), FixedOffset(-300), timezone.utc,
              timezone(timedelta(hours=1)),
              timezone(timedelta(hours=-1), name='foo')]
+TIMEZONE_IDS = ['None', 'UTC', 'US/Eastern', 'Asia/Tokyo',
+                'dateutil/US/Pacific', 'dateutil/Asia/Singapore',
+                'dateutil.tz.tzutc()', 'dateutil.tz.tzlocal()',
+                'pytz.FixedOffset(300)', 'pytz.FixedOffset(0)',
+                'pytz.FixedOffset(-300)', 'datetime.timezone.utc',
+                'datetime.timezone.+1', 'datetime.timezone.-1.named']

-@td.parametrize_fixture_doc(str(TIMEZONES))
-@pytest.fixture(params=TIMEZONES)
+@td.parametrize_fixture_doc(str(TIMEZONE_IDS))
+@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS)
 def tz_naive_fixture(request):
     """
     Fixture for trying timezones including default (None): {0}
@@ -387,8 +393,8 @@ def tz_naive_fixture(request):
     return request.param

-@td.parametrize_fixture_doc(str(TIMEZONES[1:]))
-@pytest.fixture(params=TIMEZONES[1:])
+@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:]))
+@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:])
 def tz_aware_fixture(request):
     """
     Fixture for trying explicit timezones: {0}
@@ -398,6 +404,8 @@ def tz_aware_fixture(request):

 # ----------------------------------------------------------------
 # Dtypes
+# ----------------------------------------------------------------
+
 UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"]
 SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"]
@@ -409,8 +417,8 @@ def tz_aware_fixture(request):
 COMPLEX_DTYPES = [complex, "complex64", "complex128"]
 STRING_DTYPES = [str, 'str', 'U']

-DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]']
-TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]']
+DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]']
+TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]']

 BOOL_DTYPES = [bool, 'bool']
 BYTES_DTYPES = [bytes, 'bytes']
@@ -418,7 +426,7 @@ def tz_aware_fixture(request):
 ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
 ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES +
-                    DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES +
+                    DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES +
                     OBJECT_DTYPES + BYTES_DTYPES)

From c591569f5fb55b73bef1bcd541689afc03f0861d Mon Sep 17 00:00:00 2001
From: Alexander Nordin
Date: Sat, 1 Jun 2019 10:04:14 -0400
Subject: [PATCH 04/43] ERR: better error message on too large excel sheet
 (#26080)

---
 doc/source/whatsnew/v0.25.0.rst |  1 +
 pandas/io/formats/excel.py      | 10 ++++++++++
 pandas/tests/io/test_excel.py   | 18 ++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 89a9da4a73b35..ae5b6aafe4c7d 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -533,6 +533,7 @@ I/O
 - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with
numeric data (:issue:`24889`) - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) +- :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) Plotting ^^^^^^^^ diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index fd6e3304ec4ef..4db00e34b39e2 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -341,6 +341,9 @@ class ExcelFormatter: This is only called for body cells. """ + max_rows = 2**20 + max_cols = 2**14 + def __init__(self, df, na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, merge_cells=False, inf_rep='inf', style_converter=None): @@ -648,6 +651,13 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, from pandas.io.excel import ExcelWriter from pandas.io.common import _stringify_path + num_rows, num_cols = self.df.shape + if num_rows > self.max_rows or num_cols > self.max_cols: + raise ValueError("This sheet is too large! Your sheet size is: " + + "{}, {} ".format(num_rows, num_cols) + + "Max sheet size is: {}, {}". + format(self.max_rows, self.max_cols)) + if isinstance(writer, ExcelWriter): need_save = False else: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 1421fc94b67f4..7693caf3b31d2 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1118,6 +1118,24 @@ class and any subclasses, on account of the `autouse=True` class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. + def test_excel_sheet_size(self): + + # GH 26080 + breaking_row_count = 2**20 + 1 + breaking_col_count = 2**14 + 1 + # purposely using two arrays to prevent memory issues while testing + row_arr = np.zeros(shape=(breaking_row_count, 1)) + col_arr = np.zeros(shape=(1, breaking_col_count)) + row_df = pd.DataFrame(row_arr) + col_df = pd.DataFrame(col_arr) + + msg = "sheet is too large" + with pytest.raises(ValueError, match=msg): + row_df.to_excel(self.path) + + with pytest.raises(ValueError, match=msg): + col_df.to_excel(self.path) + def test_excel_sheet_by_name_raise(self, *_): import xlrd From cfa03b6d1c4f6ea1b0cddbff3213c47405005c41 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 1 Jun 2019 14:08:20 +0000 Subject: [PATCH 05/43] CLN: remove sample_time attributes from benchmarks (#26598) --- asv_bench/benchmarks/index_object.py | 1 - asv_bench/benchmarks/rolling.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 0fdf46e7c64de..896a20bae2069 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -52,7 +52,6 @@ def time_is_dates_only(self): class Ops: - sample_time = 0.2 params = ['float', 'int'] param_names = ['dtype'] diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 2532d326dff4b..033b466c8b9be 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -4,7 +4,6 @@ class Methods: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], @@ -23,7 +22,6 @@ def time_rolling(self, constructor, window, dtype, method): class ExpandingMethods: - sample_time = 0.2 params = (['DataFrame', 'Series'], ['int', 'float'], ['median', 'mean', 
'max', 'min', 'std', 'count', 'skew', 'kurt', @@ -41,7 +39,6 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], @@ -58,7 +55,6 @@ def time_ewm(self, constructor, window, dtype, method): class VariableWindowMethods(Methods): - sample_time = 0.2 params = (['DataFrame', 'Series'], ['50s', '1h', '1d'], ['int', 'float'], @@ -75,7 +71,6 @@ def setup(self, constructor, window, dtype, method): class Pairwise: - sample_time = 0.2 params = ([10, 1000, None], ['corr', 'cov'], [True, False]) @@ -95,7 +90,6 @@ def time_pairwise(self, window, method, pairwise): class Quantile: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], From e6f21d89d5e7dc66cc5c4526ff331a5309cd815e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 1 Jun 2019 15:09:27 +0100 Subject: [PATCH 06/43] TST: add concrete examples of dataframe fixtures to docstrings (#26593) --- pandas/tests/frame/conftest.py | 169 +++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 27c0e070c10c2..c451cd58f1497 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -11,6 +11,25 @@ def float_frame(): Fixture for DataFrame of floats with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] """ return DataFrame(tm.getSeriesData()) @@ -21,6 +40,25 @@ def float_frame_with_na(): Fixture for DataFrame of floats with index of unique strings Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 + DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 + neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 + 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 + 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 + soujjZ0A08 NaN NaN NaN NaN + 7W6NLGsjB9 NaN NaN NaN NaN + ... ... ... ... ... 
+ uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 + n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 + ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 + uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 + 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 + 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 + sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) # set some NAs @@ -35,6 +73,25 @@ def bool_frame_with_na(): Fixture for DataFrame of booleans with index of unique strings Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + zBZxY2IDGd False False False False + IhBWBMWllt False True True True + ctjdvZSR6R True False True True + AVTujptmxb False True False True + G9lrImrSWq False False False True + sFFwdIUfz2 NaN NaN NaN NaN + s15ptEJnRb NaN NaN NaN NaN + ... ... ... ... ... + UW41KkDyZ4 True True False False + l9l6XkOdqV True False False False + X2MeZfzDYA False True False False + xWkIKU7vfX False True False True + QOhL6VmpGU False False False True + 22PwkRJdat False True False False + kfboQ3VeIK True False True False + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) > 0 df = df.astype(object) @@ -50,6 +107,25 @@ def int_frame(): Fixture for DataFrame of ints with index of unique strings Columns are ['A', 'B', 'C', 'D'] + + A B C D + vpBeWjM651 1 0 1 0 + 5JyxmrP1En -1 0 0 0 + qEDaoD49U2 -1 1 0 0 + m66TkTfsFe 0 0 0 0 + EHPaNzEUFm -1 0 -1 0 + fpRJCevQhi 2 0 0 0 + OlQvnmfi3Q 0 0 -2 0 + ... .. .. .. .. + uB1FPlz4uP 0 0 0 1 + EcSe6yNzCU 0 0 -1 0 + L50VudaiI8 -1 1 -2 0 + y3bpw4nwIp 0 -1 0 0 + H0RdLLwrCT 1 1 0 0 + rY82K0vMwm 0 0 0 0 + 1OPIUjnkjk 2 0 0 0 + + [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) # force these all to int64 to avoid platform testing issues @@ -62,6 +138,25 @@ def datetime_frame(): Fixture for DataFrame of floats with DatetimeIndex Columns are ['A', 'B', 'C', 'D'] + + A B C D + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 + ... ... ... ... ... + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 + + [30 rows x 4 columns] """ return DataFrame(tm.getTimeSeriesData()) @@ -72,6 +167,25 @@ def float_string_frame(): Fixture for DataFrame of floats and strings with index of unique strings Columns are ['A', 'B', 'C', 'D', 'foo']. + + A B C D foo + w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar + PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar + ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar + 3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar + khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar + LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar + HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar + ... ... ... ... ... ... 
+ 9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar + h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar + mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar + oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar + 9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar + jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar + lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar + + [30 rows x 5 columns] """ df = DataFrame(tm.getSeriesData()) df['foo'] = 'bar' @@ -84,6 +198,25 @@ def mixed_float_frame(): Fixture for DataFrame of different float types with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993 + KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588 + VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731 + kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607 + CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266 + 0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541 + tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710 + ... ... ... ... ... + 7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237 + 4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612 + B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653 + hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427 + 1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827 + 9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204 + xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502 + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) df.A = df.A.astype('float32') @@ -99,6 +232,25 @@ def mixed_int_frame(): Fixture for DataFrame of different int types with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + mUrCZ67juP 0 1 2 2 + rw99ACYaKS 0 1 0 0 + 7QsEcpaaVU 0 1 1 1 + xkrimI2pcE 0 1 0 0 + dz01SuzoS8 0 1 255 255 + ccQkqOHX75 -1 1 0 0 + DN0iXaoDLd 0 1 0 0 + ... .. .. ... ... + Dfb141wAaQ 1 1 254 254 + IPD8eQOVu5 0 1 0 0 + CcaKulsCmv 0 1 0 0 + rIBa8gu7E5 0 1 0 0 + RP6peZmh5o 0 1 1 1 + NMb9pipQWQ 0 1 0 0 + PqgbJEzjib 0 1 3 3 + + [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) df.A = df.A.astype('int32') @@ -114,6 +266,11 @@ def timezone_frame(): Fixture for DataFrame of date_range Series with different time zones Columns are ['A', 'B', 'C']; some entries are missing + + A B C + 0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00 + 1 2013-01-02 NaT NaT + 2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00 """ df = DataFrame({'A': date_range('20130101', periods=3), 'B': date_range('20130101', periods=3, @@ -131,6 +288,11 @@ def simple_frame(): Fixture for simple 3x3 DataFrame Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. + + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 """ arr = np.array([[1., 2., 3.], [4., 5., 6.], @@ -147,6 +309,13 @@ def frame_of_index_cols(): Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
+
+         A      B  C         D         E  (tuple, as, label)
+    0  foo    one  a  0.608477 -0.012500            -1.664297
+    1  foo    two  b -0.633460  0.249614            -0.364411
+    2  foo  three  c  0.615256  2.154968            -0.834666
+    3  bar    one  d  0.234246  1.085675             0.718445
+    4  bar    two  e  0.533841 -0.005702            -3.533912
     """
     df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                     'B': ['one', 'two', 'three', 'one', 'two'],

From dbafe6f0cb4c9e5b38c2dc159f461f651382a153 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sat, 1 Jun 2019 15:12:40 +0100
Subject: [PATCH 07/43] CI/DOC: Building documentation with azure (#26591)

---
 .travis.yml                |  4 +--
 azure-pipelines.yml        | 62 +++++++++++++++++++++++++++++++++-
 ci/deps/travis-36-doc.yaml | 46 ----------------------------
 3 files changed, 63 insertions(+), 49 deletions(-)
 delete mode 100644 ci/deps/travis-36-doc.yaml

diff --git a/.travis.yml b/.travis.yml
index ce8817133a477..90dd904e6cb1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,14 +51,14 @@ matrix:
     # In allow_failures
     - dist: trusty
       env:
-        - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
+        - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true

   allow_failures:
     - dist: trusty
       env:
        - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
     - dist: trusty
       env:
-        - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
+        - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true

 before_install:
   - echo "before_install"

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 17eaee5458af8..9f83917024049 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -15,7 +15,7 @@ jobs:
       name: Windows
       vmImage: vs2017-win2016

-- job: 'Checks_and_doc'
+- job: 'Checks'
   pool:
     vmImage: ubuntu-16.04
   timeoutInMinutes: 90
@@ -116,3 +116,63 @@ jobs:
           fi
       displayName: 'Running benchmarks'
       condition: true
+
+- job: 'Docs'
+  pool:
+    vmImage: ubuntu-16.04
+  timeoutInMinutes: 90
+  steps:
+  - script: |
+      echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev'
+      echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
+    displayName: 'Setting environment variables'
+
+  - script: |
+      export PATH=$HOME/miniconda3/bin:$PATH
+      sudo apt-get install -y libc6-dev-i386
+      ci/setup_env.sh
+    displayName: 'Setup environment and build pandas'
+
+  - script: |
+      export PATH=$HOME/miniconda3/bin:$PATH
+      source activate pandas-dev
+      doc/make.py
+    displayName: 'Build documentation'
+
+  - script: |
+      cd doc/build/html
+      git init
+      touch .nojekyll
+      git add --all .
+      git config user.email "pandas-dev@python.org"
+      git config user.name "pandas-docs-bot"
+      git commit -m "pandas documentation in master"
+    displayName: 'Create git repo for docs build'
+    condition : |
+      and(not(eq(variables['Build.Reason'], 'PullRequest')),
+          eq(variables['Build.SourceBranch'], 'refs/heads/master'))
+
+  # For this task to work, the following steps are required:
+  # 1. Go to "Library > Secure files" in the azure-pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles
+  # 2. Click on "+ Secure file"
+  # 3. Upload the private key (the name of the file must match the one specified in the "sshKeySecureFile" input below, "pandas_docs_key")
+  # 4. Click on the file name after it is created, tick the box "Authorize for use in all pipelines" and save
+  # 5.
The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be specified as a deploy key of the repo where the docs will be pushed: https://github.com/pandas-dev/pandas-dev.github.io/settings/keys + - task: InstallSSHKey@0 + inputs: + hostName: 'github.com' + sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org' + sshKeySecureFile: 'pandas_docs_key' + displayName: 'Install GitHub ssh deployment key' + condition : | + and(not(eq(variables['Build.Reason'], 'PullRequest')), + eq(variables['Build.SourceBranch'], 'refs/heads/master')) + + - script: | + cd doc/build/html + git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git + git push origin master -f + displayName: 'Publish docs to GitHub pages' + condition : | + and(not(eq(variables['Build.Reason'], 'PullRequest')), + eq(variables['Build.SourceBranch'], 'refs/heads/master')) diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml deleted file mode 100644 index 9d6cbd82fdc05..0000000000000 --- a/ci/deps/travis-36-doc.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - beautifulsoup4 - - bottleneck - - cython>=0.28.2 - - fastparquet>=0.2.1 - - gitpython - - html5lib - - hypothesis>=3.58.0 - - ipykernel - - ipython - - ipywidgets - - lxml - - matplotlib - - nbconvert>=5.4.1 - - nbformat - - nbsphinx - - notebook>=5.7.5 - - numexpr - - numpy - - numpydoc - - openpyxl - - pandoc - - pyarrow - - pyqt - - pytables - - python-dateutil - - python-snappy - - python=3.6.* - - pytz - - scipy - - seaborn - - sphinx - - sqlalchemy - - statsmodels - - xarray - - xlrd - - xlsxwriter - - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist - - isort From eb4b0b5fae97d6a7ef6f83f6993103a80413f55d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 1 Jun 2019 09:35:25 -0500 Subject: [PATCH 08/43] DOC: sparse doc fixups (#26571) --- doc/source/user_guide/sparse.rst | 2 +- doc/source/whatsnew/v0.16.0.rst | 2 ++ doc/source/whatsnew/v0.18.1.rst | 2 ++ doc/source/whatsnew/v0.19.0.rst | 2 ++ doc/source/whatsnew/v0.20.0.rst | 1 + pandas/core/sparse/frame.py | 2 +- pandas/core/sparse/series.py | 2 +- 7 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 8fed29d7a6316..09ed895a847ff 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -269,7 +269,7 @@ have no replacement. Interaction with scipy.sparse ----------------------------- -Use :meth:`DataFrame.sparse.from_coo` to create a ``DataFrame`` with sparse values from a sparse matrix. +Use :meth:`DataFrame.sparse.from_spmatrix` to create a ``DataFrame`` with sparse values from a sparse matrix. .. 
versionadded:: 0.25.0 diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 1e4ec682f0504..2cb09325c9466 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -92,6 +92,7 @@ Interaction with scipy.sparse Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels: .. ipython:: python + :okwarning: s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), @@ -121,6 +122,7 @@ The from_coo method is a convenience method for creating a ``SparseSeries`` from a ``scipy.sparse.coo_matrix``: .. ipython:: python + :okwarning: from scipy import sparse A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index f099ccf284bc2..069395c2e0f36 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -394,6 +394,7 @@ used in the ``pandas`` implementation (:issue:`12644`, :issue:`12638`, :issue:`1 An example of this signature augmentation is illustrated below: .. ipython:: python + :okwarning: sp = pd.SparseDataFrame([1, 2, 3]) sp @@ -409,6 +410,7 @@ Previous behaviour: New behaviour: .. ipython:: python + :okwarning: np.cumsum(sp, axis=0) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 29eeb415e2f6d..de29a1eb93709 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1236,6 +1236,7 @@ Operators now preserve dtypes - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) .. ipython:: python + :okwarning: s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) s.dtype @@ -1245,6 +1246,7 @@ Operators now preserve dtypes - Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) .. ipython:: python + :okwarning: s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) s diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 741aa6ca143bb..6a88a5810eca4 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -339,6 +339,7 @@ See the :ref:`documentation ` for more information. (:issue: All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. .. ipython:: python + :okwarning: from scipy.sparse import csr_matrix arr = np.random.random(size=(1000, 5)) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index fa3cd781eaf88..bf1cec7571f4d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -42,7 +42,7 @@ class SparseDataFrame(DataFrame): DataFrame containing sparse floating point data in the form of SparseSeries objects - .. deprectaed:: 0.25.0 + .. deprecated:: 0.25.0 Use a DataFrame with sparse values instead. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index e4f8579a398dd..3f95acdbfb42c 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -46,7 +46,7 @@ class SparseSeries(Series): """Data structure for labeled, sparse floating point data - .. deprectaed:: 0.25.0 + .. deprecated:: 0.25.0 Use a Series with sparse values instead. 
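For readers following the deprecation notes added in PATCH 08, a sketch of the replacement spelling those notes point to — a regular Series holding sparse values (illustrative, outputs shown as comments):

    import pandas as pd

    # Instead of the deprecated SparseSeries/SparseDataFrame subclasses,
    # hold sparse values inside an ordinary Series:
    s = pd.Series(pd.SparseArray([0, 0, 1, 0], fill_value=0))
    s.dtype            # Sparse[int64, 0]
    s.sparse.density   # fraction of non-fill values, here 0.25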
From 5dedbfa2ba1c770d5b58d4d7dcc0aca2e8b4059d Mon Sep 17 00:00:00 2001
From: nathalier
Date: Sat, 1 Jun 2019 15:45:06 +0100
Subject: [PATCH 09/43] BUG: ignore errors for invalid dates in to_datetime()
 with errors=coerce (#25512) (#26561)

---
 doc/source/whatsnew/v0.25.0.rst              |  1 +
 pandas/core/tools/datetimes.py               |  6 +++---
 pandas/tests/indexes/datetimes/test_tools.py | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index ae5b6aafe4c7d..a62cac7a94bbd 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -427,6 +427,7 @@ Datetimelike
 - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
 - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`)
 - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`)
+- Bug in :func:`to_datetime` which raised an unhandled ``OverflowError`` when called with a mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``errors='coerce'`` (:issue:`25512`)

 Timedelta
 ^^^^^^^^^

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0756bdb3777ec..73119671550a5 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -775,21 +775,21 @@ def calc_with_mask(carg, mask):
     # try intlike / strings that are ints
     try:
         return calc(arg.astype(np.int64))
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     # a float with actual np.nan
     try:
         carg = arg.astype(np.float64)
         return calc_with_mask(carg, notna(carg))
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     # string with NaN-like
     try:
         mask = ~algorithms.isin(arg, list(tslib.nat_strings))
         return calc_with_mask(arg, mask)
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     return None

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index d62d8d1276fec..c507c31ee54dd 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -96,6 +96,25 @@ def test_to_datetime_format_YYYYMMDD(self, cache):
         result = pd.to_datetime(s, format='%Y%m%d', errors='coerce',
                                 cache=cache)
         expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]')
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("input_s, expected", [
+        # NaN before strings with invalid date values
+        [Series(['19801222', np.nan, '20010012', '10019999']),
+         Series([Timestamp('19801222'), np.nan, np.nan, np.nan])],
+        # NaN after strings with invalid date values
+        [Series(['19801222', '20010012', '10019999', np.nan]),
+         Series([Timestamp('19801222'), np.nan, np.nan, np.nan])],
+        # NaN before integers with invalid date values
+        [Series([20190813, np.nan, 20010012, 20019999]),
+         Series([Timestamp('20190813'), np.nan, np.nan, np.nan])],
+        # NaN after integers with invalid date values
+        [Series([20190813, 20010012, np.nan, 20019999]),
+         Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]])
+    def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected):
+        # GH 25512
+        # format='%Y%m%d', errors='coerce'
+        result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce')
         assert_series_equal(result, expected)

     @pytest.mark.parametrize('cache', [True,
False]) From 3457fb2f7370317f8927e7e5e2b79f5b93357c66 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Sat, 1 Jun 2019 22:48:37 +0800 Subject: [PATCH 10/43] TST/CLN: Fixturize tests/frame/test_quantile.py (#26556) --- pandas/tests/frame/test_quantile.py | 56 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index a5771839e0997..9ccbd290923ba 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -3,24 +3,24 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameQuantile(TestData): +class TestDataFrameQuantile: - def test_quantile(self): + def test_quantile(self, datetime_frame): from numpy import percentile - q = self.tsframe.quantile(0.1, axis=0) - assert q['A'] == percentile(self.tsframe['A'], 10) - tm.assert_index_equal(q.index, self.tsframe.columns) + df = datetime_frame + q = df.quantile(0.1, axis=0) + assert q['A'] == percentile(df['A'], 10) + tm.assert_index_equal(q.index, df.columns) - q = self.tsframe.quantile(0.9, axis=1) + q = df.quantile(0.9, axis=1) assert (q['2000-01-17'] == - percentile(self.tsframe.loc['2000-01-17'], 90)) - tm.assert_index_equal(q.index, self.tsframe.index) + percentile(df.loc['2000-01-17'], 90)) + tm.assert_index_equal(q.index, df.index) # test degenerate case q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) @@ -99,18 +99,6 @@ def test_quantile_axis_parameter(self): def test_quantile_interpolation(self): # see gh-10174 - from numpy import percentile - - # interpolation = linear (default case) - q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') - assert q['A'] == percentile(self.tsframe['A'], 10) - q = self.intframe.quantile(0.1) - assert q['A'] == percentile(self.intframe['A'], 10) - - # test with and without interpolation keyword - q1 = self.intframe.quantile(0.1) - assert q1['A'] == np.percentile(self.intframe['A'], 10) - tm.assert_series_equal(q, q1) # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) @@ -155,6 +143,28 @@ def test_quantile_interpolation(self): index=[.25, .5], columns=['a', 'b', 'c']) assert_frame_equal(result, expected) + def test_quantile_interpolation_datetime(self, datetime_frame): + # see gh-10174 + + # interpolation = linear (default case) + df = datetime_frame + q = df.quantile(0.1, axis=0, interpolation='linear') + assert q['A'] == np.percentile(df['A'], 10) + + def test_quantile_interpolation_int(self, int_frame): + # see gh-10174 + + df = int_frame + # interpolation = linear (default case) + q = df.quantile(0.1) + assert q['A'] == np.percentile(df['A'], 10) + + # test with and without interpolation keyword + # TODO: q1 is not different from q + q1 = df.quantile(0.1) + assert q1['A'] == np.percentile(df['A'], 10) + tm.assert_series_equal(q, q1) + def test_quantile_multi(self): df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['a', 'b', 'c']) @@ -214,11 +224,11 @@ def test_quantile_datetime(self): # result = df[['a', 'c']].quantile(.5) # result = df[['a', 'c']].quantile([.5]) - def test_quantile_invalid(self): + def test_quantile_invalid(self, datetime_frame): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, 
match=msg): - self.tsframe.quantile(invalid) + datetime_frame.quantile(invalid) def test_quantile_box(self): df = DataFrame({'A': [pd.Timestamp('2011-01-01'), From 605476ebb6e42be17196b295a3d3aa97f385896b Mon Sep 17 00:00:00 2001 From: Big Head Date: Sat, 1 Jun 2019 10:51:27 -0400 Subject: [PATCH 11/43] BUG: fix categorical comparison with missing values (#26504 ) (#26514) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/arrays/categorical.py | 13 +++++--- .../arrays/categorical/test_operators.py | 32 ++++++++++++++++++- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a62cac7a94bbd..61182b9fa32f2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -414,7 +414,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 89b86c66d7b05..44bb44457bc25 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -89,18 +89,23 @@ def f(self, other): else: other_codes = other._codes - na_mask = (self._codes == -1) | (other_codes == -1) + mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) ret = f(other_codes) - if na_mask.any(): + if mask.any(): # In other series, the leads to False, so do that here too - ret[na_mask] = False + ret[mask] = False return ret if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - return getattr(self._codes, op)(i) + ret = getattr(self._codes, op)(i) + + # check for NaN in self + mask = (self._codes == -1) + ret[mask] = False + return ret else: if op == '__eq__': return np.repeat(False, len(self)) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index dc6e1a5bc36b3..a443408bf9479 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -1,4 +1,5 @@ import operator +import warnings import numpy as np import pytest @@ -17,7 +18,6 @@ def test_categories_none_comparisons(self): tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] expected = self.factor[np.asarray(self.factor) == 'a'] tm.assert_categorical_equal(result, expected) @@ -186,6 +186,36 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne): + # https://github.com/pandas-dev/pandas/issues/26504 + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing + # values should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + scalar = 2 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + expected = getattr(np.array(cat), + compare_operators_no_eq_ne)(scalar) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + + def 
test_comparison_of_ordered_categorical_with_nan_to_listlike( + self, compare_operators_no_eq_ne): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical + # with listlike should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + tm.assert_numpy_array_equal(actual, expected) + @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] From a69d56f9491d72567c870c8dec874a9dd1ccc027 Mon Sep 17 00:00:00 2001 From: enisnazif Date: Sat, 1 Jun 2019 15:52:35 +0100 Subject: [PATCH 12/43] Fix the output of df.describe on an empty categorical / object column (#26474) --- doc/source/whatsnew/v0.25.0.rst | 28 ++++++++++++++++++++++++++++ pandas/core/arrays/categorical.py | 2 +- pandas/core/generic.py | 6 ++++++ pandas/tests/frame/test_analytics.py | 11 +++++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 61182b9fa32f2..ebca80025b9f7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -253,6 +253,34 @@ are returned. (:issue:`21521`) df.groupby("a").ffill() +``DataFrame`` describe on an empty categorical / object column will return top and freq +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When calling :meth:`DataFrame.describe` with an empty categorical / object +column, the 'top' and 'freq' columns were previously omitted, which was inconsistent with +the output for non-empty columns. Now the 'top' and 'freq' columns will always be included, +with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`) + +.. ipython:: python + + df = pd.DataFrame({"empty_col": pd.Categorical([])}) + df + +*Previous Behavior*: + +.. code-block:: python + + In [3]: df.describe() + Out[3]: + empty_col + count 0 + unique 0 + +*New Behavior*: + +.. 
ipython:: python
+
+    df.describe()

 ``__str__`` methods now call ``__repr__`` rather than vica-versa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 44bb44457bc25..49dd0041854bc 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1483,7 +1483,7 @@ def value_counts(self, dropna=True):

     if dropna or clean:
         obs = code if clean else code[mask]
-        count = bincount(obs, minlength=ncat or None)
+        count = bincount(obs, minlength=ncat or 0)
     else:
         count = bincount(np.where(mask, code, ncat))
         ix = np.append(ix, -1)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0596d0ab844ec..7ca2c52e18c41 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9920,6 +9920,12 @@ def describe_categorical_1d(data):
                 names += ['top', 'freq']
                 result += [top, freq]

+            # If the DataFrame is empty, set 'top' and 'freq' to None
+            # to maintain output shape consistency
+            else:
+                names += ['top', 'freq']
+                result += [None, None]
+
             return pd.Series(result, index=names, name=data.name)

         def describe_1d(data):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index effe7eb47323d..487ff7932ec5f 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -588,6 +588,16 @@ def test_describe_categorical(self):
         result = df3.describe()
         tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)

+    def test_describe_empty_categorical_column(self):
+        # GH 26397
+        # Ensure the index of an empty categorical DataFrame column
+        # also contains (count, unique, top, freq)
+        df = pd.DataFrame({"empty_col": Categorical([])})
+        result = df.describe()
+        expected = DataFrame({'empty_col': [0, 0, None, None]},
+                             index=['count', 'unique', 'top', 'freq'])
+        tm.assert_frame_equal(result, expected)
+
     def test_describe_categorical_columns(self):
         # GH 11558
         columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
@@ -608,6 +618,7 @@ def test_describe_categorical_columns(self):
                              index=['count', 'mean', 'std', 'min', '25%',
                                     '50%', '75%', 'max'],
                              columns=exp_columns)
+        tm.assert_frame_equal(result, expected)
         tm.assert_categorical_equal(result.columns.values,
                                     expected.columns.values)

From 210e2dcd43d2055ca1888d07e8f49961ef60ab5e Mon Sep 17 00:00:00 2001
From: Jiang Yue <35633013+jiangyue12392@users.noreply.github.com>
Date: Sat, 1 Jun 2019 22:56:34 +0800
Subject: [PATCH 13/43] BUG: MultiIndex not dropping nan level and invalid
 code value (#26408)

---
 doc/source/whatsnew/v0.25.0.rst             | 37 ++++++++++-
 pandas/core/indexes/multi.py                | 62 ++++++++++++++++---
 .../tests/indexes/multi/test_constructor.py | 41 +++++++++++-
 pandas/tests/indexes/multi/test_missing.py  | 15 +++++
 4 files changed, 143 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index ebca80025b9f7..3275223b159f8 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -119,6 +119,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)

     df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']

+
+.. _whatsnew_0250.api_breaking.multi_indexing:
+
+
+MultiIndex constructed from levels and codes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, constructing a :class:`MultiIndex` with NaN levels or with code values < -1 was allowed.
+Now, construction with code values < -1 is not allowed, and the codes corresponding
+to NaN levels are reassigned as -1.
(:issue:`19387`) + +.. ipython:: python + + mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) + +*Previous Behavior*: + +.. code-block:: ipython + + In [1]: mi1 + Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + In [2]: mi2 + Out[2]: MultiIndex(levels=[[1, 2]], + codes=[[0, -2]]) + +*New Behavior*: + +.. ipython:: python + + mi1 + mi2 + + .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: GroupBy.apply on ``DataFrame`` evaluates first group only once @@ -536,7 +572,6 @@ MultiIndex - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - -- I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ec2cc70d1a352..9217b388ce86b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -243,11 +243,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, result.sortorder = sortorder if verify_integrity: - result._verify_integrity() + new_codes = result._verify_integrity() + result._codes = new_codes + if _set_identity: result._reset_identity() + return result + def _validate_codes(self, level: list, code: list): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). + + Returns + ------- + code : new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). + """ + null_mask = isna(level) + if np.any(null_mask): + code = np.where(null_mask[code], -1, code) + return code + def _verify_integrity(self, codes=None, levels=None): """ @@ -263,6 +287,11 @@ def _verify_integrity(self, codes=None, levels=None): ValueError If length of levels and codes don't match, if the codes for any level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + codes : new codes where code value = -1 if it corresponds to a + NaN level. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -272,22 +301,33 @@ def _verify_integrity(self, codes=None, levels=None): if len(levels) != len(codes): raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - codes_length = len(self.codes[0]) + codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError("Unequal code lengths: %s" % ([len(code_) for code_ in codes])) if len(level_codes) and level_codes.max() >= len(level): - raise ValueError("On level %d, code max (%d) >= length of" - " level (%d). NOTE: this index is in an" - " inconsistent state" % (i, level_codes.max(), - len(level))) + msg = ("On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). 
NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) + raise ValueError(msg) + if len(level_codes) and level_codes.min() < -1: + raise ValueError("On level {level}, code value ({code})" + " < -1".format( + level=i, code=level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( values=[value for value in level], level=i)) + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + new_codes = FrozenList(codes) + return new_codes + @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ @@ -586,7 +626,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, new_levels = FrozenList(new_levels) if verify_integrity: - self._verify_integrity(levels=new_levels) + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes names = self.names self._levels = new_levels @@ -676,7 +717,6 @@ def labels(self): def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): - if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): @@ -696,9 +736,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(codes=new_codes) + new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes + self._tuples = None self._reset_cache() @@ -1763,9 +1804,10 @@ def __setstate__(self, state): self._set_levels([Index(x) for x in levels], validate=False) self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) self._set_names(names) self.sortorder = sortorder - self._verify_integrity() self._reset_identity() def __getitem__(self, key): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 37290bc6eb1c0..7cab05660ac49 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -63,9 +63,10 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." + length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. 
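    # (editorial annotation, not part of the original test) the three match
    # patterns above guard distinct failure modes: a code value pointing past
    # the end of its level (length_error), code arrays of unequal length
    # across levels (label_error), and the new check for code values below -1
    # (code_value_error).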
    with pytest.raises(ValueError, match=length_error):
@@ -82,6 +83,44 @@ def test_constructor_mismatched_codes_levels(idx):
     with pytest.raises(ValueError, match=label_error):
         idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])

+    # test set_codes with verify_integrity=False
+    # the setting should not raise any value error
+    idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]],
+                         verify_integrity=False)
+
+    # code value smaller than -1
+    with pytest.raises(ValueError, match=code_value_error):
+        MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]])
+
+
+def test_na_levels():
+    # GH26408
+    # test if codes are re-assigned value -1 for levels
+    # with missing values (NaN, NaT, None)
+    result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
+                        codes=[[0, -1, 1, 2, 3, 4]])
+    expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
+                          codes=[[-1, -1, -1, -1, 3, 4]])
+    tm.assert_index_equal(result, expected)
+
+    result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
+                        codes=[[0, -1, 1, 2, 3, 4]])
+    expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
+                          codes=[[-1, -1, 1, -1, 3, -1]])
+    tm.assert_index_equal(result, expected)
+
+    # verify set_levels and set_codes
+    result = MultiIndex(
+        levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels(
+        [[np.nan, 's', pd.NaT, 128, None]])
+    tm.assert_index_equal(result, expected)
+
+    result = MultiIndex(
+        levels=[[np.nan, 's', pd.NaT, 128, None]],
+        codes=[[1, 2, 2, 2, 2, 2]]).set_codes(
+        [[0, -1, 1, 2, 3, 4]])
+    tm.assert_index_equal(result, expected)
+

 def test_labels_deprecated(idx):
     # GH23752
diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py
index ed90f74d80989..518c12bb20e13 100644
--- a/pandas/tests/indexes/multi/test_missing.py
+++ b/pandas/tests/indexes/multi/test_missing.py
@@ -73,6 +73,21 @@ def test_dropna():
     with pytest.raises(ValueError, match=msg):
         idx.dropna(how='xxx')

+    # GH26408
+    # test if missing values are dropped for a MultiIndex constructed
+    # from codes and values
+    idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2],
+                             [np.nan, None, pd.NaT, "128", 2]],
+                     codes=[[0, -1, 1, 2, 3, 4],
+                            [0, -1, 3, 3, 3, 4]])
+    expected = MultiIndex.from_arrays([["128", 2], ["128", 2]])
+    tm.assert_index_equal(idx.dropna(), expected)
+    tm.assert_index_equal(idx.dropna(how='any'), expected)
+
+    expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2],
+                                       ["128", "128", "128", 2]])
+    tm.assert_index_equal(idx.dropna(how='all'), expected)
+

 def test_nulls(idx):
     # this is really a smoke test for the methods

From a2f9013efc251d64878123fb81e29f73a21e1fc3 Mon Sep 17 00:00:00 2001
From: h-vetinari <33685575+h-vetinari@users.noreply.github.com>
Date: Sat, 1 Jun 2019 17:03:06 +0200
Subject: [PATCH 14/43] API: Series.str-accessor infers dtype (and Index.str
 does not raise on all-NA) (#23167)

---
 doc/source/user_guide/text.rst  |  10 ++
 doc/source/whatsnew/v0.25.0.rst |  40 +++++-
 pandas/core/strings.py          | 214 +++++++++++++++++++++++++-------
 pandas/tests/test_strings.py    |  48 +++----
 4 files changed, 233 insertions(+), 79 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index f7fdfcf8bf882..87c75e8bcd91f 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -70,6 +70,16 @@ and replacing any remaining whitespaces with underscores:
     ``.str`` methods which operate on elements of type ``list`` are not available on such a
     ``Series``.

+.. _text.warn_types:
+
+.. warning::
+
+    Before v0.25.0, the ``.str``-accessor performed only the most rudimentary type checks. Starting with
+    v0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously.
+
+    Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few
+    exceptions, other uses are not supported, and may be disabled at a later point.
+

 Splitting and Replacing Strings
 -------------------------------
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 3275223b159f8..87a8010998bd0 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -231,6 +231,43 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi
 Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will
 cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before.

+The ``.str``-accessor performs stricter type checks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Due to the lack of more fine-grained dtypes, :attr:`Series.str` so far only checked whether the data was
+of ``object`` dtype. :attr:`Series.str` will now infer the dtype of the data *within* the Series; in particular,
+``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`,
+:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`.
+
+*Previous Behaviour*:
+
+.. code-block:: python
+
+   In [1]: s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object)
+
+   In [2]: s
+   Out[2]:
+   0      b'a'
+   1     b'ba'
+   2    b'cba'
+   dtype: object
+
+   In [3]: s.str.startswith(b'a')
+   Out[3]:
+   0     True
+   1    False
+   2    False
+   dtype: bool
+
+*New Behaviour*:
+
+.. ipython:: python
+   :okexcept:
+
+   s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object)
+   s
+   s.str.startswith(b'a')
+
 .. _whatsnew_0250.api_breaking.incompatible_index_unions:

 Incompatible Index Type Unions
@@ -331,7 +368,6 @@ This change is backward compatible for direct usage of Pandas, but if you subcla
 Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods,
 you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`).

-
 .. _whatsnew_0250.api_breaking.deps:

 Increased minimum versions for dependencies
@@ -537,7 +573,7 @@ Conversion
 Strings
 ^^^^^^^

--
+- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
 -
 -

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index ee3796241690d..bd756491abd2f 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1,4 +1,5 @@
 import codecs
+from functools import wraps
 import re
 import textwrap
 from typing import Dict
@@ -12,8 +13,8 @@
 from pandas.core.dtypes.common import (
     ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
-    is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
-from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+    is_list_like, is_re, is_scalar, is_string_like)
+from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries
 from pandas.core.dtypes.missing import isna

 from pandas.core.algorithms import take_1d
@@ -1720,12 +1721,78 @@ def str_encode(arr, encoding, errors="strict"):
     return _na_map(f, arr)


-def _noarg_wrapper(f, docstring=None, **kargs):
+def forbid_nonstring_types(forbidden, name=None):
+    """
+    Decorator to forbid specific types for a method of StringMethods. 
+ + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {'string', 'empty', 'bytes', + 'mixed', 'mixed-integer'} - set(forbidden) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ('Cannot use .str.{name} with values of inferred dtype ' + '{inf_type!r}.'.format(name=func_name, + inf_type=self._inferred_dtype)) + raise TypeError(msg) + return func(self, *args, **kwargs) + wrapper.__name__ = func_name + return wrapper + return _forbid_nonstring_types + + +def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'], + **kargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) return self._wrap_result(result) - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if docstring is not None: wrapper.__doc__ = docstring else: @@ -1734,22 +1801,26 @@ def wrapper(self): return wrapper -def _pat_wrapper(f, flags=False, na=False, **kwargs): +def _pat_wrapper(f, flags=False, na=False, name=None, + forbidden_types=['bytes'], **kwargs): + @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper2(self, pat, flags=0, **kwargs): result = f(self._parent, pat, flags=flags, **kwargs) return self._wrap_result(result) + @forbid_nonstring_types(forbidden_types, name=name) def wrapper3(self, pat, na=np.nan): result = f(self._parent, pat, na=na) return self._wrap_result(result) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - wrapper.__name__ = f.__name__ + wrapper.__name__ = f.__name__ if name is None else name if f.__doc__: wrapper.__doc__ = f.__doc__ 
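# A minimal sketch (editorial addition, not part of the patch) of the
# decorator mechanics defined above. `ToyAccessor` is a hypothetical
# stand-in for StringMethods, which sets `_inferred_dtype` in its
# constructor; the wrapper only ever reads that attribute, so the toy
# class is enough to exercise the check. Assumes `forbid_nonstring_types`
# from above is in scope.

class ToyAccessor:
    def __init__(self, inferred_dtype):
        self._inferred_dtype = inferred_dtype

    @forbid_nonstring_types(['bytes'])
    def upper(self):
        return 'would dispatch to str.upper here'

ToyAccessor('string').upper()  # passes the dtype check, runs the body
try:
    ToyAccessor('bytes').upper()
except TypeError as err:
    # Cannot use .str.upper with values of inferred dtype 'bytes'.
    print(err)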
@@ -1780,7 +1851,7 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): - self._validate(data) + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) # .values.categories works for both Series/Index @@ -1791,38 +1862,44 @@ def __init__(self, data): @staticmethod def _validate(data): - from pandas.core.index import Index - - if (isinstance(data, ABCSeries) and - not ((is_categorical_dtype(data.dtype) and - is_object_dtype(data.values.categories)) or - (is_object_dtype(data.dtype)))): - # it's neither a string series not a categorical series with - # strings inside the categories. - # this really should exclude all series with any non-string values - # (instead of test for object dtype), but that isn't practical for - # performance reasons until we have a str dtype (GH 9343) + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). + + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + if isinstance(data, ABCMultiIndex): + raise AttributeError('Can only use .str accessor with Index, ' + 'not MultiIndex') + + # see _libs/lib.pyx for list of inferred types + allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] + + values = getattr(data, 'values', data) # Series / Index + values = getattr(values, 'categories', values) # categorical / normal + + # missing values obfuscate type inference -> skip + inferred_dtype = lib.infer_dtype(values, skipna=True) + + if inferred_dtype not in allowed_types: raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - elif isinstance(data, Index): - # can't use ABCIndex to exclude non-str - - # see src/inference.pyx which can contain string values - allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if is_categorical_dtype(data.dtype): - inf_type = data.categories.inferred_type - else: - inf_type = data.inferred_type - if inf_type not in allowed_types: - message = ("Can only use .str accessor with string values " - "(i.e. 
inferred_type is 'string', 'unicode' or " - "'mixed')") - raise AttributeError(message) - if data.nlevels > 1: - message = ("Can only use .str accessor with Index, not " - "MultiIndex") - raise AttributeError(message) + "values!") + return inferred_dtype def __getitem__(self, key): if isinstance(key, slice): @@ -2025,12 +2102,13 @@ def _get_series_list(self, others, ignore_index=False): warnings.warn('list-likes other than Series, Index, or ' 'np.ndarray WITHIN another list-like are ' 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=3) + 'version.', FutureWarning, stacklevel=4) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) + @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2211,7 +2289,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "Index/DataFrame in `others`. To enable alignment " "and silence this warning, pass `join='left'|" "'outer'|'inner'|'right'`. The future default will " - "be `join='left'`.", FutureWarning, stacklevel=2) + "be `join='left'`.", FutureWarning, stacklevel=3) # if join is None, _get_series_list already force-aligned indexes join = 'left' if join is None else join @@ -2384,6 +2462,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): @Appender(_shared_docs['str_split'] % { 'side': 'beginning', 'method': 'split'}) + @forbid_nonstring_types(['bytes']) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2391,6 +2470,7 @@ def split(self, pat=None, n=-1, expand=False): @Appender(_shared_docs['str_split'] % { 'side': 'end', 'method': 'rsplit'}) + @forbid_nonstring_types(['bytes']) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2485,6 +2565,7 @@ def rsplit(self, pat=None, n=-1, expand=False): '`sep`.' }) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def partition(self, sep=' ', expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) @@ -2498,6 +2579,7 @@ def partition(self, sep=' ', expand=True): '`sep`.' 
}) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def rpartition(self, sep=' ', expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) @@ -2509,33 +2591,39 @@ def get(self, i): return self._wrap_result(result) @copy(str_join) + @forbid_nonstring_types(['bytes']) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) + @forbid_nonstring_types(['bytes']) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result, fill_value=na) @copy(str_match) + @forbid_nonstring_types(['bytes']) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) + @forbid_nonstring_types(['bytes']) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) + @forbid_nonstring_types(['bytes']) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) + @forbid_nonstring_types(['bytes']) def pad(self, width, side='left', fillchar=' '): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) @@ -2559,17 +2647,21 @@ def pad(self, width, side='left', fillchar=' '): @Appender(_shared_docs['str_pad'] % dict(side='left and right', method='center')) + @forbid_nonstring_types(['bytes']) def center(self, width, fillchar=' '): return self.pad(width, side='both', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) + @forbid_nonstring_types(['bytes']) def ljust(self, width, fillchar=' '): return self.pad(width, side='right', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust')) + @forbid_nonstring_types(['bytes']) def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) + @forbid_nonstring_types(['bytes']) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. 
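# A minimal sketch (editorial addition, not part of the patch) of where the
# inferred dtype checked by the decorated methods above comes from:
# `StringMethods._validate` calls `lib.infer_dtype` with `skipna=True`, so
# missing values do not obscure an otherwise all-string column.

import numpy as np
from pandas._libs import lib

lib.infer_dtype(np.array(['a', np.nan, 'b'], dtype=object), skipna=True)
# -> 'string'
lib.infer_dtype(np.array([b'a', b'b'], dtype=object), skipna=True)
# -> 'bytes'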
@@ -2639,16 +2731,19 @@ def slice(self, start=None, stop=None, step=None): return self._wrap_result(result) @copy(str_slice_replace) + @forbid_nonstring_types(['bytes']) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @copy(str_decode) def decode(self, encoding, errors="strict"): + # need to allow bytes here result = str_decode(self._parent, encoding, errors) return self._wrap_result(result) @copy(str_encode) + @forbid_nonstring_types(['bytes']) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) @@ -2718,28 +2813,33 @@ def encode(self, encoding, errors="strict"): @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', method='strip')) + @forbid_nonstring_types(['bytes']) def strip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='both') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='left side', method='lstrip')) + @forbid_nonstring_types(['bytes']) def lstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='left') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='right side', method='rstrip')) + @forbid_nonstring_types(['bytes']) def rstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) + @forbid_nonstring_types(['bytes']) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) @copy(str_get_dummies) + @forbid_nonstring_types(['bytes']) def get_dummies(self, sep='|'): # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
@@ -2749,20 +2849,23 @@ def get_dummies(self, sep='|'): name=name, expand=True) @copy(str_translate) + @forbid_nonstring_types(['bytes']) def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True) - startswith = _pat_wrapper(str_startswith, na=True) - endswith = _pat_wrapper(str_endswith, na=True) - findall = _pat_wrapper(str_findall, flags=True) + count = _pat_wrapper(str_count, flags=True, name='count') + startswith = _pat_wrapper(str_startswith, na=True, name='startswith') + endswith = _pat_wrapper(str_endswith, na=True, name='endswith') + findall = _pat_wrapper(str_findall, flags=True, name='findall') @copy(str_extract) + @forbid_nonstring_types(['bytes']) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) + @forbid_nonstring_types(['bytes']) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) @@ -2792,6 +2895,7 @@ def extractall(self, pat, flags=0): @Appender(_shared_docs['find'] % dict(side='lowest', method='find', also='rfind : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def find(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='left') return self._wrap_result(result) @@ -2799,11 +2903,13 @@ def find(self, sub, start=0, end=None): @Appender(_shared_docs['find'] % dict(side='highest', method='rfind', also='find : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rfind(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='right') return self._wrap_result(result) + @forbid_nonstring_types(['bytes']) def normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. @@ -2851,6 +2957,7 @@ def normalize(self, form): @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', also='rindex : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def index(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='left') @@ -2859,6 +2966,7 @@ def index(self, sub, start=0, end=None): @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', also='index : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rindex(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='right') @@ -2908,7 +3016,8 @@ def rindex(self, sub, start=0, end=None): 5 3.0 dtype: float64 """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int) + len = _noarg_wrapper(len, docstring=_shared_docs['len'], + forbidden_types=None, dtype=int) _shared_docs['casemethods'] = (""" Convert strings in the Series/Index to %(type)s. @@ -2989,21 +3098,27 @@ def rindex(self, sub, start=0, end=None): _doc_args['casefold'] = dict(type='be casefolded', method='casefold', version='\n .. 
versionadded:: 0.25.0\n') lower = _noarg_wrapper(lambda x: x.lower(), + name='lower', docstring=_shared_docs['casemethods'] % _doc_args['lower']) upper = _noarg_wrapper(lambda x: x.upper(), + name='upper', docstring=_shared_docs['casemethods'] % _doc_args['upper']) title = _noarg_wrapper(lambda x: x.title(), + name='title', docstring=_shared_docs['casemethods'] % _doc_args['title']) capitalize = _noarg_wrapper(lambda x: x.capitalize(), + name='capitalize', docstring=_shared_docs['casemethods'] % _doc_args['capitalize']) swapcase = _noarg_wrapper(lambda x: x.swapcase(), + name='swapcase', docstring=_shared_docs['casemethods'] % _doc_args['swapcase']) casefold = _noarg_wrapper(lambda x: x.casefold(), + name='casefold', docstring=_shared_docs['casemethods'] % _doc_args['casefold']) @@ -3157,30 +3272,39 @@ def rindex(self, sub, start=0, end=None): _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric') _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal') isalnum = _noarg_wrapper(lambda x: x.isalnum(), + name='isalnum', docstring=_shared_docs['ismethods'] % _doc_args['isalnum']) isalpha = _noarg_wrapper(lambda x: x.isalpha(), + name='isalpha', docstring=_shared_docs['ismethods'] % _doc_args['isalpha']) isdigit = _noarg_wrapper(lambda x: x.isdigit(), + name='isdigit', docstring=_shared_docs['ismethods'] % _doc_args['isdigit']) isspace = _noarg_wrapper(lambda x: x.isspace(), + name='isspace', docstring=_shared_docs['ismethods'] % _doc_args['isspace']) islower = _noarg_wrapper(lambda x: x.islower(), + name='islower', docstring=_shared_docs['ismethods'] % _doc_args['islower']) isupper = _noarg_wrapper(lambda x: x.isupper(), + name='isupper', docstring=_shared_docs['ismethods'] % _doc_args['isupper']) istitle = _noarg_wrapper(lambda x: x.istitle(), + name='istitle', docstring=_shared_docs['ismethods'] % _doc_args['istitle']) isnumeric = _noarg_wrapper(lambda x: x.isnumeric(), + name='isnumeric', docstring=_shared_docs['ismethods'] % _doc_args['isnumeric']) isdecimal = _noarg_wrapper(lambda x: x.isdecimal(), + name='isdecimal', docstring=_shared_docs['ismethods'] % _doc_args['isdecimal']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2951ca24fa7ff..1ba0ef3918fb7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -150,6 +150,9 @@ def any_allowed_skipna_inferred_dtype(request): ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
pd.Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -179,20 +182,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): pytest.xfail(reason='Conversion to numpy array fails because ' 'the ._values-attribute is not a numpy array for ' 'PeriodArray/IntervalArray; see GH 23553') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') - if (box == Series - and (dtype == object and inferred_dtype not in [ - 'string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer']) - or (dtype == 'category' - and inferred_dtype in ['decimal', 'boolean', 'time'])): - pytest.xfail(reason='Not raising correctly; solved by GH 23167') types_passing_constructor = ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer'] @@ -220,27 +209,21 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name not in ['encode', 'decode', 'len'] - and inferred_dtype == 'bytes'): - pytest.xfail(reason='Not raising for "bytes", see GH 23011;' - 'Also: malformed method names, see GH 23551; ' - 'solved by GH 23167') - if (method_name == 'cat' - and inferred_dtype in ['mixed', 'mixed-integer']): - pytest.xfail(reason='Bad error message; should raise better; ' - 'solved by GH 23167') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') + if (method_name in ['partition', 'rpartition'] and box == Index + and inferred_dtype == 'empty'): + pytest.xfail(reason='Method cannot deal with empty Index') + if (method_name == 'split' and box == Index and values.size == 0 + and kwargs.get('expand', None) is not None): + pytest.xfail(reason='Split fails on empty Series when expand=True') + if (method_name == 'get_dummies' and box == Index + and inferred_dtype == 'empty' and (dtype == object + or values.size == 0)): + pytest.xfail(reason='Need to fortify get_dummies corner cases') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['encode', 'decode', 'len'] + bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. 
# This could be changed with an 'errors'-kwarg to the `str`-accessor, @@ -3167,7 +3150,8 @@ def test_str_accessor_no_new_attributes(self): def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) - with pytest.raises(TypeError, match="can't concat str to bytes"): + with pytest.raises(TypeError, + match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self): From 4cd348bbe97f342787c0f3be2370e57695badd0f Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 1 Jun 2019 16:34:57 +0100 Subject: [PATCH 15/43] Changing dev docs ssh key (#26604) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9f83917024049..0064d0a932960 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -161,7 +161,7 @@ jobs: - task: InstallSSHKey@0 inputs: hostName: 'github.com' - sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org' + sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDfF0BSddjvZx/z4/2TXsy+RxjwBpgdHkmjtL9WfRHxEw1TchBuEj5vWWcxBNTK+9oVzD/Lca89HAXXrklsfkdAK3LvLfGCxTGpP8t/3CxxFdnSg3EN+4cDGKuDlbeTyzdASdPBOq0GTZjUFekl9ZfFrFJ9SoPpqZ4mmPRPapPrkwTs4xIrBly0eWcISFYgZcG58m65+XQpyyBMbpsO5ZHBBxE8kkWN0yY+gKt5PeeIO82xE+7F+3Qhlc67fKfB4FEitQ5SKrbKyGNNdFtEGcC6CEtD0B0vJxssltQEl5dDWPJP6tH4cIm/J6m28mpSYc5fEBhr75jE4Ybw6NtGgBZEdtFRFlnb91mSiVSjM/HEkV7/xYai+H1Gk+I/8tcl8cf3JCiJSP2glz8bp52+i5it29FUL8ITxdJSo0duUkVm3nZ8cDI6zag+nSSmzdZ1I9Fw7M7RRPHM2zd5+6RskeqamR5lY3Iv+t8Yo8cRX10IiHNF89b+3vI5ZkIKqytrPfrY45jGVMXA6x/whMh94Ac94qm+Do7P3eT/66a1lX0r+UfV6UnfwHE6cZ1ZFX2AzlmSiYMKmTD3hn1GNyHHuvk3Mneanbk4+x+8SjAXIK354zJ8c1Qgk1iEicDvna2IBd94R4tBWjYZ8xH7avmPlhs0HwbjiNOFDc45UXvwIl+D7w== pandas-dev@python.org' sshKeySecureFile: 'pandas_docs_key' displayName: 'Install GitHub ssh deployment key' condition : | From ad7c9e9580cc1e5e18ce0f6b68ec952fbddbb71e Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 1 Jun 2019 17:46:56 +0100 Subject: [PATCH 16/43] CI: Removing doc build in azure (#26609) --- azure-pipelines.yml | 60 --------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0064d0a932960..85325c52e7e6d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -116,63 +116,3 @@ jobs: fi displayName: 'Running benchmarks' condition: true - -- job: 'Docs' - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 90 - steps: - - script: | - echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev' - echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' - displayName: 'Setting environment variables' - - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - sudo apt-get install -y libc6-dev-i386 - ci/setup_env.sh - displayName: 'Setup environment 
and build pandas' - - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - source activate pandas-dev - doc/make.py - displayName: 'Build documentation' - - - script: | - cd doc/build/html - git init - touch .nojekyll - git add --all . - git config user.email "pandas-dev@python.org" - git config user.name "pandas-docs-bot" - git commit -m "pandas documentation in master" - displayName: 'Create git repo for docs build' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - # This task to work requires next steps: - # 1. Got to "Library > Secure files" in the azure-pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles - # 2. Click on "+ Secure file" - # 3. Upload the private key (the name of the file must match with the specified in "sshKeySecureFile" input below, "pandas_docs_key") - # 4. Click on file name after it is created, tick the box "Authorize for use in all pipelines" and save - # 5. The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be specified as a deploy key of the repo where the docs will be pushed: https://github.com/pandas-dev/pandas-dev.github.io/settings/keys - - task: InstallSSHKey@0 - inputs: - hostName: 'github.com' - sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDfF0BSddjvZx/z4/2TXsy+RxjwBpgdHkmjtL9WfRHxEw1TchBuEj5vWWcxBNTK+9oVzD/Lca89HAXXrklsfkdAK3LvLfGCxTGpP8t/3CxxFdnSg3EN+4cDGKuDlbeTyzdASdPBOq0GTZjUFekl9ZfFrFJ9SoPpqZ4mmPRPapPrkwTs4xIrBly0eWcISFYgZcG58m65+XQpyyBMbpsO5ZHBBxE8kkWN0yY+gKt5PeeIO82xE+7F+3Qhlc67fKfB4FEitQ5SKrbKyGNNdFtEGcC6CEtD0B0vJxssltQEl5dDWPJP6tH4cIm/J6m28mpSYc5fEBhr75jE4Ybw6NtGgBZEdtFRFlnb91mSiVSjM/HEkV7/xYai+H1Gk+I/8tcl8cf3JCiJSP2glz8bp52+i5it29FUL8ITxdJSo0duUkVm3nZ8cDI6zag+nSSmzdZ1I9Fw7M7RRPHM2zd5+6RskeqamR5lY3Iv+t8Yo8cRX10IiHNF89b+3vI5ZkIKqytrPfrY45jGVMXA6x/whMh94Ac94qm+Do7P3eT/66a1lX0r+UfV6UnfwHE6cZ1ZFX2AzlmSiYMKmTD3hn1GNyHHuvk3Mneanbk4+x+8SjAXIK354zJ8c1Qgk1iEicDvna2IBd94R4tBWjYZ8xH7avmPlhs0HwbjiNOFDc45UXvwIl+D7w== pandas-dev@python.org' - sshKeySecureFile: 'pandas_docs_key' - displayName: 'Install GitHub ssh deployment key' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - - script: | - cd doc/build/html - git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git - git push origin master -f - displayName: 'Publish docs to GitHub pages' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) From 68c6766110b918bf6d75d0b5895e2731b14ca610 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 1 Jun 2019 17:03:58 +0000 Subject: [PATCH 17/43] PERF: don't call RangeIndex._data unnecessarily (#26565) --- asv_bench/benchmarks/index_object.py | 6 +++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/range.py | 32 +++++++++++++++++++++++-- pandas/tests/indexes/test_range.py | 36 ++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 896a20bae2069..78fe2ae966896 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -94,6 +94,12 @@ def time_min(self): def time_min_trivial(self): self.idx_inc.min() + def time_get_loc_inc(self): + self.idx_inc.get_loc(900000) + + def time_get_loc_dec(self): + 
+        self.idx_dec.get_loc(100000)
+

 class IndexAppend:

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 87a8010998bd0..1619ba1a45739 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -493,6 +493,7 @@ Performance Improvements

 - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
   int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
 - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
+- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
 - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
 - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
 - Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index ea14a4c789cd3..9401de3346ccd 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -22,6 +22,8 @@
 from pandas.core.indexes.base import Index, _index_shared_docs
 from pandas.core.indexes.numeric import Int64Index

+from pandas.io.formats.printing import pprint_thing
+

 class RangeIndex(Int64Index):
     """
@@ -64,6 +66,8 @@ class RangeIndex(Int64Index):

     _typ = 'rangeindex'
     _engine_type = libindex.Int64Engine
+    # check whether self._data has been called
+    _cached_data = None  # type: np.ndarray

     # --------------------------------------------------------------------
     # Constructors
@@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None,
         for k, v in kwargs.items():
             setattr(result, k, v)

+        result._range = range(result._start, result._stop, result._step)
+
         result._reset_identity()
         return result

@@ -180,9 +186,19 @@ def _constructor(self):
         """ return the class to use for construction """
         return Int64Index

-    @cache_readonly
+    @property
     def _data(self):
-        return np.arange(self._start, self._stop, self._step, dtype=np.int64)
+        """
+        An int array that for performance reasons is created only when needed.
+
+        The constructed array is saved in ``_cached_data``. This allows us to
+        check if the array has been created without accessing ``_data`` and
+        triggering the construction. 
+ """ + if self._cached_data is None: + self._cached_data = np.arange(self._start, self._stop, self._step, + dtype=np.int64) + return self._cached_data @cache_readonly def _int64index(self): @@ -215,6 +231,9 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header, na_rep='NaN', **kwargs): + return header + list(map(pprint_thing, self._range)) + # -------------------------------------------------------------------- @property def start(self): @@ -296,6 +315,15 @@ def is_monotonic_decreasing(self): def has_duplicates(self): return False + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if is_integer(key) and method is None and tolerance is None: + try: + return self._range.index(key) + except ValueError: + raise KeyError(key) + return super().get_loc(key, method=method, tolerance=tolerance) + def tolist(self): return list(range(self._start, self._stop, self._step)) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b2c330015081c..477a4e527f278 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -241,6 +241,42 @@ def test_view(self): def test_dtype(self): assert self.index.dtype == np.int64 + def test_cached_data(self): + # GH 26565 + # Calling RangeIndex._data caches an int64 array of the same length at + # self._cached_data. This tests whether _cached_data has been set. + idx = RangeIndex(0, 100, 10) + + assert idx._cached_data is None + + repr(idx) + assert idx._cached_data is None + + str(idx) + assert idx._cached_data is None + + idx.get_loc(20) + assert idx._cached_data is None + + df = pd.DataFrame({'a': range(10)}, index=idx) + + df.loc[50] + assert idx._cached_data is None + + with pytest.raises(KeyError): + df.loc[51] + assert idx._cached_data is None + + df.loc[10:50] + assert idx._cached_data is None + + df.iloc[5:10] + assert idx._cached_data is None + + # actually calling data._data + assert isinstance(idx._data, np.ndarray) + assert isinstance(idx._cached_data, np.ndarray) + def test_is_monotonic(self): assert self.index.is_monotonic is True assert self.index.is_monotonic_increasing is True From 1f837331e57f119f9471758f4c8fecaa1e7dc16e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 2 Jun 2019 12:47:34 +0100 Subject: [PATCH 18/43] CI: pin pytest version on Python 3.5 (#26619) --- ci/deps/azure-35-compat.yaml | 2 +- ci/deps/azure-macos-35.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml index d0a48bd3f8b27..e55a4fbdf3fa9 100644 --- a/ci/deps/azure-35-compat.yaml +++ b/ci/deps/azure-35-compat.yaml @@ -26,5 +26,5 @@ dependencies: - pip - pip: # for python 3.5, pytest>=4.0.2 is not available in conda - - pytest>=4.0.2 + - pytest==4.5.0 - html5lib==1.0b2 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 591266348a5f1..00c2051f29760 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -25,7 +25,7 @@ dependencies: - pip: - python-dateutil==2.5.3 # universal - - pytest>=4.0.2 + - pytest==4.5.0 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 From 6fb0be001fce70d9e87ba571a97b55d273d76f4a Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Sun, 2 Jun 2019 17:09:44 -0400 Subject: [PATCH 19/43] remove outdated gtk package from code (#26590) --- doc/source/install.rst | 1 - doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + 
pandas/core/generic.py | 2 +- pandas/io/clipboard/__init__.py | 21 +++++---------------- pandas/io/clipboard/clipboards.py | 16 ---------------- pandas/io/clipboards.py | 2 +- 7 files changed, 9 insertions(+), 36 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index b3b5945cc515e..98443ede2e965 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -281,7 +281,6 @@ Optional Dependencies `qtpy `__ (requires PyQt or PySide), `PyQt5 `__, `PyQt4 `__, - `pygtk `__, `xsel `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 88d8ccbbe036e..4aacb6fa1e278 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3272,7 +3272,7 @@ We can see that we got the same content back, which we had earlier written to th .. note:: - You may need to install xclip or xsel (with gtk, PyQt5, PyQt4 or qtpy) on Linux to use these methods. + You may need to install xclip or xsel (with PyQt5, PyQt4 or qtpy) on Linux to use these methods. .. _io.pickle: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1619ba1a45739..f122c73325b7d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -434,6 +434,7 @@ Other API Changes - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- Removed support of gtk package for clipboards (:issue:`26563`) .. _whatsnew_0250.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7ca2c52e18c41..33b0035e74913 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2679,7 +2679,7 @@ def to_clipboard(self, excel=True, sep=None, **kwargs): ----- Requirements for your platform. - - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules) + - Linux : `xclip`, or `xsel` (with `PyQt4` modules) - Windows : none - OS X : none diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index b76a843e3e7f2..2063978c76c5a 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -18,21 +18,19 @@ On Linux, install xclip or xsel via package manager. For example, in Debian: sudo apt-get install xclip -Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed. +Otherwise on Linux, you will need the qtpy or PyQt modules installed. qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2 -gtk and PyQt4 modules are not available for Python 3, -and this module does not work with PyGObject yet. +This module does not work with PyGObject yet. 
""" __version__ = '1.5.27' import platform import os import subprocess -from .clipboards import (init_osx_clipboard, - init_gtk_clipboard, init_qt_clipboard, - init_xclip_clipboard, init_xsel_clipboard, - init_klipper_clipboard, init_no_clipboard) +from .clipboards import ( + init_osx_clipboard, init_qt_clipboard, init_xclip_clipboard, + init_xsel_clipboard, init_klipper_clipboard, init_no_clipboard) from .windows import init_windows_clipboard # `import qtpy` sys.exit()s if DISPLAY is not in the environment. @@ -60,14 +58,6 @@ def determine_clipboard(): return init_osx_clipboard() if HAS_DISPLAY: # Determine which command/module is installed, if any. - try: - # Check if gtk is installed - import gtk # noqa - except ImportError: - pass - else: - return init_gtk_clipboard() - try: # qtpy is a small abstraction layer that lets you write # applications using a single api call to either PyQt or PySide @@ -104,7 +94,6 @@ def set_clipboard(clipboard): global copy, paste clipboard_types = {'osx': init_osx_clipboard, - 'gtk': init_gtk_clipboard, 'qt': init_qt_clipboard, 'xclip': init_xclip_clipboard, 'xsel': init_xsel_clipboard, diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index 66e2e35bf0c59..52abdeafb5ecc 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -22,22 +22,6 @@ def paste_osx(): return copy_osx, paste_osx -def init_gtk_clipboard(): - import gtk - - def copy_gtk(text): - global cb - cb = gtk.Clipboard() - cb.set_text(text) - cb.store() - - def paste_gtk(): - clipboardContents = gtk.Clipboard().wait_for_text() - return clipboardContents - - return copy_gtk, paste_gtk - - def init_qt_clipboard(): # $DISPLAY should exist diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index be1256edf7afe..dc30285895dd5 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -91,7 +91,7 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover Notes ----- Requirements for your platform - - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Linux: xclip, or xsel (with PyQt4 modules) - Windows: - OS X: """ From a6ad17dde640e026eddadbe3551e15d4e25961ee Mon Sep 17 00:00:00 2001 From: iamshwin <23633545+iamshwin@users.noreply.github.com> Date: Mon, 3 Jun 2019 00:11:48 +0100 Subject: [PATCH 20/43] Tidy documentation about plotting Series histograms (#26624) --- pandas/plotting/_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index fed4b0d90983c..3f6a30c4639bc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2477,8 +2477,6 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. 
- bins : integer, default 10 - Number of histogram bins to be used `**kwds` : keywords To be passed to the actual plotting function From 3a5619531e44f07b5e7a58858e79432b64e0f29d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 3 Jun 2019 00:13:08 +0100 Subject: [PATCH 21/43] TST/CLN: deduplicate fixture from test_to_latex.py (#26603) --- pandas/conftest.py | 31 ++++++++++++++++++++++++ pandas/tests/frame/conftest.py | 29 ---------------------- pandas/tests/io/formats/test_to_latex.py | 27 +++++++++------------ 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 8f71028f51ab4..09fe8e0829fa1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -12,6 +12,8 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import DataFrame +import pandas.util.testing as tm hypothesis.settings.register_profile( "ci", @@ -690,3 +692,32 @@ def tick_classes(request): normalize=st.booleans(), startingMonth=st.integers(min_value=1, max_value=12) )) + + +@pytest.fixture +def float_frame(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getSeriesData()) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index c451cd58f1497..d8a590bc492a4 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -5,35 +5,6 @@ import pandas.util.testing as tm -@pytest.fixture -def float_frame(): - """ - Fixture for DataFrame of floats with index of unique strings - - Columns are ['A', 'B', 'C', 'D']. - - A B C D - P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 - qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 - tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 - wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 - M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 - QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 - r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 - ... ... ... ... ... 
- IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 - lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 - qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 - yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 - 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 - eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 - xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getSeriesData()) - - @pytest.fixture def float_frame_with_na(): """ diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 5a6511fbd20ee..b9f28ec36d021 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -8,19 +8,14 @@ from pandas.util import testing as tm -@pytest.fixture -def frame(): - return DataFrame(tm.getSeriesData()) - - class TestToLatex: - def test_to_latex_filename(self, frame): + def test_to_latex_filename(self, float_frame): with tm.ensure_clean('test.tex') as path: - frame.to_latex(path) + float_frame.to_latex(path) with open(path, 'r') as f: - assert frame.to_latex() == f.read() + assert float_frame.to_latex() == f.read() # test with utf-8 and encoding option (GH 7061) df = DataFrame([['au\xdfgangen']]) @@ -35,9 +30,9 @@ def test_to_latex_filename(self, frame): with codecs.open(path, 'r', encoding='utf-8') as f: assert df.to_latex() == f.read() - def test_to_latex(self, frame): + def test_to_latex(self, float_frame): # it works! - frame.to_latex() + float_frame.to_latex() df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex() @@ -66,9 +61,9 @@ def test_to_latex(self, frame): assert withoutindex_result == withoutindex_expected - def test_to_latex_format(self, frame): + def test_to_latex_format(self, float_frame): # GH Bug #9402 - frame.to_latex(column_format='ccc') + float_frame.to_latex(column_format='ccc') df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(column_format='ccc') @@ -389,8 +384,8 @@ def test_to_latex_special_escape(self): """ assert escaped_result == escaped_expected - def test_to_latex_longtable(self, frame): - frame.to_latex(longtable=True) + def test_to_latex_longtable(self, float_frame): + float_frame.to_latex(longtable=True) df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(longtable=True) @@ -535,9 +530,9 @@ def test_to_latex_specified_header(self): with pytest.raises(ValueError): df.to_latex(header=['A']) - def test_to_latex_decimal(self, frame): + def test_to_latex_decimal(self, float_frame): # GH 12031 - frame.to_latex() + float_frame.to_latex() df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(decimal=',') From ee52d0efe8ad1281292e80937662854a44a9da9a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 2 Jun 2019 16:20:15 -0700 Subject: [PATCH 22/43] CLN: Remove convert_objects (#26612) --- doc/source/reference/frame.rst | 1 - doc/source/reference/series.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 48 +--------- pandas/tests/series/test_internals.py | 125 -------------------------- 5 files changed, 2 insertions(+), 174 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index dfa475684c834..b4fb85c028b3e 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -48,7 +48,6 @@ Conversion :toctree: api/ DataFrame.astype - DataFrame.convert_objects DataFrame.infer_objects DataFrame.copy DataFrame.isna diff --git 
a/doc/source/reference/series.rst b/doc/source/reference/series.rst index b406893e3414a..8fccdea979602 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -56,7 +56,6 @@ Conversion Series.astype Series.infer_objects - Series.convert_objects Series.copy Series.bool Series.to_numpy diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f122c73325b7d..1cbec223008c4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -483,6 +483,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``TimeGrouper`` (:issue:`16942`) - Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) - Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) +- Removed the previously deprecated ``convert_objects`` (:issue:`11221`) .. _whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 33b0035e74913..2428bbad7003b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -113,7 +113,7 @@ class NDFrame(PandasObject, SelectionMixin): _internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] _deprecations = frozenset([ - 'as_blocks', 'blocks', 'convert_objects', 'is_copy' + 'as_blocks', 'blocks', 'is_copy' ]) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None @@ -5913,52 +5913,6 @@ def _convert(self, datetime=False, numeric=False, timedelta=False, timedelta=timedelta, coerce=coerce, copy=copy)).__finalize__(self) - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): - """ - Attempt to infer better dtype for object columns. - - .. deprecated:: 0.21.0 - - Parameters - ---------- - convert_dates : boolean, default True - If True, convert to date where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. - convert_numeric : boolean, default False - If True, attempt to coerce to numbers (including strings), with - unconvertible values becoming NaN. - convert_timedeltas : boolean, default True - If True, convert to timedelta where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. - copy : boolean, default True - If True, return a copy even if no copy is necessary (e.g. no - conversion was done). Note: This is meant for internal use, and - should not be confused with inplace. - - Returns - ------- - converted : same as input object - - See Also - -------- - to_datetime : Convert argument to datetime. - to_timedelta : Convert argument to timedelta. - to_numeric : Convert argument to numeric type. - """ - msg = ("convert_objects is deprecated. To re-infer data dtypes for " - "object columns, use {klass}.infer_objects()\nFor all " - "other conversions use the data-type specific converters " - "pd.to_datetime, pd.to_timedelta and pd.to_numeric." - ).format(klass=self.__class__.__name__) - warnings.warn(msg, FutureWarning, stacklevel=2) - - return self._constructor( - self._data.convert(convert_dates=convert_dates, - convert_numeric=convert_numeric, - convert_timedeltas=convert_timedeltas, - copy=copy)).__finalize__(self) - def infer_objects(self): """ Attempt to infer better dtypes for object columns. 
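# A minimal sketch (editorial addition, not part of the patch) of the
# migration path that the removed method's deprecation message pointed to:
# `infer_objects` for dtype re-inference, plus the type-specific converters
# for forced conversion.

import pandas as pd

df = pd.DataFrame({'a': ['1', '2'], 'b': ['2016-01-01', 'x']}, dtype=object)
df['a'] = pd.to_numeric(df['a'])                    # was convert_numeric=True
df['b'] = pd.to_datetime(df['b'], errors='coerce')  # was convert_dates='coerce'
df = df.infer_objects()                             # re-infer remaining object columns
df.dtypes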
diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index f6f4a2db359f7..29846f10dae33 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -12,131 +12,6 @@ class TestSeriesInternals: - def test_convert_objects(self): - - s = Series([1., 2, 3], index=['a', 'b', 'c']) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - # force numeric conversion - r = s.copy().astype('O') - r['a'] = '1' - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - r = s.copy().astype('O') - r['a'] = '1.' - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - r = s.copy().astype('O') - r['a'] = 'garbled' - expected = s.copy() - expected['a'] = np.nan - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, expected) - - # GH 4119, not converting a mixed type (e.g.floats and object) - s = Series([1, 'na', 3, 4]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) - assert_series_equal(result, expected) - - s = Series([1, '', 3, 4]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) - assert_series_equal(result, expected) - - # dates - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0)]) - s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1, - Timestamp('20010104'), '20010105'], - dtype='O') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates=True, - convert_numeric=False) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103')], dtype='M8[ns]') - assert_series_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=False) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=True) - assert_series_equal(result, expected) - - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103'), - NaT, NaT, NaT, Timestamp('20010104'), - Timestamp('20010105')], dtype='M8[ns]') - with tm.assert_produces_warning(FutureWarning): - result = s2.convert_objects(convert_dates='coerce', - convert_numeric=False) - assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = s2.convert_objects(convert_dates='coerce', - convert_numeric=True) - assert_series_equal(result, expected) - - # preserver all-nans (if convert_dates='coerce') - s = Series(['foo', 'bar', 1, 1.0], dtype='O') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=False) - expected = Series([NaT] * 2 + [Timestamp(1)] * 2) - assert_series_equal(result, expected) - - # preserver if non-object - s = Series([1], dtype='float32') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - 
convert_numeric=False) - assert_series_equal(result, s) - - # r = s.copy() - # r[0] = np.nan - # result = r.convert_objects(convert_dates=True,convert_numeric=False) - # assert result.dtype == 'M8[ns]' - - # dateutil parses some single letters into today's value as a date - for x in 'abcdefghijklmnopqrstuvwxyz': - s = Series([x]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) - s = Series([x.upper()]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) - - def test_convert_objects_preserve_bool(self): - s = Series([1, True, 3, 5], dtype=object) - with tm.assert_produces_warning(FutureWarning): - r = s.convert_objects(convert_numeric=True) - e = Series([1, 1, 3, 5], dtype='i8') - tm.assert_series_equal(r, e) - - def test_convert_objects_preserve_all_bool(self): - s = Series([False, True, False, False], dtype=object) - with tm.assert_produces_warning(FutureWarning): - r = s.convert_objects(convert_numeric=True) - e = Series([False, True, False, False], dtype=bool) - tm.assert_series_equal(r, e) - # GH 10265 def test_convert(self): # Tests: All to nans, coerce, true From 6f9aa6a678fbbf113956badf7db5c8532c04958f Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Mon, 3 Jun 2019 01:34:27 +0200 Subject: [PATCH 23/43] Clean up ufuncs post numpy bump (#26606) --- pandas/core/arrays/sparse.py | 9 --------- pandas/core/sparse/frame.py | 6 ------ pandas/core/sparse/series.py | 20 -------------------- 3 files changed, 35 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index ecc06db2bd07b..926ed6a829a6d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -573,7 +573,6 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): Whether to explicitly copy the incoming `data` array. 
""" - __array_priority__ = 15 _pandas_ftype = 'sparse' _subtyp = 'sparse_array' # register ABCSparseArray @@ -1639,14 +1638,6 @@ def T(self): # Ufuncs # ------------------------------------------------------------------------ - def __array_wrap__(self, array, context=None): - from pandas.core.dtypes.generic import ABCSparseSeries - - ufunc, inputs, _ = context - inputs = tuple(x.to_dense() if isinstance(x, ABCSparseSeries) else x - for x in inputs) - return self.__array_ufunc__(ufunc, '__call__', *inputs) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index bf1cec7571f4d..0320da6d9a48d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -242,12 +242,6 @@ def _init_spmatrix(self, data, index, columns, dtype=None, def to_coo(self): return SparseFrameAccessor(self).to_coo() - def __array_wrap__(self, result): - return self._constructor( - result, index=self.index, columns=self.columns, - default_kind=self._default_kind, - default_fill_value=self._default_fill_value).__finalize__(self) - def __getstate__(self): # pickling return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 3f95acdbfb42c..3814d8bb66635 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -124,26 +124,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): fill_value=result.fill_value, copy=False).__finalize__(self) - def __array_wrap__(self, result, context=None): - """ - Gets called prior to a ufunc (and after) - - See SparseArray.__array_wrap__ for detail. - """ - result = self.values.__array_wrap__(result, context=context) - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) - - def __array_finalize__(self, obj): - """ - Gets called after any ufunc or other array operations, necessary - to pass on the index. 
- """ - self.name = getattr(obj, 'name', None) - self.fill_value = getattr(obj, 'fill_value', None) - # unary ops # TODO: See if this can be shared def __pos__(self): From c95be629022fa7339b2d744004feaa1b381cf1ba Mon Sep 17 00:00:00 2001 From: Frank Hoang Date: Sun, 2 Jun 2019 18:42:54 -0500 Subject: [PATCH 24/43] Add more specific error message when user passes incorrect matrix format to from_coo (#26584) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/sparse/scipy_sparse.py | 11 ++++++++++- pandas/tests/arrays/sparse/test_accessor.py | 10 ++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1cbec223008c4..461c883f542ab 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -694,7 +694,7 @@ Sparse - Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) - +- Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) Other ^^^^^ diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 7630983421ff9..0dd8958e93c13 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -130,10 +130,19 @@ def _coo_to_sparse_series(A, dense_index: bool = False, Returns ------- Series or SparseSeries + + Raises + ------ + TypeError if A is not a coo_matrix + """ from pandas import SparseDtype - s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) + try: + s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) + except AttributeError: + raise TypeError('Expected coo_matrix. Got {} instead.' + .format(type(A).__name__)) s = s.sort_index() if sparse_series: # TODO(SparseSeries): remove this and the sparse_series keyword. diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 370d222c1ab4e..d0a188a8aff3c 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -119,3 +119,13 @@ def test_series_from_coo(self, dtype, dense_index): ) tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_series_from_coo_incorrect_format_raises(self): + # gh-26554 + import scipy.sparse + m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]])) + with pytest.raises(TypeError, + match='Expected coo_matrix. Got csr_matrix instead.' 
+ ): + pd.Series.sparse.from_coo(m) From 21f49c41c0372b513c82a21c7641f6f3f6abfa16 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 3 Jun 2019 07:35:25 +0200 Subject: [PATCH 25/43] DOC/CI: restore travis CI doc build environment (#26621) --- .travis.yml | 4 ++-- ci/deps/travis-36-doc.yaml | 46 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 ci/deps/travis-36-doc.yaml diff --git a/.travis.yml b/.travis.yml index 90dd904e6cb1e..ce8817133a477 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,14 +51,14 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true allow_failures: - dist: trusty env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true before_install: - echo "before_install" diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml new file mode 100644 index 0000000000000..9d6cbd82fdc05 --- /dev/null +++ b/ci/deps/travis-36-doc.yaml @@ -0,0 +1,46 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - cython>=0.28.2 + - fastparquet>=0.2.1 + - gitpython + - html5lib + - hypothesis>=3.58.0 + - ipykernel + - ipython + - ipywidgets + - lxml + - matplotlib + - nbconvert>=5.4.1 + - nbformat + - nbsphinx + - notebook>=5.7.5 + - numexpr + - numpy + - numpydoc + - openpyxl + - pandoc + - pyarrow + - pyqt + - pytables + - python-dateutil + - python-snappy + - python=3.6.* + - pytz + - scipy + - seaborn + - sphinx + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest>=4.0.2 + - pytest-xdist + - isort From b1e4c55ddce8e0371b4e192a519363c53070489b Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Mon, 3 Jun 2019 13:56:29 +0200 Subject: [PATCH 26/43] TST/API: Forbid str-accessor for 1-level MultiIndex (#26608) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/test_strings.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 461c883f542ab..0e8cd95084a8d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -434,6 +434,7 @@ Other API Changes - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- The `.str`-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) .. 
_whatsnew_0250.deprecations: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1ba0ef3918fb7..a1d522930e9aa 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -169,6 +169,14 @@ def test_api(self): assert Series.str is strings.StringMethods assert isinstance(Series(['']).str, strings.StringMethods) + def test_api_mi_raises(self): + # GH 23679 + mi = MultiIndex.from_arrays([['a', 'b', 'c']]) + with pytest.raises(AttributeError, match='Can only use .str accessor ' + 'with Index, not MultiIndex'): + mi.str + assert not hasattr(mi, 'str') + @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): From d5fad240b04c5f7ec21e78350ab46779c7ed7730 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 3 Jun 2019 22:17:40 +0000 Subject: [PATCH 27/43] Minor doc cleanup because of Panel removal (#26638) --- doc/source/getting_started/basics.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 80e334054a986..5ec0094de0a91 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1455,9 +1455,8 @@ Iteration The behavior of basic iteration over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iteration -produces the values. Other data structures, like DataFrame, -follow the dict-like convention of iterating over the "keys" of the -objects. +produces the values. DataFrames follow the dict-like convention of iterating +over the "keys" of the objects. In short, basic iteration (``for i in object``) produces: @@ -1537,9 +1536,9 @@ For example: .. ipython:: python - for item, frame in df.iteritems(): - print(item) - print(frame) + for label, ser in df.iteritems(): + print(label) + print(ser) .. _basics.iterrows: From 0ee4317ec6056d90795e0c9169d0b9464a24ebae Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 3 Jun 2019 19:23:49 -0600 Subject: [PATCH 28/43] DOC: Small whatsnew cleanups (#26643) --- doc/source/whatsnew/v0.25.0.rst | 65 +++++++++++++++++---------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0e8cd95084a8d..267e34efc946f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -72,7 +72,7 @@ Other Enhancements - :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. (:issue:`24923`) - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) -- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) +- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. 
``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) @@ -123,11 +123,11 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) .. _whatsnew_0250.api_breaking.multi_indexing: -MultiIndex constructed from levels and codes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``MultiIndex`` constructed from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. -Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes +Constructing a :class:`MultiIndex` with ``NaN`` levels or codes value < -1 was allowed previously. +Now, construction with codes value < -1 is not allowed and ``NaN`` levels' corresponding codes would be reassigned as -1. (:issue:`19387`) .. ipython:: python @@ -157,8 +157,8 @@ would be reassigned as -1. (:issue:`19387`) .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: -GroupBy.apply on ``DataFrame`` evaluates first group only once -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``GroupBy.apply`` on ``DataFrame`` evaluates first group only once +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` previously evaluated the supplied function consistently twice on the first group @@ -176,7 +176,7 @@ Now every group is evaluated only a single time. print(group.name) return group -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -189,7 +189,7 @@ Now every group is evaluated only a single time. 0 x 1 1 y 2 -*New Behaviour*: +*New Behavior*: .. ipython:: python @@ -239,7 +239,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t ``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, :meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -259,7 +259,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t 2 False dtype: bool -*New Behaviour*: +*New Behavior*: .. ipython:: python :okexcept: @@ -282,6 +282,8 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). *Previous Behavior*: +.. code-block:: python + In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) ... ValueError: can only call with other PeriodIndex-ed objects @@ -310,7 +312,7 @@ are returned. (:issue:`21521`) df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) df -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -320,7 +322,7 @@ are returned. (:issue:`21521`) 0 x 1 1 y 2 -*New Behaviour*: +*New Behavior*: .. 
ipython:: python @@ -355,7 +357,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 df.describe() -``__str__`` methods now call ``__repr__`` rather than vica-versa +``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has until now mostly defined string representations in a Pandas objects's @@ -434,7 +436,7 @@ Other API Changes - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) -- The `.str`-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) +- The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) .. _whatsnew_0250.deprecations: @@ -468,7 +470,7 @@ The memory usage of the two approaches is identical. See :ref:`sparse.migration` Other Deprecations ^^^^^^^^^^^^^^^^^^ -- The deprecated ``.ix[]`` indexer now raises a more visible FutureWarning instead of DeprecationWarning (:issue:`26438`). +- The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`). - Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`) - The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). @@ -499,14 +501,13 @@ Performance Improvements - Improved performance when slicing :class:`RangeIndex` (:issue:`26565`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) -- Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) +- Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) - Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtypes (:issue:`25708`) - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. 
Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) -- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero - and float NaN; by faster checking the string for the possibility of being a date (:issue:`25754`) -- Improved performance of :meth:`IntervalIndex.is_unique` by removing conversion to `MultiIndex` (:issue:`24813`) +- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero and float ``NaN``; by faster checking the string for the possibility of being a date (:issue:`25754`) +- Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`) .. _whatsnew_0250.bug_fixes: @@ -518,7 +519,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) - Datetimelike @@ -570,7 +571,7 @@ Numeric Conversion ^^^^^^^^^^ -- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the `errors` parameter was ignored. (:issue:`25905`) +- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) - - @@ -597,7 +598,7 @@ Indexing - Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) -- Allow keyword arguments for callable local reference used in the :method:`DataFrame.query` string (:issue:`26426`) +- Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) Missing @@ -620,8 +621,8 @@ I/O - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) -- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) +- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) @@ -644,7 +645,7 @@ Plotting - Fixed bug where :class:`api.extensions.ExtensionArray` could not be used in matplotlib plotting (:issue:`25587`) - Bug in an error message in :meth:`DataFrame.plot`. 
Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612` :issue:`15912` :issue:`22334`) +- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612`, :issue:`15912`, :issue:`22334`) - Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`) - - @@ -655,7 +656,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) +- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) - Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) @@ -663,11 +664,11 @@ Groupby/Resample/Rolling - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) -- Bug in :meth:`pandas.core.window.Rolling.count` and `pandas.core.window.Expanding.count` was previously ignoring the axis keyword (:issue:`13503`) +- Bug in :meth:`pandas.core.window.Rolling.count` and ``pandas.core.window.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`) - Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) +- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) @@ -682,11 +683,11 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - bug in :class:`DataFrame` instantiating with a dict of iterators or generators (e.g. ``pd.DataFrame({'A': reversed(range(3))})``) raised an error (:issue:`26349`). -- bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`). +- Bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`). - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) - Bug in :func:`Series.apply` failed when the series is a timezone aware :class:`DatetimeIndex` (:issue:`25959`) - Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`) -- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed DataFrame is sorted on all levels with the initial level sorted last (:issue:`26053`) +- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`) - Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) Sparse @@ -702,7 +703,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). -- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions. +- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) .. _whatsnew_0.250.contributors: From da6900e149c8e33090b71e9dba9ad58827318250 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 4 Jun 2019 12:23:42 +0100 Subject: [PATCH 29/43] DOC/CI: Removing Panel specific code from validate_docstrings.py (#26627) --- scripts/validate_docstrings.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 63db50db45a7c..64eaf45376b2f 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -539,14 +539,9 @@ def first_line_ends_in_dot(self): if self.doc: return self.doc.split('\n')[0][-1] == '.' - @property - def deprecated_with_directive(self): - return '.. deprecated:: ' in (self.summary + self.extended_summary) - @property def deprecated(self): - return (self.name.startswith('pandas.Panel') - or self.deprecated_with_directive) + return '.. deprecated:: ' in (self.summary + self.extended_summary) @property def mentioned_private_classes(self): @@ -674,7 +669,7 @@ def get_validation_data(doc): errs.append(error('GL07', correct_sections=', '.join(correct_order))) - if (doc.deprecated_with_directive + if (doc.deprecated and not doc.extended_summary.startswith('.. 
deprecated:: ')): errs.append(error('GL09')) @@ -859,9 +854,9 @@ def validate_all(prefix, ignore_deprecated=False): seen[shared_code_key] = func_name - # functions from introspecting Series, DataFrame and Panel + # functions from introspecting Series and DataFrame api_item_names = set(list(zip(*api_items))[0]) - for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel): + for class_ in (pandas.Series, pandas.DataFrame): for member in inspect.getmembers(class_): func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) if (not member[0].startswith('_') From dbdd556d9b0b65f3054242dae7001a39f7e3bbc0 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 4 Jun 2019 23:59:01 +0000 Subject: [PATCH 30/43] Remove NDFrame.select (#26641) --- doc/source/reference/frame.rst | 1 - doc/source/reference/series.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 34 ------------------ .../tests/frame/test_axis_select_reindex.py | 35 ------------------- pandas/tests/series/indexing/test_indexing.py | 14 -------- 6 files changed, 1 insertion(+), 85 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index b4fb85c028b3e..7d5cd5d245631 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -204,7 +204,6 @@ Reindexing / Selection / Label manipulation DataFrame.rename_axis DataFrame.reset_index DataFrame.sample - DataFrame.select DataFrame.set_axis DataFrame.set_index DataFrame.tail diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 8fccdea979602..79beeb0022307 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -211,7 +211,6 @@ Reindexing / Selection / Label manipulation Series.rename_axis Series.reset_index Series.sample - Series.select Series.set_axis Series.take Series.tail diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 267e34efc946f..4e8af90b85f83 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -487,6 +487,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) - Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) - Removed the previously deprecated ``convert_objects`` (:issue:`11221`) +- Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`) .. _whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2428bbad7003b..19d093dd29457 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3682,40 +3682,6 @@ class animal locomotion _xs = xs # type: Callable - def select(self, crit, axis=0): - """ - Return data corresponding to axis labels matching criteria. - - .. deprecated:: 0.21.0 - Use df.loc[df.index.map(crit)] to select via labels - - Parameters - ---------- - crit : function - To be called on each index (label). Should return True or False - axis : int - - Returns - ------- - selection : same type as caller - """ - warnings.warn("'select' is deprecated and will be removed in a " - "future release. 
You can use " - ".loc[labels.map(crit)] as a replacement", - FutureWarning, stacklevel=2) - - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis) - - if len(axis_values) > 0: - new_axis = axis_values[ - np.asarray([bool(crit(label)) for label in axis_values])] - else: - new_axis = axis_values - - return self.reindex(**{axis_name: new_axis}) - def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): """ diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index ad6c66c911615..42f98d5c96aa5 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -895,41 +895,6 @@ def test_filter_corner(self): result = empty.filter(like='foo') assert_frame_equal(result, empty) - def test_select(self): - - # deprecated: gh-12410 - f = lambda x: x.weekday() == 2 - index = self.tsframe.index[[f(x) for x in self.tsframe.index]] - expected_weekdays = self.tsframe.reindex(index=index) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = self.tsframe.select(f, axis=0) - assert_frame_equal(result, expected_weekdays) - - result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) - expected = self.frame.reindex(columns=['B', 'D']) - assert_frame_equal(result, expected, check_names=False) - - # replacement - f = lambda x: x.weekday == 2 - result = self.tsframe.loc(axis=0)[f(self.tsframe.index)] - assert_frame_equal(result, expected_weekdays) - - crit = lambda x: x in ['B', 'D'] - result = self.frame.loc(axis=1)[(self.frame.columns.map(crit))] - expected = self.frame.reindex(columns=['B', 'D']) - assert_frame_equal(result, expected, check_names=False) - - # doc example - df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) - - crit = lambda x: x in ['bar', 'baz'] - with tm.assert_produces_warning(FutureWarning): - expected = df.select(crit) - result = df.loc[df.index.map(crit)] - assert_frame_equal(result, expected, check_names=False) - def test_take(self): # homogeneous order = [3, 1, 2, 0] diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6641311faace2..702e22b6741e4 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -772,20 +772,6 @@ def test_setitem_slice_into_readonly_backing_data(): """ -def test_select(test_data): - # deprecated: gh-12410 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - n = len(test_data.ts) - result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2]) - expected = test_data.ts.reindex(test_data.ts.index[n // 2:]) - assert_series_equal(result, expected) - - result = test_data.ts.select(lambda x: x.weekday() == 2) - expected = test_data.ts[test_data.ts.index.weekday == 2] - assert_series_equal(result, expected) - - def test_pop(): # GH 6600 df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) From 7370c1d29cc89ea067c068318734829ebb681f67 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Wed, 5 Jun 2019 15:22:08 +0800 Subject: [PATCH 31/43] [TST] Fix test_quantile_interpolation_int (#26633) --- pandas/tests/frame/test_quantile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 9ccbd290923ba..097477c42d249 100644 --- a/pandas/tests/frame/test_quantile.py +++ 
b/pandas/tests/frame/test_quantile.py @@ -160,8 +160,7 @@ def test_quantile_interpolation_int(self, int_frame): assert q['A'] == np.percentile(df['A'], 10) # test with and without interpolation keyword - # TODO: q1 is not different from q - q1 = df.quantile(0.1) + q1 = df.quantile(0.1, axis=0, interpolation='linear') assert q1['A'] == np.percentile(df['A'], 10) tm.assert_series_equal(q, q1) From 8a1f71490fc60e74090e2b2ea43b9293636369b2 Mon Sep 17 00:00:00 2001 From: shawnbrown Date: Wed, 5 Jun 2019 07:53:40 -0400 Subject: [PATCH 32/43] Update Accessors URL for PdVega package. (#26653) See altair-viz/pdvega@7476a8a26b for details. --- doc/source/ecosystem.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e232bd2157611..b1a5430752558 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -363,4 +363,5 @@ Library Accessor Classes ============== ========== ========================= .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest -.. _pdvega: https://jakevdp.github.io/pdvega/ +.. _pdvega: https://altair-viz.github.io/pdvega/ + From b6427263fc2aa154db6e1df203dc8280bdd99ba0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 5 Jun 2019 13:46:37 +0100 Subject: [PATCH 33/43] DEPS: Adding missing doc dependencies to environment.yml (#26657) --- environment.yml | 7 +++++++ requirements-dev.txt | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/environment.yml b/environment.yml index cf17dc1281ec9..91ea26eef4b61 100644 --- a/environment.yml +++ b/environment.yml @@ -17,10 +17,17 @@ dependencies: - flake8-rst>=0.6.0,<=0.7.0 - gitpython - hypothesis>=3.82 + - ipywidgets - isort - moto - mypy + - nbconvert>=5.4.1 + - nbformat + - notebook>=5.7.5 + - pandoc - pycodestyle + - pyqt + - python-snappy - pytest>=4.0.2 - pytest-mock - sphinx diff --git a/requirements-dev.txt b/requirements-dev.txt index 115a93495c95b..e6085920a9999 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,10 +8,17 @@ flake8-comprehensions flake8-rst>=0.6.0,<=0.7.0 gitpython hypothesis>=3.82 +ipywidgets isort moto mypy +nbconvert>=5.4.1 +nbformat +notebook>=5.7.5 +pandoc pycodestyle +pyqt +python-snappy pytest>=4.0.2 pytest-mock sphinx From 5abb8c37394ecb703df3c65393da05bab7a5f8e7 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 5 Jun 2019 12:50:33 +0000 Subject: [PATCH 34/43] use range in RangeIndex instead of _start etc. (#26581) --- doc/source/whatsnew/v0.25.0.rst | 3 + pandas/core/dtypes/common.py | 29 +++ pandas/core/dtypes/concat.py | 21 +- pandas/core/frame.py | 10 +- pandas/core/indexes/range.py | 304 +++++++++++++---------------- pandas/core/series.py | 6 +- pandas/io/packers.py | 7 +- pandas/tests/indexes/test_range.py | 22 ++- 8 files changed, 202 insertions(+), 200 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4e8af90b85f83..4018418294963 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -476,6 +476,9 @@ Other Deprecations the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). - The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`) - The :meth:`DataFrame.compound` and :meth:`Series.compound` methods are deprecated and will be removed in a future version (:issue:`26405`). 
+- The internal attributes ``_start``, ``_stop`` and ``_step`` attributes of :class:`RangeIndex` have been deprecated. + Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`). + .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b5cd73a81962b..4029e6f4bfdb5 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1,4 +1,5 @@ """ common type operations """ +from typing import Union import warnings import numpy as np @@ -125,6 +126,34 @@ def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array: return arr.astype('float64', copy=copy) +def ensure_python_int(value: Union[int, np.integer]) -> int: + """ + Ensure that a value is a python int. + + Parameters + ---------- + value: int or numpy.integer + + Returns + ------- + int + + Raises + ------ + TypeError: if the value isn't an int or can't be converted to one. + """ + if not is_scalar(value): + raise TypeError("Value needs to be a scalar value, was type {}" + .format(type(value))) + msg = "Wrong type {} for value {}" + try: + new_value = int(value) + assert (new_value == value) + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(type(value), value)) + return new_value + + def classes(*klasses): """ evaluate if the tipo is a subclass of the klasses """ return lambda tipo: issubclass(tipo, klasses) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b22ed45642cf6..e2c6fba322be0 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -541,36 +541,37 @@ def _concat_rangeindex_same_dtype(indexes): """ from pandas import Int64Index, RangeIndex - start = step = next = None + start = step = next_ = None # Filter the empty indexes non_empty_indexes = [obj for obj in indexes if len(obj)] for obj in non_empty_indexes: + rng = obj._range # type: range if start is None: # This is set by the first non-empty index - start = obj._start - if step is None and len(obj) > 1: - step = obj._step + start = rng.start + if step is None and len(rng) > 1: + step = rng.step elif step is None: # First non-empty index had only one element - if obj._start == start: + if rng.start == start: return _concat_index_same_dtype(indexes, klass=Int64Index) - step = obj._start - start + step = rng.start - start - non_consecutive = ((step != obj._step and len(obj) > 1) or - (next is not None and obj._start != next)) + non_consecutive = ((step != rng.step and len(rng) > 1) or + (next_ is not None and rng.start != next_)) if non_consecutive: return _concat_index_same_dtype(indexes, klass=Int64Index) if step is not None: - next = obj[-1] + step + next_ = rng[-1] + step if non_empty_indexes: # Get the stop value from "next" or alternatively # from the last non-empty index - stop = non_empty_indexes[-1]._stop if next is None else next + stop = non_empty_indexes[-1].stop if next_ is None else next_ return RangeIndex(start, stop, step) # Here all "indexes" had 0 length, i.e. were empty. 
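With the helper now reading the underlying ``range`` objects instead of the private ``_start``/``_stop``/``_step`` attributes, the observable behavior should be unchanged: concatenating ``RangeIndex`` pieces that line up start-to-stop with a compatible step still yields a ``RangeIndex``, and anything else falls back to ``Int64Index``. A quick sketch (using ``Index.append``, which dispatches through this helper for same-dtype indexes):

.. code-block:: python

    import pandas as pd

    left = pd.RangeIndex(0, 4)    # 0, 1, 2, 3
    right = pd.RangeIndex(4, 8)   # 4, 5, 6, 7 -- continues `left` exactly

    # Consecutive ranges collapse back into a single RangeIndex
    left.append(right)            # RangeIndex(start=0, stop=8, step=1)

    # A gap between the pieces falls back to Int64Index
    left.append(pd.RangeIndex(10, 12))
    # Int64Index([0, 1, 2, 3, 10, 11], dtype='int64')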
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5957b23535350..48dfa57c47bf6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2282,7 +2282,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, text_col 5 non-null object float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) - memory usage: 200.0+ bytes + memory usage: 248.0+ bytes Prints a summary of columns count and its dtypes but not per column information: @@ -2292,7 +2292,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, RangeIndex: 5 entries, 0 to 4 Columns: 3 entries, int_col to float_col dtypes: float64(1), int64(1), object(1) - memory usage: 200.0+ bytes + memory usage: 248.0+ bytes Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content and writes to a text file: @@ -2494,7 +2494,7 @@ def memory_usage(self, index=True, deep=False): 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() - Index 80 + Index 128 int64 40000 float64 40000 complex128 80000 @@ -2513,7 +2513,7 @@ def memory_usage(self, index=True, deep=False): The memory footprint of `object` dtype columns is ignored by default: >>> df.memory_usage(deep=True) - Index 80 + Index 128 int64 40000 float64 40000 complex128 80000 @@ -2525,7 +2525,7 @@ def memory_usage(self, index=True, deep=False): many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5168 + 5216 """ result = Series([c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], index=self.columns) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9401de3346ccd..82fd7342c027c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -12,7 +12,8 @@ from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( - is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype) + ensure_python_int, is_int64_dtype, is_integer, is_scalar, + is_timedelta64_dtype) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ABCTimedeltaIndex) @@ -65,6 +66,7 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + _range = None # type: range # check whether self._data has benn called _cached_data = None # type: np.ndarray @@ -91,39 +93,19 @@ def __new__(cls, start=None, stop=None, step=None, **dict(start._get_data_as_items())) # validate the arguments - def ensure_int(value, field): - msg = ("RangeIndex(...) must be called with integers," - " {value} was passed for {field}") - if not is_scalar(value): - raise TypeError(msg.format(value=type(value).__name__, - field=field)) - try: - new_value = int(value) - assert(new_value == value) - except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(value=type(value).__name__, - field=field)) + if com._all_none(start, stop, step): + raise TypeError("RangeIndex(...) must be called with integers") - return new_value + start = ensure_python_int(start) if start is not None else 0 - if com._all_none(start, stop, step): - msg = "RangeIndex(...) 
must be called with integers" - raise TypeError(msg) - elif start is None: - start = 0 - else: - start = ensure_int(start, 'start') if stop is None: - stop = start - start = 0 + start, stop = 0, start else: - stop = ensure_int(stop, 'stop') - if step is None: - step = 1 - elif step == 0: + stop = ensure_python_int(stop) + + step = ensure_python_int(step) if step is not None else 1 + if step == 0: raise ValueError("Step must not be zero") - else: - step = ensure_int(step, 'step') return cls._simple_new(start, stop, step, name) @@ -142,7 +124,7 @@ def from_range(cls, data, name=None, dtype=None, **kwargs): 'range, {1} was passed'.format(cls.__name__, repr(data))) start, stop, step = data.start, data.stop, data.step - return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs) + return cls(start, stop, step, dtype=dtype, name=name, **kwargs) @classmethod def _simple_new(cls, start, stop=None, step=None, name=None, @@ -156,20 +138,16 @@ def _simple_new(cls, start, stop=None, step=None, name=None, if start is None or not is_integer(start): try: - - return RangeIndex(start, stop, step, name=name, **kwargs) + return cls(start, stop, step, name=name, **kwargs) except TypeError: return Index(start, stop, step, name=name, **kwargs) - result._start = start - result._stop = stop or 0 - result._step = step or 1 + result._range = range(start, stop or 0, step or 1) + result.name = name for k, v in kwargs.items(): setattr(result, k, v) - result._range = range(result._start, result._stop, result._step) - result._reset_identity() return result @@ -196,7 +174,7 @@ def _data(self): triggering the construction. """ if self._cached_data is None: - self._cached_data = np.arange(self._start, self._stop, self._step, + self._cached_data = np.arange(self.start, self.stop, self.step, dtype=np.int64) return self._cached_data @@ -206,9 +184,10 @@ def _int64index(self): def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ - return [('start', self._start), - ('stop', self._stop), - ('step', self._step)] + rng = self._range + return [('start', rng.start), + ('stop', rng.stop), + ('step', rng.step)] def __reduce__(self): d = self._get_attributes_dict() @@ -235,39 +214,79 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- - @property + _deprecation_message = ("RangeIndex.{} is deprecated and will be " + "removed in a future version. Use RangeIndex.{} " + "instead") + + @cache_readonly def start(self): """ - The value of the `start` parameter (or ``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied) """ # GH 25710 - return self._start + return self._range.start @property + def _start(self): + """ + The value of the `start` parameter (``0`` if this was not supplied) + + .. deprecated:: 0.25.0 + Use ``start`` instead. + """ + warnings.warn(self._deprecation_message.format("_start", "start"), + DeprecationWarning, stacklevel=2) + return self.start + + @cache_readonly def stop(self): """ The value of the `stop` parameter """ - # GH 25710 - return self._stop + return self._range.stop @property + def _stop(self): + """ + The value of the `stop` parameter + + .. deprecated:: 0.25.0 + Use ``stop`` instead. 
+ """ + # GH 25710 + warnings.warn(self._deprecation_message.format("_stop", "stop"), + DeprecationWarning, stacklevel=2) + return self.stop + + @cache_readonly def step(self): """ - The value of the `step` parameter (or ``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied) """ # GH 25710 - return self._step + return self._range.step + + @property + def _step(self): + """ + The value of the `step` parameter (``1`` if this was not supplied) + + .. deprecated:: 0.25.0 + Use ``step`` instead. + """ + # GH 25710 + warnings.warn(self._deprecation_message.format("_step", "step"), + DeprecationWarning, stacklevel=2) + return self.step @cache_readonly def nbytes(self): """ - Return the number of bytes in the underlying data - On implementations where this is undetermined (PyPy) - assume 24 bytes for each value + Return the number of bytes in the underlying data. """ - return sum(getsizeof(getattr(self, v), 24) for v in - ['_start', '_stop', '_step']) + rng = self._range + return getsizeof(rng) + sum(getsizeof(getattr(rng, attr_name)) + for attr_name in ['start', 'stop', 'step']) def memory_usage(self, deep=False): """ @@ -305,11 +324,11 @@ def is_unique(self): @cache_readonly def is_monotonic_increasing(self): - return self._step > 0 or len(self) <= 1 + return self._range.step > 0 or len(self) <= 1 @cache_readonly def is_monotonic_decreasing(self): - return self._step < 0 or len(self) <= 1 + return self._range.step < 0 or len(self) <= 1 @property def has_duplicates(self): @@ -325,13 +344,13 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) def tolist(self): - return list(range(self._start, self._stop, self._step)) + return list(self._range) @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is None: name = kwargs.get("name", self.name) - return RangeIndex._simple_new( + return self._simple_new( name=name, **dict(self._get_data_as_items())) else: kwargs.setdefault('name', self.name) @@ -342,18 +361,17 @@ def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if name is None: name = self.name - return RangeIndex._simple_new( - name=name, **dict(self._get_data_as_items())) + return self.from_range(self._range, name=name) def _minmax(self, meth): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif ((meth == 'min' and self._step > 0) or - (meth == 'max' and self._step < 0)): - return self._start + elif ((meth == 'min' and self.step > 0) or + (meth == 'max' and self.step < 0)): + return self.start - return self._start + self._step * no_steps + return self.start + self.step * no_steps def min(self, axis=None, skipna=True, *args, **kwargs): """The minimum value of the RangeIndex""" @@ -382,7 +400,7 @@ def argsort(self, *args, **kwargs): """ nv.validate_argsort(args, kwargs) - if self._step > 0: + if self._range.step > 0: return np.arange(len(self)) else: return np.arange(len(self) - 1, -1, -1) @@ -392,15 +410,7 @@ def equals(self, other): Determines if two Index objects contain the same elements. 
""" if isinstance(other, RangeIndex): - ls = len(self) - lo = len(other) - return (ls == lo == 0 or - ls == lo == 1 and - self._start == other._start or - ls == lo and - self._start == other._start and - self._step == other._step) - + return self._range == other._range return super().equals(other) def intersection(self, other, sort=False): @@ -433,39 +443,40 @@ def intersection(self, other, sort=False): return super().intersection(other, sort=sort) if not len(self) or not len(other): - return RangeIndex._simple_new(None) + return self._simple_new(None) - first = self[::-1] if self._step < 0 else self - second = other[::-1] if other._step < 0 else other + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range # check whether intervals intersect # deals with in- and decreasing ranges - int_low = max(first._start, second._start) - int_high = min(first._stop, second._stop) + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) if int_high <= int_low: - return RangeIndex._simple_new(None) + return self._simple_new(None) # Method hint: linear Diophantine equation # solve intersection problem # performance hint: for identical step sizes, could use # cheaper alternative - gcd, s, t = first._extended_gcd(first._step, second._step) + gcd, s, t = self._extended_gcd(first.step, second.step) # check whether element sets intersect - if (first._start - second._start) % gcd: - return RangeIndex._simple_new(None) + if (first.start - second.start) % gcd: + return self._simple_new(None) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = first._start + (second._start - first._start) * \ - first._step // gcd * s - new_step = first._step * second._step // gcd - new_index = RangeIndex._simple_new(tmp_start, int_high, new_step) + tmp_start = first.start + (second.start - first.start) * \ + first.step // gcd * s + new_step = first.step * second.step // gcd + new_index = self._simple_new(tmp_start, int_high, new_step) # adjust index to limiting interval - new_index._start = new_index._min_fitting_element(int_low) + new_start = new_index._min_fitting_element(int_low) + new_index = self._simple_new(new_start, new_index.stop, new_index.step) - if (self._step < 0 and other._step < 0) is not (new_index._step < 0): + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] if sort is None: new_index = new_index.sort_values() @@ -473,13 +484,13 @@ def intersection(self, other, sort=False): def _min_fitting_element(self, lower_limit): """Returns the smallest element greater than or equal to the limit""" - no_steps = -(-(lower_limit - self._start) // abs(self._step)) - return self._start + abs(self._step) * no_steps + no_steps = -(-(lower_limit - self.start) // abs(self.step)) + return self.start + abs(self.step) * no_steps def _max_fitting_element(self, upper_limit): """Returns the largest element smaller than or equal to the limit""" - no_steps = (upper_limit - self._start) // abs(self._step) - return self._start + abs(self._step) * no_steps + no_steps = (upper_limit - self.start) // abs(self.step) + return self.start + abs(self.step) * no_steps def _extended_gcd(self, a, b): """ @@ -522,16 +533,16 @@ def _union(self, other, sort): return super()._union(other, sort=sort) if isinstance(other, RangeIndex) and sort is None: - start_s, step_s = self._start, self._step - end_s = self._start + self._step * (len(self) - 1) - 
start_o, step_o = other._start, other._step - end_o = other._start + other._step * (len(other) - 1) - if self._step < 0: + start_s, step_s = self.start, self.step + end_s = self.start + self.step * (len(self) - 1) + start_o, step_o = other.start, other.step + end_o = other.start + other.step * (len(other) - 1) + if self.step < 0: start_s, step_s, end_s = end_s, -step_s, start_s - if other._step < 0: + if other.step < 0: start_o, step_o, end_o = end_o, -step_o, start_o if len(self) == 1 and len(other) == 1: - step_s = step_o = abs(self._start - other._start) + step_s = step_o = abs(self.start - other.start) elif len(self) == 1: step_s = step_o elif len(other) == 1: @@ -542,21 +553,23 @@ def _union(self, other, sort): if ((start_s - start_o) % step_s == 0 and (start_s - end_o) <= step_s and (start_o - end_s) <= step_s): - return RangeIndex(start_r, end_r + step_s, step_s) + return self.__class__(start_r, end_r + step_s, step_s) if ((step_s % 2 == 0) and (abs(start_s - start_o) <= step_s / 2) and (abs(end_s - end_o) <= step_s / 2)): - return RangeIndex(start_r, end_r + step_s / 2, step_s / 2) + return self.__class__(start_r, + end_r + step_s / 2, + step_s / 2) elif step_o % step_s == 0: if ((start_o - start_s) % step_s == 0 and (start_o + step_s >= start_s) and (end_o - step_s <= end_s)): - return RangeIndex(start_r, end_r + step_s, step_s) + return self.__class__(start_r, end_r + step_s, step_s) elif step_s % step_o == 0: if ((start_s - start_o) % step_o == 0 and (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): - return RangeIndex(start_r, end_r + step_o, step_o) + return self.__class__(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) @Appender(_index_shared_docs['join']) @@ -576,7 +589,7 @@ def __len__(self): """ return the length of the RangeIndex """ - return max(0, -(-(self._stop - self._start) // self._step)) + return len(self._range) @property def size(self): @@ -597,59 +610,15 @@ def __getitem__(self, key): n = com.cast_scalar_indexer(key) if n != key: return super_getitem(key) - if n < 0: - n = len(self) + key - if n < 0 or n > len(self) - 1: + try: + return self._range[key] + except IndexError: raise IndexError("index {key} is out of bounds for axis 0 " "with size {size}".format(key=key, size=len(self))) - return self._start + n * self._step - if isinstance(key, slice): - - # This is basically PySlice_GetIndicesEx, but delegation to our - # super routines if we don't have integers - - length = len(self) - - # complete missing slice information - step = 1 if key.step is None else key.step - if key.start is None: - start = length - 1 if step < 0 else 0 - else: - start = key.start - - if start < 0: - start += length - if start < 0: - start = -1 if step < 0 else 0 - if start >= length: - start = length - 1 if step < 0 else length - - if key.stop is None: - stop = -1 if step < 0 else length - else: - stop = key.stop - - if stop < 0: - stop += length - if stop < 0: - stop = -1 - if stop > length: - stop = length - - # delegate non-integer slices - if (start != int(start) or - stop != int(stop) or - step != int(step)): - return super_getitem(key) - - # convert indexes to values - start = self._start + self._step * start - stop = self._start + self._step * stop - step = self._step * step - - return RangeIndex._simple_new(start, stop, step, name=self.name) + new_range = self._range[key] + return self.from_range(new_range, name=self.name) # fall back to Int64Index return super_getitem(key) @@ -660,17 +629,15 @@ def __floordiv__(self, other): if 
is_integer(other) and other != 0: if (len(self) == 0 or - self._start % other == 0 and - self._step % other == 0): - start = self._start // other - step = self._step // other + self.start % other == 0 and + self.step % other == 0): + start = self.start // other + step = self.step // other stop = start + len(self) * step - return RangeIndex._simple_new( - start, stop, step, name=self.name) + return self._simple_new(start, stop, step, name=self.name) if len(self) == 1: - start = self._start // other - return RangeIndex._simple_new( - start, start + 1, 1, name=self.name) + start = self.start // other + return self._simple_new(start, start + 1, 1, name=self.name) return self._int64index // other @classmethod @@ -712,7 +679,7 @@ def _evaluate_numeric_binop(self, other): # apply if we have an override if step: with np.errstate(all='ignore'): - rstep = step(left._step, right) + rstep = step(left.step, right) # we don't have a representable op # so return a base index @@ -720,16 +687,13 @@ def _evaluate_numeric_binop(self, other): raise ValueError else: - rstep = left._step + rstep = left.step with np.errstate(all='ignore'): - rstart = op(left._start, right) - rstop = op(left._stop, right) + rstart = op(left.start, right) + rstop = op(left.stop, right) - result = RangeIndex(rstart, - rstop, - rstep, - **attrs) + result = self.__class__(rstart, rstop, rstep, **attrs) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return diff --git a/pandas/core/series.py b/pandas/core/series.py index 8fb6ad3e3ccc5..472d984234275 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4010,7 +4010,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = pd.Series(range(3)) >>> s.memory_usage() - 104 + 152 Not including the index gives the size of the rest of the data, which is necessarily smaller: @@ -4024,9 +4024,9 @@ def memory_usage(self, index=True, deep=False): >>> s.values array(['a', 'b'], dtype=object) >>> s.memory_usage() - 96 + 144 >>> s.memory_usage(deep=True) - 212 + 260 """ v = super().memory_usage(deep=deep) if index: diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 1309bd1fef421..ead0fbd263ebf 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -367,9 +367,10 @@ def encode(obj): return {'typ': 'range_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), - 'start': getattr(obj, '_start', None), - 'stop': getattr(obj, '_stop', None), - 'step': getattr(obj, '_step', None)} + 'start': obj._range.start, + 'stop': obj._range.stop, + 'step': obj._range.step, + } elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 477a4e527f278..bca50186827de 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -51,10 +51,8 @@ def test_constructor(self, args, kwargs, start, stop, step, name): expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) assert isinstance(result, RangeIndex) - assert result._start == start - assert result._stop == stop - assert result._step == step assert result.name is name + assert result._range == range(start, stop, step) tm.assert_index_equal(result, expected) def test_constructor_invalid_args(self): @@ -169,14 +167,19 @@ def test_start_stop_step_attrs(self, index, start, stop, step): assert index.stop == stop assert index.step == step + def test_deprecated_start_stop_step_attrs(self): + # GH 26581 
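+        # each deprecated attribute is a thin shim that warns and then
+        # forwards to the matching public property, so reading it under
+        # assert_produces_warning covers the whole code path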
+ idx = self.create_index() + for attr_name in ['_start', '_stop', '_step']: + with tm.assert_produces_warning(DeprecationWarning): + getattr(idx, attr_name) + def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() assert i_copy is not i assert i_copy.identical(i) - assert i_copy._start == 0 - assert i_copy._stop == 5 - assert i_copy._step == 1 + assert i_copy._range == range(0, 5, 1) assert i_copy.name == 'Foo' def test_repr(self): @@ -243,8 +246,9 @@ def test_dtype(self): def test_cached_data(self): # GH 26565 - # Calling RangeIndex._data caches an int64 array of the same length at - # self._cached_data. This tests whether _cached_data has been set. + # Calling RangeIndex._data caches an int64 array of the same length as + # self at self._cached_data. + # This tests whether _cached_data is being set by various operations. idx = RangeIndex(0, 100, 10) assert idx._cached_data is None @@ -273,7 +277,7 @@ def test_cached_data(self): df.iloc[5:10] assert idx._cached_data is None - # actually calling data._data + # actually calling idx._data assert isinstance(idx._data, np.ndarray) assert isinstance(idx._cached_data, np.ndarray) From b5535dd0262113a8ca18b8bb1e5f0d35898c1c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Heikkil=C3=A4?= <42970828+mahepe@users.noreply.github.com> Date: Wed, 5 Jun 2019 15:54:34 +0300 Subject: [PATCH 35/43] TST: Test sorting levels not aligned with index (#25775) (#26492) --- pandas/tests/frame/test_sorting.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 246ba943a4509..96aeb608ba3b8 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -227,6 +227,18 @@ def test_stable_descending_multicolumn_sort(self): kind='mergesort') assert_frame_equal(sorted_df, expected) + def test_sort_multi_index(self): + # GH 25775, testing that sorting by index works with a multi-index. 
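+        # level=['b', 'a'] sorts primarily by level 'b' (all zeros here)
+        # and then by level 'a', even though the index stores its levels
+        # in the order a, b, c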
+ df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0], + 'c': [0, 1, 2], 'd': list('abc')}) + result = df.set_index(list('abc')).sort_index(level=list('ba')) + + expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0], + 'c': [1, 2, 0], 'd': list('bca')}) + expected = expected.set_index(list('abc')) + + tm.assert_frame_equal(result, expected) + def test_stable_categorial(self): # GH 16793 df = DataFrame({ From d8c2b40c0a55d2db6c5a65f4c921a0004bb6df17 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 5 Jun 2019 08:59:12 -0400 Subject: [PATCH 36/43] Remove SharedItems from test_excel (#26579) --- pandas/tests/io/test_excel.py | 332 +++++++++++++++++----------------- 1 file changed, 169 insertions(+), 163 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 7693caf3b31d2..b99f0336fa4c5 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -26,13 +26,22 @@ from pandas.io.formats.excel import ExcelFormatter from pandas.io.parsers import read_csv -_seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() -_frame = DataFrame(_seriesd)[:10] -_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] -_tsframe = tm.makeTimeDataFrame()[:5] -_mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' + +@pytest.fixture +def frame(float_frame): + return float_frame[:10] + + +@pytest.fixture +def frame2(float_frame): + float_frame = float_frame.copy() + float_frame.columns = ['D', 'C', 'B', 'A'] + return float_frame[:10] + + +@pytest.fixture +def tsframe(): + return tm.makeTimeDataFrame()[:5] @contextlib.contextmanager @@ -49,18 +58,8 @@ def ignore_xlrd_time_clock_warning(): yield -class SharedItems: - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() - - @td.skip_if_no('xlrd', '1.0.0') -class ReadingTestsBase(SharedItems): +class ReadingTestsBase: # This is based on ExcelWriterBase @pytest.fixture(autouse=True, params=['xlrd', None]) @@ -1055,9 +1054,9 @@ class TestXlrdReader(ReadingTestsBase): """ @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext): + def test_read_xlrd_book(self, ext, frame): import xlrd - df = self.frame + df = frame engine = "xlrd" sheet_name = "SheetA" @@ -1075,7 +1074,7 @@ def test_read_xlrd_book(self, ext): tm.assert_frame_equal(df, result) -class _WriterBase(SharedItems): +class _WriterBase: @pytest.fixture(autouse=True) def set_engine_and_path(self, request, merge_cells, engine, ext): @@ -1150,75 +1149,79 @@ def test_excel_sheet_by_name_raise(self, *_): with pytest.raises(xlrd.XLRDError): pd.read_excel(xl, "0") - def test_excel_writer_context_manager(self, *_): + def test_excel_writer_context_manager(self, frame, frame2, *_): with ExcelWriter(self.path) as writer: - self.frame.to_excel(writer, "Data1") - self.frame2.to_excel(writer, "Data2") + frame.to_excel(writer, "Data1") + frame2.to_excel(writer, "Data2") with ExcelFile(self.path) as reader: found_df = pd.read_excel(reader, "Data1", index_col=0) found_df2 = pd.read_excel(reader, "Data2", index_col=0) - tm.assert_frame_equal(found_df, self.frame) - tm.assert_frame_equal(found_df2, self.frame2) + tm.assert_frame_equal(found_df, frame) + tm.assert_frame_equal(found_df2, frame2) - def test_roundtrip(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_roundtrip(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 
'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # test roundtrip - self.frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1') recons = pd.read_excel(self.path, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1', index=False) recons = pd.read_excel(self.path, 'test1', index_col=None) - recons.index = self.frame.index - tm.assert_frame_equal(self.frame, recons) + recons.index = frame.index + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', na_rep='NA') + frame.to_excel(self.path, 'test1', na_rep='NA') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=['NA']) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 3611 - self.frame.to_excel(self.path, 'test1', na_rep='88') + frame.to_excel(self.path, 'test1', na_rep='88') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=['88']) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', na_rep='88') + frame.to_excel(self.path, 'test1', na_rep='88') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=[88, 88.0]) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 6573 - self.frame.to_excel(self.path, 'Sheet1') + frame.to_excel(self.path, 'Sheet1') recons = pd.read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, '0') + frame.to_excel(self.path, '0') recons = pd.read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method - s = self.frame["A"] + s = frame["A"] s.to_excel(self.path) recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) - def test_mixed(self, merge_cells, engine, ext): - self.mixed_frame.to_excel(self.path, 'test1') + def test_mixed(self, merge_cells, engine, ext, frame): + mixed_frame = frame.copy() + mixed_frame['foo'] = 'bar' + + mixed_frame.to_excel(self.path, 'test1') reader = ExcelFile(self.path) recons = pd.read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.mixed_frame, recons) + tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, *_): - df = tm.makeTimeDataFrame()[:5] + def test_ts_frame(self, tsframe, *_): + df = tsframe df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -1226,33 +1229,34 @@ def test_ts_frame(self, *_): recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_basics_with_nan(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + def test_basics_with_nan(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan + 
frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) @pytest.mark.parametrize("np_type", [ np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, merge_cells, engine, ext, np_type): # Test np.int values read come back as int # (rather than float which is Excel's format). - frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) - frame.to_excel(self.path, "test1") + df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), + dtype=np_type) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - int_frame = frame.astype(np.int64) + int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) recons2 = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. - float_frame = frame.astype(float) + float_frame = df.astype(float) recons = pd.read_excel(self.path, "test1", convert_float=False, index_col=0) tm.assert_frame_equal(recons, float_frame, @@ -1263,120 +1267,123 @@ def test_int_types(self, merge_cells, engine, ext, np_type): np.float16, np.float32, np.float64]) def test_float_types(self, merge_cells, engine, ext, np_type): # Test np.float values read come back as float. - frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(self.path, "test1") + df = DataFrame(np.random.random_sample(10), dtype=np_type) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(frame, recons, check_dtype=False) + tm.assert_frame_equal(df, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, merge_cells, engine, ext, np_type): # Test np.bool values read come back as float. 
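        # np.bool8 is an alias of np.bool_, so both parametrized cases
        # exercise the same underlying dtype.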
- frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(self.path, "test1") + df = (DataFrame([1, 0, True, False], dtype=np_type)) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(frame, recons) + tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, *_): - frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - frame.to_excel(self.path, "test1") + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(frame, recons) + tm.assert_frame_equal(df, recons) - def test_sheets(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_sheets(self, merge_cells, engine, ext, frame, tsframe): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # Test writing to separate sheets writer = ExcelWriter(self.path) - self.frame.to_excel(writer, 'test1') - self.tsframe.to_excel(writer, 'test2') + frame.to_excel(writer, 'test1') + tsframe.to_excel(writer, 'test2') writer.save() reader = ExcelFile(self.path) recons = pd.read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, 'test2', index_col=0) - tm.assert_frame_equal(self.tsframe, recons) + tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) assert 'test1' == reader.sheet_names[0] assert 'test2' == reader.sheet_names[1] - def test_colaliases(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_colaliases(self, merge_cells, engine, ext, frame, frame2): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(self.path, 'test1', header=col_aliases) + frame2.to_excel(self.path, 'test1', header=col_aliases) reader = ExcelFile(self.path) rs = pd.read_excel(reader, 'test1', index_col=0) - xp = self.frame2.copy() + xp = frame2.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) - def test_roundtrip_indexlabels(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + 
frame.to_excel(self.path, 'test1', index=False) # test index_label - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, 'test1', - index_label=['test'], - merge_cells=merge_cells) + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, 'test1', + index_label=['test'], + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=merge_cells) + df.index.names = ['test'] + assert df.index.names == recons.index.names + + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, + 'test1', + index_label=['test', 'dummy', 'dummy2'], + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label='test', - merge_cells=merge_cells) + df.index.names = ['test'] + assert df.index.names == recons.index.names + + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, + 'test1', + index_label='test', + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - tm.assert_frame_equal(frame, recons.astype(bool)) + df.index.names = ['test'] + tm.assert_frame_equal(df, recons.astype(bool)) - self.frame.to_excel(self.path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=merge_cells) + frame.to_excel(self.path, + 'test1', + columns=['A', 'B', 'C', 'D'], + index=False, merge_cells=merge_cells) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') - df = self.frame.copy() + df = frame.copy() df = df.set_index(['A', 'B']) reader = ExcelFile(self.path) @@ -1395,17 +1402,17 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): tm.assert_frame_equal(result, df) assert result.index.name == 'foo' - def test_excel_roundtrip_datetime(self, merge_cells, *_): + def test_excel_roundtrip_datetime(self, merge_cells, tsframe, *_): # datetime.date, not sure what to test here exactly - tsf = self.tsframe.copy() + tsf = tsframe.copy() - tsf.index = [x.date() for x in self.tsframe.index] + tsf.index = [x.date() for x in tsframe.index] tsf.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(self.tsframe, recons) + tm.assert_frame_equal(tsframe, recons) def test_excel_date_datetime_format(self, merge_cells, engine, ext): # see gh-4133 @@ -1450,14 +1457,14 @@ def test_to_excel_interval_no_labels(self, *_): # see gh-19242 # # Test writing Interval without labels. 
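        # Excel has no native interval type, so the Interval categories
        # produced by pd.cut are expected back in their string form,
        # matching the .astype(str) on the expected frame below.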
- frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = df.copy() - frame["new"] = pd.cut(frame[0], 10) + df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) @@ -1467,15 +1474,15 @@ def test_to_excel_interval_labels(self, *_): # see gh-19242 # # Test writing Interval with labels. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", - "F", "G", "H", "I", "J"]) - frame["new"] = intervals + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = df.copy() + intervals = pd.cut(df[0], 10, labels=["A", "B", "C", "D", "E", + "F", "G", "H", "I", "J"]) + df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) @@ -1485,23 +1492,23 @@ def test_to_excel_timedelta(self, *_): # see gh-19242, gh-9155 # # Test writing timedelta to xls. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=["A"], dtype=np.int64) - expected = frame.copy() + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + columns=["A"], dtype=np.int64) + expected = df.copy() - frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) + df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, merge_cells, engine, ext): - frame = self.tsframe - xp = frame.resample('M', kind='period').mean() + def test_to_excel_periodindex( + self, merge_cells, engine, ext, tsframe): + xp = tsframe.resample('M', kind='period').mean() xp.to_excel(self.path, 'sht1') @@ -1509,8 +1516,7 @@ def test_to_excel_periodindex(self, merge_cells, engine, ext): rs = pd.read_excel(reader, 'sht1', index_col=0) tm.assert_frame_equal(xp, rs.to_period('M')) - def test_to_excel_multiindex(self, merge_cells, engine, ext): - frame = self.frame + def test_to_excel_multiindex(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) @@ -1526,21 +1532,21 @@ def test_to_excel_multiindex(self, merge_cells, engine, ext): tm.assert_frame_equal(frame, df) # GH13511 - def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): - frame = pd.DataFrame({'A': [None, 2, 3], - 'B': [10, 20, 30], - 'C': np.random.sample(3)}) - frame = frame.set_index(['A', 'B']) - - frame.to_excel(self.path, merge_cells=merge_cells) - df = pd.read_excel(self.path, index_col=[0, 1]) - tm.assert_frame_equal(frame, df) + def test_to_excel_multiindex_nan_label( + self, merge_cells, engine, ext): + df = pd.DataFrame({'A': [None, 2, 3], + 'B': [10, 20, 30], + 'C': np.random.sample(3)}) + df = df.set_index(['A', 'B']) + + df.to_excel(self.path, merge_cells=merge_cells) + df1 = pd.read_excel(self.path, 
index_col=[0, 1]) + tm.assert_frame_equal(df, df1) # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): - frame = self.frame + def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) @@ -1563,9 +1569,9 @@ def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): + def test_to_excel_multiindex_dates( + self, merge_cells, engine, ext, tsframe): # try multiindex with dates - tsframe = self.tsframe.copy() new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) From 6a37e19db73155e514b8d14a36e0ee53b692609d Mon Sep 17 00:00:00 2001 From: DanielFEvans <41120183+DanielFEvans@users.noreply.github.com> Date: Wed, 5 Jun 2019 19:44:38 +0100 Subject: [PATCH 37/43] ERR: include original error message for missing required dependencies (#26665) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 8 +++++--- pandas/tests/test_base.py | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4018418294963..8fd9f07442810 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,7 +82,7 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- +- Error message for missing required imports now includes the original ImportError's text (:issue:`23868`) .. 
_whatsnew_0250.api_breaking: diff --git a/pandas/__init__.py b/pandas/__init__.py index 4c494b4a62e39..11ea3047bb62a 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,11 +10,13 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append(dependency) + missing_dependencies.append((dependency, e)) if missing_dependencies: - raise ImportError( - "Missing required dependencies {0}".format(missing_dependencies)) + msg = "Unable to import required dependencies:" + for dependency, e in missing_dependencies: + msg += "\n{0}: {1}".format(dependency, str(e)) + raise ImportError(msg) del hard_dependencies, dependency, missing_dependencies # numpy compat diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3b4f85e680f6e..f8319999682e8 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,7 +1,9 @@ from datetime import datetime, timedelta +from importlib import reload from io import StringIO import re import sys +from unittest.mock import patch import numpy as np import pytest @@ -1341,3 +1343,28 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) + + +@patch("builtins.__import__") +def test_missing_required_dependency(mock_import): + def mock_import_fail(name, *args, **kwargs): + if name == "numpy": + raise ImportError("cannot import name numpy") + elif name == "pytz": + raise ImportError("cannot import name some_dependency") + elif name == "dateutil": + raise ImportError("cannot import name some_other_dependency") + else: + return __import__(name, *args, **kwargs) + + mock_import.side_effect = mock_import_fail + + expected_msg = ( + "Unable to import required dependencies:" + "\nnumpy: cannot import name numpy" + "\npytz: cannot import name some_dependency" + "\ndateutil: cannot import name some_other_dependency" + ) + + with pytest.raises(ImportError, match=expected_msg): + reload(pd) From 5271868402a9cd2b24c2e161bd3a9b677936e459 Mon Sep 17 00:00:00 2001 From: nathalier Date: Wed, 5 Jun 2019 20:06:13 +0100 Subject: [PATCH 38/43] BUG: fix TypeError for invalid integer dates %Y%m%d with errors='ignore' (# GH 26583) (#26585) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/tslibs/strptime.pyx | 6 +++--- pandas/tests/indexes/datetimes/test_tools.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8fd9f07442810..02ee275bab364 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -537,6 +537,7 @@ Datetimelike - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`) - Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``error='coerce'`` (:issue:`25512`) +- Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index af3d3fa646a12..d93858cff5e05 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -140,13 +140,13 @@ def 
array_strptime(object[:] values, object fmt, iresult[i] = NPY_NAT continue raise ValueError("time data %r does not match " - "format %r (match)" % (values[i], fmt)) + "format %r (match)" % (val, fmt)) if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT continue raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) + val[found.end():]) # search else: @@ -156,7 +156,7 @@ def array_strptime(object[:] values, object fmt, iresult[i] = NPY_NAT continue raise ValueError("time data %r does not match format " - "%r (search)" % (values[i], fmt)) + "%r (search)" % (val, fmt)) iso_year = -1 year = 1900 diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index c507c31ee54dd..ea33e563b31be 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -133,6 +133,25 @@ def test_to_datetime_format_integer(self, cache): result = to_datetime(s, format='%Y%m', cache=cache) assert_series_equal(result, expected) + @pytest.mark.parametrize('int_date, expected', [ + # valid date, length == 8 + [20121030, datetime(2012, 10, 30)], + # short valid date, length == 6 + [199934, datetime(1999, 3, 4)], + # long integer date partially parsed to datetime(2012,1,1), length > 8 + [2012010101, 2012010101], + # invalid date partially parsed to datetime(2012,9,9), length == 8 + [20129930, 20129930], + # short integer date partially parsed to datetime(2012,9,9), length < 8 + [2012993, 2012993], + # short invalid date, length == 4 + [2121, 2121]]) + def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, + expected): + # GH 26583 + result = to_datetime(int_date, format='%Y%m%d', errors='ignore') + assert result == expected + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_format_microsecond(self, cache): From 2cc1ca0287266ee83c1cd6dab473b8f184d5fd36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Jun 2019 22:30:45 +0200 Subject: [PATCH 39/43] Revert "ERR: include original error message for missing required dependencies (#26665)" This reverts commit 047d32d20640898978dbf6d9855cd6fecbbcf0d5. --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 8 +++----- pandas/tests/test_base.py | 27 --------------------------- 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 02ee275bab364..1fb9b5ae695a0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,7 +82,7 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- Error message for missing required imports now includes the original ImportError's text (:issue:`23868`) +- .. 
_whatsnew_0250.api_breaking: diff --git a/pandas/__init__.py b/pandas/__init__.py index 11ea3047bb62a..4c494b4a62e39 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,13 +10,11 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append((dependency, e)) + missing_dependencies.append(dependency) if missing_dependencies: - msg = "Unable to import required dependencies:" - for dependency, e in missing_dependencies: - msg += "\n{0}: {1}".format(dependency, str(e)) - raise ImportError(msg) + raise ImportError( + "Missing required dependencies {0}".format(missing_dependencies)) del hard_dependencies, dependency, missing_dependencies # numpy compat diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index f8319999682e8..3b4f85e680f6e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,9 +1,7 @@ from datetime import datetime, timedelta -from importlib import reload from io import StringIO import re import sys -from unittest.mock import patch import numpy as np import pytest @@ -1343,28 +1341,3 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) - - -@patch("builtins.__import__") -def test_missing_required_dependency(mock_import): - def mock_import_fail(name, *args, **kwargs): - if name == "numpy": - raise ImportError("cannot import name numpy") - elif name == "pytz": - raise ImportError("cannot import name some_dependency") - elif name == "dateutil": - raise ImportError("cannot import name some_other_dependency") - else: - return __import__(name, *args, **kwargs) - - mock_import.side_effect = mock_import_fail - - expected_msg = ( - "Unable to import required dependencies:" - "\nnumpy: cannot import name numpy" - "\npytz: cannot import name some_dependency" - "\ndateutil: cannot import name some_other_dependency" - ) - - with pytest.raises(ImportError, match=expected_msg): - reload(pd) From ae50e39a611a337be06109a66c0a23e37e20013e Mon Sep 17 00:00:00 2001 From: AlexTereshenkov <50622389+AlexTereshenkov@users.noreply.github.com> Date: Wed, 5 Jun 2019 22:37:54 +0100 Subject: [PATCH 40/43] Remove redundant check arr_or_dtype is None (#26655) --- pandas/core/dtypes/common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4029e6f4bfdb5..52011d53d22cd 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1931,8 +1931,6 @@ def _is_dtype_type(arr_or_dtype, condition): if issubclass(arr_or_dtype, ExtensionDtype): arr_or_dtype = arr_or_dtype.type return condition(np.dtype(arr_or_dtype).type) - elif arr_or_dtype is None: - return condition(type(None)) # if we have an array-like if hasattr(arr_or_dtype, 'dtype'): From 077c7c276ab41f4717b2bbe32dd0b2fd17dd9f69 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Jun 2019 16:48:47 -0500 Subject: [PATCH 41/43] filter warning in repr (#26669) --- pandas/core/sparse/frame.py | 5 +++++ pandas/core/sparse/series.py | 10 ++++++---- pandas/tests/sparse/test_format.py | 13 +++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 0320da6d9a48d..67ecbcbea67f9 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -242,6 +242,11 @@ def _init_spmatrix(self, data, index, columns, dtype=None, def to_coo(self): return SparseFrameAccessor(self).to_coo() + def __repr__(self): + with 
warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Sparse") + return super().__repr__() + def __getstate__(self): # pickling return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 3814d8bb66635..3e3bae6444082 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -214,10 +214,12 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): fill_value=fill_value, kind=kind, copy=copy) def __repr__(self): - series_rep = Series.__repr__(self) - rep = '{series}\n{index!r}'.format(series=series_rep, - index=self.sp_index) - return rep + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Sparse") + series_rep = Series.__repr__(self) + rep = '{series}\n{index!r}'.format(series=series_rep, + index=self.sp_index) + return rep def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 37c2acc587cf6..7ed8c48fce333 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest @@ -133,3 +135,14 @@ def test_sparse_repr_after_set(self): repr(sdf) tm.assert_sp_frame_equal(sdf, res) + + +def test_repr_no_warning(): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + df = pd.SparseDataFrame({"A": [1, 2]}) + s = df['A'] + + with tm.assert_produces_warning(None): + repr(df) + repr(s) From 52ed9153feda581c85b79a0822f2a44fc00ff55e Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Fri, 7 Jun 2019 00:06:45 +0530 Subject: [PATCH 42/43] convert DatetimeLikeScalar to TypeVar --- pandas/_typing.py | 3 ++- pandas/core/arrays/datetimelike.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 24ee65645905b..9c059cb610c6e 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -18,6 +18,7 @@ ABCSparseSeries, np.ndarray) ArrayLike = TypeVar('ArrayLike', ABCExtensionArray, np.ndarray) -DatetimeLikeScalar = Type[Union[Period, Timestamp, Timedelta]] +DatetimeLikeScalar = TypeVar('DatetimeLikeScalar', Period, Timestamp, + Timedelta) Dtype = Union[str, np.dtype, ExtensionDtype] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c32f8642dc2ed..c99c09cdac96c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta import operator -from typing import Any, Sequence, Union, cast +from typing import Any, Sequence, Type, Union, cast import warnings import numpy as np @@ -58,7 +58,7 @@ def _get_attributes_dict(self): return {k: getattr(self, k, None) for k in self._attributes} @property - def _scalar_type(self) -> DatetimeLikeScalar: + def _scalar_type(self) -> Type[DatetimeLikeScalar]: """The scalar associated with this datelike * PeriodArray : Period From 2d3376a07abc1dd443863d25109cf41c3923398b Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Fri, 7 Jun 2019 00:07:55 +0530 Subject: [PATCH 43/43] remove unused import --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 9c059cb610c6e..a2bb168c1e2da 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing 
import IO, AnyStr, Type, TypeVar, Union +from typing import IO, AnyStr, TypeVar, Union import numpy as np
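
For reference, the constrained TypeVar used in the last two patches behaves
differently from a Union: each use of the variable must bind to exactly one
of the listed types. A minimal sketch of the pattern, using stand-in classes
for the real pandas scalars:

    from typing import Type, TypeVar

    class Period: ...       # stand-ins for pandas' datetime-like scalars
    class Timestamp: ...
    class Timedelta: ...

    # A constrained TypeVar: every binding resolves to one constraint,
    # whereas Union[Period, Timestamp, Timedelta] would let the three
    # types mix within a single annotation.
    DatetimeLikeScalar = TypeVar('DatetimeLikeScalar', Period, Timestamp,
                                 Timedelta)

    def scalar_type(values) -> Type[DatetimeLikeScalar]:
        # mirrors _scalar_type in pandas/core/arrays/datetimelike.py:
        # a PeriodArray reports Period, a DatetimeArray reports
        # Timestamp, and so on
        ...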